├── README.md
├── haplotype-block-assignment
    ├── README.md
    ├── nucmer_all_chromosomes_chromosome_level_assemblies_wheat.sh
    ├── nucmer_all_chromosomes_scaffold_level_assemblies_wheat.sh
    ├── get_ref_coords.r
    ├── combine_mummer_and_BLAST.r
    ├── assign_BLAST_blocks_whole_genome.r
    └── assign_mummer_blocks_whole_genome.r
├── pairwise-blast
    └── convert_gene_blocks.sh
├── IBD-regions-stats
    ├── slice_block_stats.sh
    ├── regions-without-IBD.Rmd
    └── regions-without-IBD-validate.Rmd
├── .gitignore
├── LICENSE
├── sequence_complexity
    ├── mummer_show_snps_all_aln_per_chr.sh
    ├── calculate_snp_distribution_6A_CS.r
    ├── calculate_sequence_complexity.rmd
    └── calculate_seq_complexity_stats.r
├── figures
    ├── plot_extended_data_figure_3.rmd
    ├── plot_extended_data_figure_8.rmd
    ├── plot_extended_data_figure_5.rmd
    ├── plot_extended_data_figure_6.rmd
    ├── plot_extended_data_figure_7.rmd
    ├── plot_extended_data_figure_1.Rmd
    ├── figure_1_plots.rmd
    └── figure_4_plots.rmd
└── precision-recall-analysis
    ├── calculate_combined_precision_recall.r
    └── calculate_BLAST_blocks_all_window_sizes_flanking_sequences.r


/README.md:
--------------------------------------------------------------------------------
1 | # pangenome-haplotypes
2 | Scripts to do haplotype analysis on pan genomes. 
3 | 


--------------------------------------------------------------------------------
/haplotype-block-assignment/README.md:
--------------------------------------------------------------------------------
1 | # haplotype-block-assignment
2 | Scripts to assign haplotype blocks based on whole chromosome mummer alignments and pairwise BLAST alignments based on gene projections. 
3 | 


--------------------------------------------------------------------------------
/pairwise-blast/convert_gene_blocks.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | cd pangenome-web
 4 | 
 5 | input_folder=blocks
 6 | 
 7 | for chr in `ls $input_folder | grep chr` ; do
 8 | 
 9 | 	input="${input_folder}/${chr}/haplotype_blocks_BLAST_25_gene_window_2000bp.txt"
10 | 	output="${input_folder}/${chr}/haplotype_blocks_BLAST_25_gene_window_2000bp_converted_${chr}_10genesBlocks.csv"
11 | 	echo $chr
12 | 	rake  "haplotypes:convert_gene_coordinates[${input},${output},Wheat]"
13 | 
14 | done
15 | 
16 | 


--------------------------------------------------------------------------------
/IBD-regions-stats/slice_block_stats.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -e
 2 | 
 3 | cd pangenome-web
 4 | 
 5 | output_folder=block_stats_by_slice_500kbp/
 6 | mkdir -p $output_folder
 7 | folder=regions
 8 | regions_path=$folder/cs_regions.bed
 9 | block_sizes='Tae_5000kbp Tae_2500kbp Tae_1000kbp    '
10 | slice_size=500000
11 | 
12 | for f in $block_sizes ;do
13 | 	echo $f
14 | 	output="${output_folder}/Block_slice500k_stats_${f}.tsv"
15 | 	echo $output
16 | 	rake "haplotypes:export_haplotype_block_stats_in_points[${output},${f},Wheat,$slice_size]" &
17 | done
18 | 
19 | wait
20 | 
21 | for f in $block_sizes ;do
22 | 	echo $f
23 | 	output="${output_folder}/Block_slice500k_stats_${f}.tsv"
24 | 	echo $output
25 | 	gzip  -f $output &
26 | 	gzip  -f $output.missing &
27 | done 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # User-specific files
 9 | .Ruserdata
10 | 
11 | # Example code in package build process
12 | *-Ex.R
13 | 
14 | # Output files from R CMD build
15 | /*.tar.gz
16 | 
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 | 
20 | # RStudio files
21 | .Rproj.user/
22 | 
23 | # produced vignettes
24 | vignettes/*.html
25 | vignettes/*.pdf
26 | 
27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
28 | .httr-oauth
29 | 
30 | # knitr and R markdown default cache directories
31 | *_cache/
32 | /cache/
33 | 
34 | # Temporary files created by R markdown
35 | *.utf8.md
36 | *.knit.md
37 | 
38 | # R Environment Variables
39 | .Renviron
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Uauy-Lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/sequence_complexity/mummer_show_snps_all_aln_per_chr.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -e
 2 | #SBATCH --mem=10000
 3 | #SBATCH --mail-type=END,FAIL 
 4 | #SBATCH -N 1 
 5 | #SBATCH -n 1
 6 | #SBATCH -p jic-short,RG-Cristobal-Uauy
 7 | #SBATCH --array=0-255
 8 | #SBATCH -o ./run_logs/show_snps.%a.%N.%j.out
 9 | #SBATCH -e ./run_logs/show_snps.%a.%N.%j.err
10 | 
11 | #DEFINE CHROMOSOME
12 | CHR_NAME="2B"
13 | 
14 | t_id=$SLURM_ARRAY_TASK_ID
15 | 
16 | x=$(($t_id / 15))
17 | q_x=$(($t_id % 15))
18 | 
19 | 
20 | declare -a names=("arinalrfor" "chinese" "jagger" "julius" "lancer" "landmark" "mace" "norin61" "stanley" "sy_mattis" "cadenza" "claire" "paragon" "robigus" "weebil")
21 | 
22 | source mummer-3.23
23 | 
24 | 
25 | echo $t_id
26 | echo $x
27 | echo $q_x
28 | echo $c_x
29 | 
30 | 
31 | REF_NAME=${names[$x]}
32 | QUERY_NAME=${names[$q_x]}
33 | 
34 | 
35 | echo $REF_NAME
36 | echo $QUERY_NAME
37 | echo $CHR_NAME
38 | 
39 | ALN_DIR='/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/aln/'$REF_NAME/$CHR_NAME
40 | ALN_ID=$REF_NAME'_v_'$QUERY_NAME'.all_'$CHR_NAME'_filtered_L20Kb_rq'
41 | 
42 | OUTPUT_DIR='/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/surrounding_seq/'$CHR_NAME
43 | 
44 | mkdir -p $OUTPUT_DIR
45 | 
46 | if [ "$REF_NAME" == "$QUERY_NAME" ]
47 | then
48 | 	echo "Reference and Query are the same"
49 | else
50 | 	echo "Reference and Query are different, continuing with mummer alignments"
51 | 	#extract the 10bp up and downstream of each snp
52 | 	show-snps -C -T -x 10 $ALN_DIR/$ALN_ID.delta > $OUTPUT_DIR/$ALN_ID'_10bpFlank.snps'
53 | fi
54 | 
55 | 


--------------------------------------------------------------------------------
/figures/plot_extended_data_figure_3.rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Plot precision recall graphs"
 3 | author: "Jemima Brinton"
 4 | date: "17/05/2020"
 5 | output: html_document
 6 | ---
 7 | 
 8 | ```{r setup, include=FALSE}
 9 | knitr::opts_chunk$set(echo = TRUE)
10 | 
11 | library(ggplot2)
12 | library(RColorBrewer)
13 | ```
14 | 
15 | ```{r}
16 | base_dir <- "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/precision_recall/"
17 | plot_dir <- paste0(base_dir, "plots/")
18 | 
19 | combined_data <- read.table(file = paste0(plot_dir, "combined_precision_recall_f1_table.tsv"),
20 |             sep = "\t",
21 |             header = TRUE,
22 |             stringsAsFactors =FALSE)
23 | combined_data$gene_block <- factor(combined_data$gene_block)
24 | combined_data$window_size <- factor(combined_data$window_size, levels = c("cdsbp", "0bp", "1000bp", "2000bp", "5000bp"))
25 | 
26 | precision <- ggplot(combined_data, aes(x = window_size, y = precision, fill = gene_block)) +
27 |     geom_boxplot(outlier.size = 0.7) +
28 |     scale_fill_brewer(palette = "Set1") +
29 |     theme_bw() +
30 |     xlab("Flanking Sequence") +
31 |     ylab("Precision")
32 | 
33 | precision
34 | ```
35 | ```{r}
36 | recall <- ggplot(combined_data, aes(x = window_size, y = recall, fill = gene_block)) +
37 |     geom_boxplot(outlier.size = 0.7) +
38 |     scale_fill_brewer(palette = "Set1") +
39 |     theme_bw() +
40 |     xlab("Flanking Sequence") +
41 |     ylab("Recall")
42 | 
43 | recall
44 | ```
45 | 
46 | ```{r}
47 | f1 <- ggplot(combined_data, aes(x = window_size, y = f1_score, fill = gene_block)) +
48 |     geom_boxplot(outlier.size = 0.7) +
49 |     scale_fill_brewer(palette = "Set1") +
50 |     theme_bw() +
51 |     xlab("Flanking Sequence") +
52 |     ylab("F1 score")
53 | 
54 | f1
55 | ```
56 | 
57 | ```{r}
58 | height = 4
59 | width = 5.5
60 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/precision_combined.pdf", height = height, width = width)
61 | precision
62 | dev.off()
63 | 
64 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/recall_combined.pdf", height = height, width = width)
65 | recall
66 | dev.off()
67 | 
68 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/f1_combined.pdf", height = height, width = width)
69 | f1
70 | dev.off()
71 | 
72 | 
73 | ```
74 | 


--------------------------------------------------------------------------------
/haplotype-block-assignment/nucmer_all_chromosomes_chromosome_level_assemblies_wheat.sh:
--------------------------------------------------------------------------------
 1 | ## this will be just to write and submit the scripts for each chromosome
 2 | 
 3 | cd /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome
 4 | 
 5 | chr_names=("1A" "1B" "1D" "2A" "2B" "2D" "3A" "3B" "3D" "4A" "4B" "4D" "5A" "5B" "5D" "6A" "6B" "6D" "7A" "7B" "7D")
 6 | 
 7 | 
 8 | for CHR_NAME in ${chr_names[*]}
 9 | 	do
10 | 	echo $CHR_NAME
11 | 	echo \
12 | '#!/bin/bash -e
13 | #SBATCH --mem=30G
14 | #SBATCH --mail-type=END,FAIL 
15 | #SBATCH -N 1 
16 | #SBATCH -n 1
17 | #SBATCH -p RG-Cristobal-Uauy,jic-long
18 | #SBATCH --array=0-99
19 | #SBATCH -o ./run_logs/run_nucmer.%a.%N.%j.out
20 | #SBATCH -e ./run_logs/run_nucmer.%a.%N.%j.err
21 | 
22 | 
23 | 
24 | t_id=$SLURM_ARRAY_TASK_ID
25 | 
26 | x=$(($t_id / 10))
27 | q_x=$(($t_id % 10))
28 | 
29 | 
30 | declare -a names=("arinalrfor" "chinese" "jagger" "julius" "lancer" "landmark" "mace" "norin61" "stanley" "sy_mattis")
31 | 
32 | source mummer-3.23
33 | 
34 | 
35 | echo $t_id
36 | echo $x
37 | echo $q_x
38 | echo $c_x
39 | 
40 | 
41 | REF_NAME=${names[$x]}
42 | QUERY_NAME=${names[$q_x]}
43 | 
44 | 
45 | echo $REF_NAME
46 | echo $QUERY_NAME
47 | echo '$CHR_NAME'
48 | 
49 | 
50 | ASSEMBLIES_PATH="/nbi/group-data/ifs/NBI/Cristobal-Uauy/ramirezr/Pangenomes/Bn"
51 | OUTPUT_DIR="/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/aln/$REF_NAME/'$CHR_NAME'"
52 | CHROM_DIR="/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/fasta/'$CHR_NAME'"
53 | 
54 | mkdir -p $OUTPUT_DIR
55 | 
56 | if [ "$REF_NAME" == "$QUERY_NAME" ]
57 | then
58 | 	echo "Reference and Query are the same"
59 | else
60 | 	echo "Reference and Query are different, continuing with mummer alignments"
61 | 	#run nucmer
62 | 	nucmer --mum --delta $CHROM_DIR/$REF_NAME.'$CHR_NAME'.fa $CHROM_DIR/$QUERY_NAME.'$CHR_NAME'.fa --prefix $OUTPUT_DIR/$REF_NAME.v.$QUERY_NAME.'$CHR_NAME'
63 | 	#filter for alignments of at least 20kb and rq
64 | 	delta-filter -l 20000 -r -q $OUTPUT_DIR/$REF_NAME.v.$QUERY_NAME.'$CHR_NAME'.delta > $OUTPUT_DIR/$REF_NAME.v.$QUERY_NAME.'$CHR_NAME'.filtered_L20Kb_rq.delta
65 | fi
66 | ' > /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome/scripts/chrom_aln_scripts/nucmer_aln_$CHR_NAME.sh
67 | 
68 | sbatch /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome/scripts/chrom_aln_scripts/nucmer_aln_$CHR_NAME.sh
69 | done


--------------------------------------------------------------------------------
/haplotype-block-assignment/nucmer_all_chromosomes_scaffold_level_assemblies_wheat.sh:
--------------------------------------------------------------------------------
 1 | ## this will be just to write and submit the scripts for each chromosome
 2 | 
 3 | cd /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome
 4 | 
 5 | chr_names=("1A" "1B" "1D" "2A" "2B" "2D" "3A" "3B" "3D" "4A" "4B" "4D" "5A" "5B" "5D" "6A" "6B" "6D" "7A" "7B" "7D")
 6 | 
 7 | 
 8 | for CHR_NAME in ${chr_names[*]}
 9 | 	do
10 | 	echo $CHR_NAME
11 | 	echo \
12 | '#!/bin/bash -e
13 | #SBATCH --mem=30G
14 | #SBATCH --mail-type=END,FAIL 
15 | #SBATCH -N 1 
16 | #SBATCH -n 1
17 | #SBATCH -p RG-Cristobal-Uauy,jic-long
18 | #SBATCH --array=0-49
19 | #SBATCH -o ./run_logs/run_nucmer_EI.%a.%N.%j.out
20 | #SBATCH -e ./run_logs/run_nucmer_EI.%a.%N.%j.err
21 | 
22 | t_id=$SLURM_ARRAY_TASK_ID
23 | 
24 | x=$(($t_id % 10))
25 | q_x=$(($t_id / 10))
26 | 
27 | 
28 | declare -a names=("arinalrfor" "chinese" "jagger" "julius" "lancer" "landmark" "mace" "norin61" "stanley" "sy_mattis")
29 | declare -a query_names=("cadenza" "claire" "paragon" "robigus" "weebil")
30 | 
31 | source mummer-3.23
32 | 
33 | 
34 | echo $t_id
35 | echo $x
36 | echo $q_x
37 | 
38 | 
39 | REF_NAME=${names[$x]}
40 | QUERY_NAME=${query_names[$q_x]}
41 | 
42 | 
43 | echo $REF_NAME
44 | echo $QUERY_NAME
45 | echo $CHR_NAME
46 | 
47 | OUTPUT_DIR="/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/aln/'$REF_NAME/$CHR_NAME'"
48 | CHROM_DIR="/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/fasta/'$CHR_NAME'"
49 | 
50 | mkdir -p $OUTPUT_DIR
51 | 
52 | if [ "$REF_NAME" == "$QUERY_NAME" ]
53 | then
54 | 	echo "Reference and Query are the same"
55 | else
56 | 	echo "Reference and Query are different, continuing with mummer alignments"
57 | 	#run nucmer
58 | 	nucmer --mum --delta $CHROM_DIR/$REF_NAME'.chr'$CHR_NAME'.fa' $CHROM_DIR/$QUERY_NAME'.'$CHR_NAME'_scaffs_projections.fa' --prefix $OUTPUT_DIR/$REF_NAME'_v_'$QUERY_NAME'.'$CHR_NAME
59 | 	#filter for alignments of at least 20kb and rq
60 | 	delta-filter -l 20000 -r -q $OUTPUT_DIR/$REF_NAME'_v_'$QUERY_NAME'.'$CHR_NAME'.delta' > $OUTPUT_DIR/$REF_NAME'_v_'$QUERY_NAME'.'$CHR_NAME'_filtered_L20Kb_rq.delta'
61 | fi
62 | ' > /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome/scripts/chrom_aln_scripts/nucmer_aln_EI_$CHR_NAME.sh
63 | 
64 | sbatch /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome/scripts/chrom_aln_scripts/nucmer_aln_EI_$CHR_NAME.sh
65 | done


--------------------------------------------------------------------------------
/sequence_complexity/calculate_snp_distribution_6A_CS.r:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | library(stringr)
 4 | library(ggplot2)
 5 | 
 6 | 
 7 | #Compare all the snps against chinese spring on 6A
 8 | #Takes output from mummer show-snps
 9 | 
10 | 
11 | sequence_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/6A/"
12 | output_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/"
13 | 
14 | ref <- "chinese"
15 | 
16 | sequence_files <- list.files(sequence_dir, pattern = paste0(ref, "_v_"))
17 | sequence_files
18 | 
19 | snps_positions_combined <- integer()
20 | 
21 | for (file in sequence_files){
22 |   
23 |   ref_query_temp <- str_split_fixed(file, "\\.", 2)[1]
24 |   
25 |   query <- str_split_fixed(ref_query_temp, "_", 3)[3]
26 |   
27 | 
28 |   if(query == "sy"){
29 |     query <- "sy_mattis"
30 |   }
31 |   
32 |   snp_flanks <- read.table(file = paste0(sequence_dir, file),
33 |                           sep = "\t",
34 |                           skip = 4,
35 |                           header = FALSE,
36 |                           stringsAsFactors = FALSE)
37 | 
38 |   colnames(snp_flanks) <- c("P1", "SUB1", "SUB2", "P2", "BUFF", "DIST", "REF_SEQ", "QUERY_SEQ", "FRM", "TAGS", "REF", "QUERY")
39 |   
40 |   #remove any snps that are Ns and also get rid of all irrelevant info - i.e. only keep chinese spring position and alleles
41 |   snp_flanks_noN <- snp_flanks[!((snp_flanks$SUB1 == "N") | (snp_flanks$SUB2 == "N")), c("P1", "SUB1", "SUB2")]
42 |   nrow(snp_flanks_noN)
43 |   
44 |   #also remove any where the alleles are "." i.e. an indel
45 |   snps_only <- snp_flanks_noN[!((snp_flanks_noN$SUB1 == ".") | (snp_flanks_noN$SUB2 == ".")),]
46 |   nrow(snps_only)
47 |   
48 |   snp_positions <- unique(snps_only$P1)
49 |   length(snp_positions)
50 |   
51 |   snp_positions_to_keep <- snp_positions[!(snp_positions %in% snps_positions_combined)]
52 |   
53 |   snps_positions_combined <- c(snps_positions_combined, snp_positions_to_keep)
54 |   print(file)
55 | }
56 | 
57 | head(snps_positions_combined)
58 | length(snps_positions_combined)
59 | 
60 | snps_positions_combined <- snps_positions_combined[order(snps_positions_combined)]
61 | hist(snps_positions_combined)
62 | 
63 | snps_positions_combined_df <- data.frame(chrom = "chr6A", pos = snps_positions_combined)
64 | 
65 | write.table(snps_positions_combined_df, 
66 |             file = "Y:/Publications/Haplotypes/Figures/figure_3/pangenome_snp_dist_6A_CS_ref.tsv",
67 |             sep = "\t",
68 |             row.names = FALSE,
69 |             quote = FALSE)
70 | 


--------------------------------------------------------------------------------
/figures/plot_extended_data_figure_8.rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Plot haplotype specific markers"
 3 | author: "Jemima Brinton"
 4 | date: "15/05/2020"
 5 | output: html_document
 6 | ---
 7 | 
 8 | ```{r setup, include=FALSE}
 9 | knitr::opts_chunk$set(echo = TRUE)
10 | 
11 | library(dplyr)
12 | library(ggplot2)
13 | library(reshape2)
14 | 
15 | convert_long_01_AB <- function(data){
16 |   converted <- data.frame(marker=character(), variable = character(),  value = numeric(),pos = numeric())
17 |   markers <- unique(data$marker)
18 | 
19 |   for (i in seq(1, length(markers))){
20 |     marker <- markers[i]
21 |     marker_data <- data[data$marker == marker,]
22 |     snp_calls <- unique(marker_data$value)
23 |     alleles <- snp_calls[(snp_calls %in% c("A", "B"))]
24 |     #make sure chinese spring allele is always 0
25 |     CS_allele <- marker_data[marker_data$variable == "CHINESE", "value"]
26 |     alt_allele <- alleles[!(alleles %in% CS_allele)]
27 |     marker_data$value <- gsub(CS_allele, "0", marker_data$value)
28 |     marker_data$value <- gsub(alt_allele, "1", marker_data$value)
29 |     converted <- rbind(converted, marker_data)
30 |   }
31 |   converted$value <- gsub("N", "NA", converted$value)
32 |   converted$value <- as.numeric(converted$value)
33 |   return(converted)
34 | }
35 | 
36 | plot_capture_specified_order_small <- function(data, start, end, outfile_prefix, order_list){
37 |   data_zoom <- data[(data$pos > start) & (data$pos < end),]
38 |   data_zoom$variable <- factor(x = data_zoom$variable, levels = order_list, ordered = TRUE)
39 |   plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) +
40 | 		geom_tile(aes(fill = value), color = "gray") +
41 | 		#scale_fill_distiller(limits=c(0,1), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "allele") +
42 |     #scale_fill_continuous(type = "viridis") +
43 |     scale_fill_gradient(low = "grey70", high = "grey20", na.value = "white", aesthetics = "fill") +
44 |     theme_bw() +
45 | 		theme(axis.text.x=element_text(angle=90, hjust=1)) #+
46 |     #geom_point( aes( x = "TraesCS6A02G189400.1", y=aln_type, colour = "red" ) )
47 |   #ggsave(plot = plot, file = paste0(outfile_prefix, "_heatmap.png"), dpi = 600, height = 6, width = 6)
48 |   return(plot)
49 | }
50 | ```
51 | 
52 | ```{r}
53 | hap_specific <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/haplotype_specific_markers_pangenome.csv", 
54 |                            sep = ",", 
55 |                            header = TRUE, 
56 |                            stringsAsFactors = FALSE)
57 | 
58 | hap_specific_long <- melt(hap_specific, id.vars = c("marker", "pos"))
59 | 
60 | #hap_specific_01 <- convert_long_01(hap_specific_long)
61 | data <- hap_specific_long
62 | converted <- data.frame(marker=character(), variable = character(),  value = numeric(),pos = numeric())
63 |   markers <- unique(data$marker)
64 | 
65 |   for (i in seq(1, length(markers))){
66 |     marker <- markers[i]
67 |     marker_data <- data[data$marker == marker,]
68 |     snp_calls <- unique(marker_data$value)
69 |     alleles <- snp_calls[(snp_calls %in% c("A", "B"))]
70 |     #make sure chinese spring allele is always 0
71 |     CS_allele <- marker_data[marker_data$variable == "CHINESE", "value"]
72 |     alt_allele <- alleles[!(alleles %in% CS_allele)]
73 |     marker_data$value <- gsub(CS_allele, "0", marker_data$value)
74 |     marker_data$value <- gsub(alt_allele, "1", marker_data$value)
75 |     converted <- rbind(converted, marker_data)
76 |   }
77 |   converted$value <- gsub("N", "NA", converted$value)
78 |   converted$value <- as.numeric(converted$value)
79 | 
80 | hap_specific_order <- c("CHINESE" , "CADENZA", "PARAGON", "NORIN61", "LANCER", "CLAIRE", "JAGGER", "SY", "ROBIGUS", "ARINA", "JULIUS", "WEEBIL", "LANDMARK", "MACE", "STANLEY")
81 | 
82 | hap_specific_01 <- converted
83 | 
84 | to_plot_subset <- subset(hap_specific_01, variable %in% hap_specific_order)
85 | 
86 | 
87 | plot_hap_specific <- plot_capture_specified_order_small(to_plot_subset, 187000000, 445000000, outfile_prefix = "test", order_list = rev(hap_specific_order))
88 | plot_hap_specific
89 | 
90 | width = 5
91 | height = 4
92 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/haplotype_specific_markers.pdf", width = width, height = height)
93 | print(plot_hap_specific)
94 | dev.off()
95 | ```


--------------------------------------------------------------------------------
/sequence_complexity/calculate_sequence_complexity.rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Calculate sequence complexity"
  3 | author: "Jemima Brinton"
  4 | date: "23/03/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | 
 11 | library(dplyr)
 12 | library(magrittr)
 13 | library(GenomicRanges)
 14 | library(ggplot2)
 15 | library(tidyr)
 16 | library(viridis)
 17 | library(stringr)
 18 | library(dada2)
 19 | ```
 20 | 
 21 | ```{r}
 22 | #read in haplotype blocks
 23 | hap_coords <- read.table("X:/brintonj/haplotype/whole_genome_mummer_BLAST_5mbp_blocks_combined_ref_coords_block_numbers_SPELTA_ADDED.tsv",
 24 |                          sep = "\t", 
 25 |                          header = TRUE,
 26 |                          stringsAsFactors = FALSE)
 27 | 
 28 | head(hap_coords)
 29 | 
 30 | hap_coords_mummer <- hap_coords[hap_coords$source == "mummer",]
 31 | 
 32 | hap_coords_mummer_6A <- hap_coords_mummer[hap_coords_mummer$chrom == "chr6A",]
 33 | sequence_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/6A/"
 34 | output_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/"
 35 | 
 36 | sequence_files<- list.files(sequence_dir)
 37 | sequence_files
 38 | 
 39 | 
 40 | all_snps <- data.frame(P1=numeric(),
 41 |                        REF_SEQ = character(),
 42 |                        REF = character(),
 43 |                        QUERY = character(),
 44 |                        hap_block = character())
 45 | 
 46 | for (file in sequence_files){
 47 |   
 48 |   ref_query_temp <- str_split_fixed(file, "\\.", 2)[1]
 49 |   ref <- str_split_fixed(ref_query_temp, "_", 3)[1]
 50 |   query <- str_split_fixed(ref_query_temp, "_", 3)[3]
 51 |   
 52 |   if(ref == "sy"){
 53 |     ref <- "sy_mattis"
 54 |     query <- str_split_fixed(ref_query_temp, "_", 4)[4]
 55 |   }
 56 |   
 57 |   if(query == "sy"){
 58 |     query <- "sy_mattis"
 59 |   }
 60 |   
 61 |   snp_flanks <- read.table(file = paste0(sequence_dir, file),
 62 |                           sep = "\t",
 63 |                           skip = 4,
 64 |                           header = FALSE,
 65 |                           stringsAsFactors = FALSE)
 66 | 
 67 |   colnames(snp_flanks) <- c("P1", "SUB1", "SUB2", "P2", "BUFF", "DIST", "REF_SEQ", "QUERY_SEQ", "FRM", "TAGS", "REF", "QUERY")
 68 |   snp_flanks_noN <- snp_flanks[!((snp_flanks$SUB1 == "N") | (snp_flanks$SUB2 == "N")),]
 69 |   
 70 |   to_keep <- snp_flanks_noN[,c("P1", "REF_SEQ", "REF", "QUERY")]
 71 |   
 72 |   to_keep$hap_block <- NA
 73 |   
 74 |   comp_haps <- hap_coords_mummer_6A[(hap_coords_mummer_6A$ref == ref) & (hap_coords_mummer_6A$query == query),]
 75 |   if (nrow(comp_haps) > 0){
 76 |     for (i in seq(1, nrow(comp_haps))){
 77 |       start <- comp_haps[i, "ref_start"]
 78 |       end <- comp_haps[i, "ref_end"]
 79 |       to_keep[(to_keep$P1 >= start) & (to_keep$P1 <= end), "hap_block"] <- "Y"
 80 |     }
 81 |   }
 82 |     
 83 |   to_keep[is.na(to_keep$hap_block), "hap_block"] <- "N"
 84 |     
 85 |   to_keep$REF_CPLX <- seqComplexity(to_keep$REF_SEQ)
 86 | 
 87 |   write.table(to_keep, 
 88 |               file = paste0(output_dir, ref, "_v_", query, "6A_snps_ref_complexity.tsv"),
 89 |             sep = "\t",
 90 |             col.names = TRUE, 
 91 |             row.names = FALSE,
 92 |             quote = FALSE)
 93 | }
 94 | 
 95 | 
 96 | ```
 97 | 
 98 | Do the same thing for 2B so we can compare a different chromosome
 99 | 
100 | ```{r}
101 | chrom <- "2B"
102 | hap_coords_mummer_chrom <- hap_coords_mummer[hap_coords_mummer$chrom == paste0("chr", chrom),]
103 | 
104 | sequence_dir <- paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/", chrom, "/")
105 | output_dir <- paste0(sequence_dir, "complexity/")
106 | 
107 | dir.create(output_dir, recursive = TRUE)
108 | 
109 | sequence_files <- list.files(sequence_dir, pattern = "snps")
110 | 
111 | all_snps <- data.frame(P1=numeric(),
112 |                        REF_SEQ = character(),
113 |                        REF = character(),
114 |                        QUERY = character(),
115 |                        hap_block = character())
116 | 
117 | for (file in sequence_files){
118 |   
119 |   ref_query_temp <- str_split_fixed(file, "\\.", 2)[1]
120 |   ref <- str_split_fixed(ref_query_temp, "_", 3)[1]
121 |   query <- str_split_fixed(ref_query_temp, "_", 3)[3]
122 |   
123 |   if(ref == "sy"){
124 |     ref <- "sy_mattis"
125 |     query <- str_split_fixed(ref_query_temp, "_", 4)[4]
126 |   }
127 |   
128 |   if(query == "sy"){
129 |     query <- "sy_mattis"
130 |   }
131 |   
132 |   snp_flanks <- read.table(file = paste0(sequence_dir, file),
133 |                           sep = "\t",
134 |                           skip = 4,
135 |                           header = FALSE,
136 |                           stringsAsFactors = FALSE)
137 | 
138 |   colnames(snp_flanks) <- c("P1", "SUB1", "SUB2", "P2", "BUFF", "DIST", "REF_SEQ", "QUERY_SEQ", "FRM", "TAGS", "REF", "QUERY")
139 |   snp_flanks_noN <- snp_flanks[!((snp_flanks$SUB1 == "N") | (snp_flanks$SUB2 == "N")),]
140 |   
141 |   to_keep <- snp_flanks_noN[,c("P1", "REF_SEQ", "REF", "QUERY")]
142 |   
143 |   to_keep$hap_block <- NA
144 |   
145 |   comp_haps <- hap_coords_mummer_chrom[(hap_coords_mummer_chrom$ref == ref) & (hap_coords_mummer_chrom$query == query),]
146 |   if (nrow(comp_haps) > 0){
147 |     for (i in seq(1, nrow(comp_haps))){
148 |       start <- comp_haps[i, "ref_start"]
149 |       end <- comp_haps[i, "ref_end"]
150 |       to_keep[(to_keep$P1 >= start) & (to_keep$P1 <= end), "hap_block"] <- "Y"
151 |     }
152 |   }
153 |     
154 |   to_keep[is.na(to_keep$hap_block), "hap_block"] <- "N"
155 |     
156 |   to_keep$REF_CPLX <- seqComplexity(to_keep$REF_SEQ)
157 | 
158 |   write.table(to_keep, 
159 |               file = paste0(output_dir, ref, "_v_", query, chrom, "_snps_ref_complexity.tsv"),
160 |             sep = "\t",
161 |             col.names = TRUE, 
162 |             row.names = FALSE,
163 |             quote = FALSE)
164 | }
165 | 
166 | ```
167 | 


--------------------------------------------------------------------------------
/IBD-regions-stats/regions-without-IBD.Rmd:
--------------------------------------------------------------------------------
  1 | # Find regions that are not in IBD regions
  2 | 
  3 | From all the IBD regions. For this, I first exported the regions that are identical by descent from the MySQL database. From ther I have to 
  4 | 
  5 | 1. Split the table per assembly
  6 | 2. Fix the chromosome order and length based on each assembly. 
  7 | 3. use ```genomicRanges``` ```diff``` to find the regions that are not overlapping. 
  8 | 
  9 | ```{r, results='hide',message=FALSE}
 10 | suppressWarnings({
 11 | library(ggplot2)
 12 | library(sqldf)
 13 | library(reshape2)
 14 | library(ggbio)
 15 | library(GenomicRanges)
 16 | library(GenomicFeatures)
 17 | library(rtracklayer)
 18 | library(gridExtra)
 19 | library(grid)
 20 |     #require("bio.tilling")
 21 | })
 22 | ```
 23 | 
 24 | Loading the auxiliary functions
 25 | 
 26 | ```{r}
 27 | fixRangesOrder<-function(gr,lens, ordr = c("chr1A","chr1B", "chr1D","chr2A","chr2B", "chr2D","chr3A","chr3B", "chr3D","chr4A","chr4B", "chr4D","chr5A","chr5B", "chr5D","chr6A","chr6B", "chr6D","chr7A","chr7B", "chr7D","chrUn")){
 28 |     as_df <-data.frame(gr)
 29 |     as_df$seqnames <- factor(as.character(as_df$seqnames),levels = ordr )
 30 |     vect = lens$length
 31 |     names(vect) <-lens$seqname
 32 |     gr2 <- makeGRangesFromDataFrame(as_df,keep.extra.columns = T, seqinfo=vect)
 33 |     trim(gr2)
 34 | }
 35 | 
 36 | ```
 37 | 
 38 | 
 39 | Now we need to read the files 
 40 | 
 41 | ```{r,message=FALSE,  results='hide', echo=TRUE}
 42 | input_folder <- "/Users/ramirezr/Dropbox/JIC/SM1/jemima/block_in_pseudomolecules"
 43 | window_sizes <- c("5000kbp")
 44 | 
 45 | readAllBlocks<-function(input_folder, window_size="5000kbp", prefix="Blocks_in_pseudomolecules_Tae_", suffix=".tsv.gz", varieties=c("arinalrfor", "jagger" )){
 46 |     path<-paste0(input_folder, "/", prefix, window_size, suffix)
 47 |     df <- read.csv(file =gzfile(path), header=T, sep = "\t" )
 48 |     all_asm <- unique(df$assembly)
 49 |     missing_cultivars <- setdiff(all_asm,varieties ) 
 50 | 
 51 |     blocks_to_remove <- df[df$assembly %in% missing_cultivars , "block_no"]
 52 |     
 53 |     df <- df[!(df$block_no %in% blocks_to_remove ),]
 54 | 
 55 |     all_grs <- list()
 56 |     for(asm in unique(df$assembly)){
 57 |         tmp_df <- df[df$assembly == asm, ]
 58 |         chromosomes <- tmp_df[, c("chromosome", "chr_length")]
 59 |         colnames(chromosomes) <- c("seqnames", "length")
 60 |         chromosomes <- unique(chromosomes)
 61 |         tmp_df$chr_length <- NULL
 62 |         gr <- makeGRangesFromDataFrame(tmp_df,keep.extra.columns = T)
 63 |         gr <- fixRangesOrder(gr, chromosomes, ordr <- sort(chromosomes$seqnames))
 64 |         all_grs[asm] <- gr
 65 |     }
 66 |     all_grs
 67 | }
 68 | suppressWarnings(blocks <- readAllBlocks(input_folder)
 69 | )
 70 | ```
 71 | 
 72 | 
 73 | 
 74 | ```{r}
 75 | autoplot(blocks$arinalrfor,layout = "karyogram")
 76 | ```
 77 | 
 78 | Lets plot the block density to start: 
 79 | 
 80 | ```{r}
 81 | 
 82 | calculate_block_coverage<-function(gr2){
 83 |     v2 <- gr2    
 84 |     cov_v2 <- coverage(x = v2)   
 85 |     gr_bins <-GRanges(cov_v2)
 86 |     gr_bins$coverage <- gr_bins$score
 87 |     gr_bins$score <- NULL
 88 |     gr_bins$assembly <-  gr2$assembly[1]
 89 |     gr_bins$reference <- gr2$reference[1]
 90 |     return(gr_bins)
 91 | }
 92 | 
 93 | 
 94 | plotBlockDensity<-function(gr,tilewidth=500000, base=2){
 95 |    
 96 |     gr_bins <- calculate_block_coverage(gr)
 97 | 
 98 |     gr_bins$log_cov <- ceiling(log(gr_bins$coverage+1,base=base))
 99 |     gr_bins$log_cov <- ifelse(gr_bins$log_cov < 1 ,0, gr_bins$log_cov )
100 |     cov_ranges<-list()
101 |     for(r in sort(unique(gr_bins$log_cov))){
102 |         tmp_gr = gr_bins[gr_bins$log_cov == r]
103 |         cov_ranges[as.character(r)] = paste0(round(min(tmp_gr$coverage),2),"-", round(max(tmp_gr$coverage),2))
104 |     }
105 |     gr_bins$cov_legend <- factor(as.character(cov_ranges[as.character(gr_bins$log_cov)]), levels=cov_ranges)
106 |     asm <- unique(gr$assembly)
107 |     title <- paste0("Haplotype blocks for: " , asm )
108 |     autoplot(gr_bins,  layout = "karyogram", aes(fill=cov_legend)) + ggtitle(title) + scale_fill_brewer(type="seq", palette = "Oranges") 
109 | }
110 | 
111 | 
112 | ```
113 | 
114 | Now we are actually going to run the code for each dataset: 
115 | 
116 | 
117 | Define the different sets to read
118 | ```{r}
119 | datasets <- list() 
120 | datasets[["only_pseudomolecules_cultivar"]] <- c("arinalrfor","jagger","julius","lancer","landmark","mace","stanley","sy_mattis")
121 | datasets[["only_pseudomolecules"]] <- c("arinalrfor","chinese","jagger","julius","lancer","landmark","mace","norin61","stanley","sy_mattis","spelta")
122 | datasets[["only_cultivar"] ]<- c("arinalrfor","cadenza","claire","jagger","julius","lancer","landmark","mace","paragon","robigus","stanley","sy_mattis","weebil")
123 | datasets[["all_assemblies"]] <- c("arinalrfor","cadenza","chinese","claire","jagger","julius","lancer","landmark","mace","norin61","paragon","robigus","spelta","stanley","sy_mattis","weebil")
124 | datasets
125 | ```
126 | 
127 | ```{r}
128 | 
129 | 
130 | 
131 | 
132 | 
133 | out_df <- NULL
134 | window_sizes <- c("5000kbp","2500kbp","1000kbp")
135 | for(dataset in names(datasets)){
136 |     output_folder <- paste0(input_folder, "/",dataset)
137 |     vars <- datasets[[dataset]]
138 |     print(output_folder)
139 |     print(vars)
140 |     dir.create(output_folder)
141 |     
142 |     for(window_size in window_sizes){
143 |         print(window_size)
144 |         suppressWarnings(blocks <- readAllBlocks(input_folder,varieties=vars,window_size=window_size))
145 |         out_df <- NULL
146 |         path_pdf <- paste0(output_folder,"/blocks_coverage_",window_size,".pdf")
147 |         pdf(path_pdf,onefile = TRUE)
148 |         for(asm in blocks){
149 |             print(plotBlockDensity(asm))
150 |             tmp_df <- data.frame(calculate_block_coverage(asm))
151 |             if(is.null(output_folder)){
152 |                 out_df <- tmp_df
153 |             }else{
154 |                 out_df <- rbind(out_df, tmp_df)
155 |             }
156 |         }
157 |         dev.off()
158 |         write.csv(out_df, file=paste0(output_folder, "/blocks_coverage_",window_size,".csv"))
159 |     }
160 | }
161 | ```


--------------------------------------------------------------------------------
/IBD-regions-stats/regions-without-IBD-validate.Rmd:
--------------------------------------------------------------------------------
  1 | # Find regions that are not in IBD regions
  2 | 
  3 | From all the IBD regions. For this, I first exported the regions that are identical by descent from the MySQL database. From ther I have to 
  4 | 
  5 | 1. Split the table per assembly
  6 | 2. Fix the chromosome order and length based on each assembly. 
  7 | 3. use ```genomicRanges``` ```diff``` to find the regions that are not overlapping. 
  8 | 
  9 | ```{r, results='hide',message=FALSE}
 10 | suppressWarnings({
 11 | library(ggplot2)
 12 | library(sqldf)
 13 | library(reshape2)
 14 | library(ggbio)
 15 | library(GenomicRanges)
 16 | library(GenomicFeatures)
 17 | library(rtracklayer)
 18 | library(gridExtra)
 19 | library(grid)
 20 |     #require("bio.tilling")
 21 | })
 22 | ```
 23 | 
 24 | Loading the auxiliary functions
 25 | 
 26 | ```{r}
 27 | fixRangesOrder<-function(gr,lens, ordr = c("chr1A","chr1B", "chr1D","chr2A","chr2B", "chr2D","chr3A","chr3B", "chr3D","chr4A","chr4B", "chr4D","chr5A","chr5B", "chr5D","chr6A","chr6B", "chr6D","chr7A","chr7B", "chr7D","chrUn")){
 28 |     as_df <-data.frame(gr)
 29 |     as_df$seqnames <- factor(as.character(as_df$seqnames),levels = ordr )
 30 |     vect = lens$length
 31 |     names(vect) <-lens$seqname
 32 |     gr2 <- makeGRangesFromDataFrame(as_df,keep.extra.columns = T, seqinfo=vect)
 33 |     trim(gr2)
 34 | }
 35 | 
 36 | ```
 37 | 
 38 | 
 39 | Now we need to read the files 
 40 | 
 41 | ```{r,message=FALSE,  results='hide', echo=TRUE}
 42 | input_folder <- "/Users/ramirezr/Dropbox/JIC/SM1/jemima/block_in_pseudomolecules"
 43 | window_sizes <- c("5000kbp")
 44 | 
 45 | readAllBlocks<-function(input_folder, window_size="5000kbp", prefix="Blocks_in_pseudomolecules_Tae_", suffix=".tsv.gz", varieties=c("arinalrfor", "jagger" )){
 46 |     path<-paste0(input_folder, "/", prefix, window_size, suffix)
 47 |     df <- read.csv(file =gzfile(path), header=T, sep = "\t" )
 48 |     all_asm <- unique(df$assembly)
 49 |     missing_cultivars <- setdiff(all_asm,varieties ) 
 50 | 
 51 |     blocks_to_remove <- df[df$assembly %in% missing_cultivars , "block_no"]
 52 |     
 53 |     df <- df[!(df$block_no %in% blocks_to_remove ),]
 54 | 
 55 |     all_grs <- list()
 56 |     for(asm in unique(df$assembly)){
 57 |         tmp_df <- df[df$assembly == asm, ]
 58 |         chromosomes <- tmp_df[, c("chromosome", "chr_length")]
 59 |         colnames(chromosomes) <- c("seqnames", "length")
 60 |         chromosomes <- unique(chromosomes)
 61 |         tmp_df$chr_length <- NULL
 62 |         gr <- makeGRangesFromDataFrame(tmp_df,keep.extra.columns = T)
 63 |         gr <- fixRangesOrder(gr, chromosomes, ordr <- sort(chromosomes$seqnames))
 64 |         all_grs[asm] <- gr
 65 |     }
 66 |     all_grs
 67 | }
 68 | suppressWarnings(blocks <- readAllBlocks(input_folder)
 69 | )
 70 | ```
 71 | 
 72 | 
 73 | 
 74 | ```{r}
 75 | autoplot(blocks$arinalrfor,layout = "karyogram")
 76 | ```
 77 | 
 78 | Lets plot the block density to start: 
 79 | 
 80 | ```{r}
 81 | 
 82 | calculate_block_coverage<-function(gr2){
 83 |     v2 <- gr2    
 84 |     cov_v2 <- coverage(x = v2)   
 85 |     gr_bins <-GRanges(cov_v2)
 86 |     gr_bins$coverage <- gr_bins$score
 87 |     gr_bins$score <- NULL
 88 |     gr_bins$assembly <-  gr2$assembly[1]
 89 |     gr_bins$reference <- gr2$reference[1]
 90 |     return(gr_bins)
 91 | }
 92 | 
 93 | 
 94 | plotBlockDensity<-function(gr,tilewidth=500000, base=2){
 95 |    
 96 |     gr_bins <- calculate_block_coverage(gr)
 97 | 
 98 |     gr_bins$log_cov <- ceiling(log(gr_bins$coverage+1,base=base))
 99 |     gr_bins$log_cov <- ifelse(gr_bins$log_cov < 1 ,0, gr_bins$log_cov )
100 |     cov_ranges<-list()
101 |     for(r in sort(unique(gr_bins$log_cov))){
102 |         tmp_gr = gr_bins[gr_bins$log_cov == r]
103 |         cov_ranges[as.character(r)] = paste0(round(min(tmp_gr$coverage),2),"-", round(max(tmp_gr$coverage),2))
104 |     }
105 |     gr_bins$cov_legend <- factor(as.character(cov_ranges[as.character(gr_bins$log_cov)]), levels=cov_ranges)
106 |     asm <- unique(gr$assembly)
107 |     title <- paste0("Haplotype blocks for: " , asm )
108 |     autoplot(gr_bins,  layout = "karyogram", aes(fill=cov_legend)) + ggtitle(title) + scale_fill_brewer(type="seq", palette = "Oranges") 
109 | }
110 | 
111 | 
112 | ```
113 | 
114 | Now we are actually going to run the code for each dataset: 
115 | 
116 | 
117 | Define the different sets to read
118 | ```{r}
119 | datasets <- list() 
120 | datasets[["only_pseudomolecules_cultivar"]] <- c("arinalrfor","jagger","julius","lancer","landmark","mace","stanley","sy_mattis")
121 | datasets[["only_pseudomolecules"]] <- c("arinalrfor","chinese","jagger","julius","lancer","landmark","mace","norin61","stanley","sy_mattis","spelta")
122 | datasets[["only_cultivar"] ]<- c("arinalrfor","cadenza","claire","jagger","julius","lancer","landmark","mace","paragon","robigus","stanley","sy_mattis","weebil")
123 | datasets[["all_assemblies"]] <- c("arinalrfor","cadenza","chinese","claire","jagger","julius","lancer","landmark","mace","norin61","paragon","robigus","spelta","stanley","sy_mattis","weebil")
124 | datasets
125 | ```
126 | 
127 | ```{r}
128 | 
129 | 
130 | 
131 | 
132 | 
133 | out_df <- NULL
134 | window_sizes <- c("5000kbp","2500kbp","1000kbp")
135 | for(dataset in names(datasets)){
136 |     output_folder <- paste0(input_folder, "/validate/",dataset)
137 |     vars <- datasets[[dataset]]
138 |     print(output_folder)
139 |     print(vars)
140 |     dir.create(output_folder)
141 |     
142 |     for(window_size in window_sizes){
143 |         print(window_size)
144 |         suppressWarnings(blocks <- readAllBlocks(input_folder,varieties=vars,window_size=window_size))
145 |         out_df <- NULL
146 |         path_pdf <- paste0(output_folder,"/blocks_coverage_",window_size,".pdf")
147 |         pdf(path_pdf,onefile = TRUE)
148 |         for(asm in blocks){
149 |             print(plotBlockDensity(asm))
150 |             tmp_df <- data.frame(calculate_block_coverage(asm))
151 |             if(is.null(output_folder)){
152 |                 out_df <- tmp_df
153 |             }else{
154 |                 out_df <- rbind(out_df, tmp_df)
155 |             }
156 |         }
157 |         dev.off()
158 |         write.csv(out_df, file=paste0(output_folder, "/blocks_coverage_",window_size,".csv"))
159 |     }
160 | }
161 | ```


--------------------------------------------------------------------------------
/figures/plot_extended_data_figure_5.rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Plots for extended data 5"
  3 | author: "Jemima Brinton"
  4 | date: "12/05/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | library(ggplot2)
 11 | library(viridis)
 12 | ```
 13 | 
 14 | Want to plot the % of genome/chromosomes shared with each variety and every other variety
 15 | 
 16 | ```{r}
 17 | haplotypes <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/no_spelta_2gap/no_spelta_2gap/whole_genome_mummer_BLAST_5000000_blocks_combined_updated_ref_coords_10g_corrected_2gap_no_spelta.tsv",
 18 |                          sep = "\t",
 19 |                          header = TRUE,
 20 |                          stringsAsFactors = FALSE)
 21 | 
 22 | head(haplotypes)
 23 | 
 24 | chrom_lengths_pangenome <- read.table("W:/assemblies/releasePGSBv2.0/genome/combined_fai.txt",
 25 |                                       header = FALSE,
 26 |                                       stringsAsFactors = FALSE)
 27 | 
 28 | head(chrom_lengths_pangenome)
 29 | 
 30 | chrom_lengths_refseq <- read.table("W:/WGAv1.0/161010_Chinese_Spring_v1.0_pseudomolecules.fai", 
 31 |                                    sep = "\t", 
 32 |                                    header = FALSE,
 33 |                                    stringsAsFactors = FALSE)
 34 | 
 35 | head(chrom_lengths_refseq)
 36 | 
 37 | chrom_lengths <- rbind(chrom_lengths_pangenome[,c(1,2)], chrom_lengths_refseq[,c(1,2)])
 38 | colnames(chrom_lengths) <- c("ref_chrom", "chrom_length")
 39 | 
 40 | head(chrom_lengths)
 41 | 
 42 | #add chrom lengths to the haplotype table
 43 | 
 44 | haplotypes_chrom_len <- merge(haplotypes, chrom_lengths, all.x = TRUE, all.y = FALSE)
 45 | head(haplotypes_chrom_len)
 46 | 
 47 | #calculate each block as a percentage of chromosome length
 48 | haplotypes_chrom_len$block_size_perc <- (haplotypes_chrom_len$block_size / haplotypes_chrom_len$chrom_length)*100
 49 | head(haplotypes_chrom_len)
 50 | 
 51 | # now we want to make a summary table with each ref, query, chromosome and percentage
 52 | percentage_ref_query_chrom <- aggregate(block_size_perc ~ ref + query + chrom, data = haplotypes_chrom_len, FUN = sum)
 53 | head(percentage_ref_query_chrom)
 54 | 
 55 | #also want to calculate the whole genome value for each comparison
 56 | genome_sizes <- aggregate(chrom_length ~ ref, data = unique(haplotypes_chrom_len[,c("ref", "chrom_length")]), FUN = sum)
 57 | colnames(genome_sizes)[2] <- "genome_size"
 58 | head(genome_sizes)
 59 | 
 60 | #for the EI varieties we have an issue - since the block sizes are counted as coords in the reciprocal alignment where available, the genome size calculation is > than 21 chromosomes. Since we use the RefSeqv1.1 positions when considering EI EI varieties, we will use the chinese spring total genome size
 61 | EI_var <- c("cadenza", "claire", "paragon", "robigus", "weebil")
 62 | genome_sizes[genome_sizes$ref %in% EI_var, "genome_size"] <- genome_sizes[genome_sizes$ref == "chinese", "genome_size"]
 63 | head(genome_sizes)
 64 | 
 65 | haplotypes_chrom_len_genome <- merge(haplotypes_chrom_len, genome_sizes, all.x = TRUE, all.y = FALSE)
 66 | head(haplotypes_chrom_len_genome)
 67 | 
 68 | haplotypes_chrom_len_genome$block_size_perc_genome <- (haplotypes_chrom_len_genome$block_size / haplotypes_chrom_len_genome$genome_size)*100
 69 | head(haplotypes_chrom_len_genome)
 70 | 
 71 | # now we want to make a summary table with each ref, query and genome percentage
 72 | percentage_ref_query_genome <- aggregate(block_size_perc_genome ~ ref + query, data = haplotypes_chrom_len_genome, FUN = sum)
 73 | head(percentage_ref_query_genome)
 74 | percentage_ref_query_genome$chrom <- "genome"
 75 | colnames(percentage_ref_query_genome)[3] <- "block_size_perc"
 76 | percentage_ref_query_genome <- percentage_ref_query_genome[,c("ref", "query", "chrom", "block_size_perc")]
 77 | 
 78 | #combine the chromosome and genome into a single data frame
 79 | percentage_ref_query_chrom_genome <- rbind(percentage_ref_query_genome, percentage_ref_query_chrom)
 80 | head(percentage_ref_query_chrom_genome)
 81 | #set the y axis order for plotting
 82 | chromosomes <- unique(percentage_ref_query_chrom$chrom)
 83 | percentage_ref_query_chrom_genome$chrom <- factor(percentage_ref_query_chrom_genome$chrom, levels = c(chromosomes[order(chromosomes, decreasing = TRUE)], "genome"))
 84 | 
 85 | ```
 86 | 
 87 | Ok now we want to plot a heatmap per reference variety
 88 | ```{r}
 89 | text_size <- 4
 90 | width = 5.8
 91 | height = 6.5
 92 | res = 8000
 93 | 
 94 | refs <- unique(percentage_ref_query_chrom_genome$ref)
 95 | 
 96 | for (i in seq(1, length(refs))){
 97 |   ref <- refs[i]
 98 |   print(ref)
 99 |   ref_data <- percentage_ref_query_chrom_genome[percentage_ref_query_chrom_genome$ref == ref,]
100 |   
101 |   #want to order the ref data based on the shared genome percentage
102 |   genome_shared <- ref_data[ref_data$chrom == "genome",]
103 |   genome_shared <- genome_shared[order(genome_shared$block_size_perc),]
104 |   ref_data$query <- factor(ref_data$query, levels = genome_shared$query)
105 |   
106 |   ref_shared_plot <- ggplot(ref_data, aes(y = chrom, x = query)) +
107 |     geom_tile(aes(fill = block_size_perc)) +
108 |     theme_bw() +
109 |     theme(axis.text.x = element_text(angle = 90), panel.background = element_rect(fill = "black"), panel.grid.major = element_blank(),
110 |   panel.grid.minor = element_blank(), legend.position = "none") +
111 |     scale_fill_viridis(option = "inferno", limits = c(0,100)) +
112 |     geom_text(aes(label = round(block_size_perc, digits = 1)), colour = "gray45", size = text_size) +
113 |     ggtitle(ref)
114 |   
115 |   print(ref_shared_plot)
116 |   
117 | 
118 | #png(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/shared_genome_percentage_", ref, ".png"), width = width, height #= height, units = "in", res = res)
119 | #print(ref_shared_plot)
120 | #dev.off()
121 | 
122 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/shared_genome_percentage_", ref, ".pdf"), width = width, height = height)
123 | print(ref_shared_plot)
124 | dev.off()
125 | }
126 | 
127 | ```


--------------------------------------------------------------------------------
/figures/plot_extended_data_figure_6.rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Plots for Extended data 5 panel B"
  3 | author: "Jemima Brinton"
  4 | date: "13/05/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | library(ggplot2)
 11 | library(ggbio)
 12 | library(stringr)
 13 | library(GenomicRanges)
 14 | library(viridis)
 15 | library(scales)
 16 | 
 17 | ```
 18 | 
 19 | Want to produced the figures for extended data 5 which show the highly conserved regions for different bin sizes and include locations of agronimically important genes
 20 | 
 21 | ```{r}
 22 | shared_data_dir <- "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/unique_shared_regions/"
 23 | out_dir <- "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/"
 24 | set <- "all_assemblies"
 25 | 
 26 | set_dir <- paste0(shared_data_dir, set, "/")
 27 | 
 28 | 
 29 | chrom_lengths_pangenome <- read.table("W:/assemblies/releasePGSBv2.0/genome/combined_fai.txt",
 30 |                                       header = FALSE,
 31 |                                       stringsAsFactors = FALSE)
 32 | 
 33 | head(chrom_lengths_pangenome)
 34 | 
 35 | chrom_lengths_refseq <- read.table("W:/WGAv1.0/161010_Chinese_Spring_v1.0_pseudomolecules.fai", 
 36 |                                    sep = "\t", 
 37 |                                    header = FALSE,
 38 |                                    stringsAsFactors = FALSE)
 39 | 
 40 | head(chrom_lengths_refseq)
 41 | 
 42 | chrom_lengths <- rbind(chrom_lengths_pangenome[,c(1,2)], chrom_lengths_refseq[,c(1,2)])
 43 | colnames(chrom_lengths) <- c("seqnames", "chrom_length")
 44 | 
 45 | head(chrom_lengths)
 46 | 
 47 | important_genes <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/important_locus_locations.csv",
 48 |                               sep = ",",
 49 |                               header = TRUE,
 50 |                               stringsAsFactors = FALSE)
 51 | 
 52 | head(important_genes)
 53 | important_genes <- merge(important_genes, chrom_lengths, all.x = TRUE, all.y = FALSE)
 54 | head(important_genes)
 55 | #add scaled chromosome position
 56 | important_genes$perc_location <- ceiling((important_genes$location/important_genes$chrom_length)*1000000000)
 57 | bin_sizes <- c(2500, 1000)
 58 | ```
 59 | 
 60 | ```{r}
 61 | for (bin_size in bin_sizes){
 62 |   
 63 |   block_set_file <- paste0(set_dir, "blocks_coverage_", bin_size, "kbp.csv")
 64 |   block_set_file
 65 |   coverage <- read.table(block_set_file,
 66 |                          sep = ",",
 67 |                          header = TRUE,
 68 |                          stringsAsFactors = FALSE)
 69 |   
 70 |   
 71 |   head(coverage)
 72 |   unique(coverage$coverage)
 73 |   
 74 |   
 75 |   
 76 |   coverage_len <- merge(coverage, chrom_lengths, all.x = TRUE, all.y = FALSE)
 77 |   
 78 |   #also add a column that has generic chromosome name
 79 |   coverage_len$chromosome <- str_split_fixed(coverage_len$seqnames, "_", 2)[,1]
 80 |   head(coverage_len)
 81 |   
 82 |   #calculate each chunk as a percentage of the length of the chromosome
 83 |   coverage_len$width_perc <- (coverage_len$width/coverage_len$chrom_length)*100
 84 |   
 85 | 
 86 |   ## add the scaled chromosome positions to coverage_len_gen
 87 |   coverage_len$start_perc <- ceiling((coverage_len$start/coverage_len$chrom_length)*1000000000)
 88 |   coverage_len$end_perc <- ceiling((coverage_len$end/coverage_len$chrom_length)*1000000000)
 89 |   coverage_len$perc_chrom_length <- 1000000001
 90 |   
 91 |   
 92 | 
 93 |   # generate the  chromosome length vector for the scaled chromosome plots
 94 |   perc_chrom_length <- rep(1000000001, length(unique(coverage_len$chromosome)))
 95 |   names(perc_chrom_length) <- unique(coverage_len$chromosome)
 96 |   perc_chrom_length <- perc_chrom_length[order(names(perc_chrom_length))]
 97 |   
 98 |   
 99 |   #order the main table by chromosome and then coverage (to ensure correct order of plotting)
100 |   coverage_len_gen_ordered <- coverage_len[order(coverage_len$chromosome), ]
101 |   coverage_len_gen_ordered <- coverage_len_gen_ordered[order(coverage_len_gen_ordered$coverage),]
102 |   
103 |   # want the coverage to be discrete not continuous
104 |   coverage_len_gen_ordered$cov_discrete <- coverage_len_gen_ordered$coverage
105 |   coverage_len_gen_ordered$cov_discrete <- as.factor(as.character(coverage_len_gen_ordered$cov_discrete))
106 |   
107 |   #cap at greater than 4 for the summary plotting
108 |   coverage_len_gen_ordered_high <- coverage_len_gen_ordered[coverage_len_gen_ordered$coverage > 4,]
109 |   coverage_len_gen_ordered_high <- coverage_len_gen_ordered_high[order(coverage_len_gen_ordered_high$coverage),]
110 |   
111 | 
112 |   #Plot the highly conserved regions across all assemblies
113 |   
114 |   
115 |   #get the viridis colours so we can specify the scale to start at 5
116 |   vir_colours <- viridis(11)
117 |   names(vir_colours) <- as.character(seq(1, 11))
118 |   
119 |   coverage_len_gen_ordered_high$cov_discrete <- factor(coverage_len_gen_ordered_high$cov_discrete, levels = as.character(seq(1, 11)))
120 |   
121 |   coverage_GR_high <- GRanges(seqnames = coverage_len_gen_ordered_high$chromosome,
122 |                          ranges = IRanges(start = coverage_len_gen_ordered_high$start_perc,
123 |                                           end = coverage_len_gen_ordered_high$end_perc),
124 |                          seqlengths = perc_chrom_length,
125 |                          coverage = coverage_len_gen_ordered_high$coverage,
126 |                          cov_discrete = coverage_len_gen_ordered_high$cov_discrete)
127 |   
128 | 
129 |   important_genes_GR <- GRanges(seqnames = important_genes$seqnames,
130 |                          ranges = IRanges(start = important_genes$perc_location,
131 |                                           end = important_genes$perc_location+1),
132 |                          y_value = 1,
133 |                          y_value_label = -6,
134 |                          seqlengths = perc_chrom_length,
135 |                          IDs = important_genes$Locus) 
136 |   
137 |     summarised_conserved <- autoplot(coverage_GR_high, layout = "karyogram", aes(fill = cov_discrete, colour = cov_discrete), lwd = 0.2) +
138 |     #scale_fill_viridis(discrete = TRUE)
139 |     scale_fill_manual(values = vir_colours) +
140 |     scale_colour_manual(values = vir_colours) +
141 |       layout_karyogram(data = important_genes_GR,
142 |                        geom = "point",
143 |                        aes(x = start, y = y_value, label = IDs),
144 |                        colour = "red",
145 |                        fill = "red",
146 |                        size = 1,
147 |                        shape = 2) +
148 |       layout_karyogram(data = important_genes_GR,
149 |                        geom = "text",
150 |                        aes(x = start, y = y_value_label, label = IDs),
151 |                        colour = "black",
152 |                        size = 3)
153 |   
154 |   summarised_conserved
155 |   
156 |   
157 | 
158 |   
159 |   
160 |   width = 6
161 |   height = 6
162 |   pdf(paste0(out_dir,"extended_data_5_summarised_conserved_regions_", bin_size, "bin.pdf"), width = width, height = height)
163 |   print(summarised_conserved)
164 |   dev.off()
165 | 
166 | }
167 | ```


--------------------------------------------------------------------------------
/figures/plot_extended_data_figure_7.rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Plot 6A recombinants"
  3 | author: "Jemima Brinton"
  4 | date: "15/05/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | 
 11 | library(stringr)
 12 | library(ggplot2)
 13 | library("ggdendro")
 14 | library(plyr)
 15 | library(reshape2)
 16 | library("grid")
 17 | library(agricolae)
 18 | library("gdata")
 19 | library(RColorBrewer)
 20 | 
 21 | convert_long_01 <- function(data){
 22 |   converted <- data.frame(marker=character(), variable = character(),  value = numeric(),pos = numeric())
 23 |   markers <- unique(data$marker)
 24 | 
 25 |   for (i in seq(1, length(markers))){
 26 |     marker <- markers[i]
 27 |     marker_data <- data[data$marker == marker,]
 28 |     snp_calls <- unique(marker_data$value)
 29 |     alleles <- snp_calls[(snp_calls %in% c("A", "C", "T", "G"))]
 30 |     if (length(alleles) > 1){
 31 |     marker_data$value <- gsub(alleles[1], "0", marker_data$value)
 32 |     marker_data$value <- gsub(alleles[2], "1", marker_data$value)
 33 |     } else if (length(alleles) == 1) {
 34 |     marker_data$value <- gsub(alleles[1], "0", marker_data$value)
 35 |     print(marker)
 36 |       print(alleles)
 37 |     } else if (length(alleles) == 0 | length(alleles) >2) {
 38 |       print(marker)
 39 |       print(alleles)
 40 |     }
 41 |     converted <- rbind(converted, marker_data)
 42 |   }
 43 |   #converted$value <- gsub("N", "NA", converted$value)
 44 |   converted$value <- as.numeric(converted$value)
 45 |   return(converted)
 46 | }
 47 | 
 48 | plot_capture_clust <- function(data, start, end, outfile_prefix){
 49 |    data_zoom <- data[(data$pos > start) & (data$pos < end),]
 50 |   #print(table(data_zoom$aln_type))
 51 |   zoom_mat <- dcast(data_zoom[,c("variable", "pos", "value")], variable ~ pos)
 52 |   zoom.matrix <- as.matrix(zoom_mat[,c(2:(ncol(zoom_mat)-1))])
 53 |   dim(zoom.matrix)
 54 |   rownames(zoom.matrix) <- zoom_mat$variable
 55 |   zoom.dendro <- as.dendrogram(hclust(d=dist(x = zoom.matrix)))
 56 |   #dendro.plot <- ggdendrogram(data = zoom.dendro, rotate = TRUE)
 57 |   #ggsave(plot = dendro.plot, file = paste0(outfile_prefix, "_dendrogram.png"), dpi = 600, height = 18, width = 8)
 58 |   zoom.order <- order.dendrogram(zoom.dendro)
 59 |   data_zoom$variable <- factor(x = data_zoom$variable, levels = zoom_mat$variable[zoom.order], ordered = TRUE)
 60 |   plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) +
 61 | 		geom_tile(aes(fill = data_zoom$value)) +
 62 | 		#scale_fill_distiller(limits=c(0,1), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "allele") +
 63 |     #scale_fill_continuous(type = "viridis") +
 64 |     scale_fill_gradient(low = "#440154FF", high = "#1F968BFF", na.value = "grey50", aesthetics = "fill") +
 65 | 		theme(axis.text.x=element_text(angle=90, hjust=1)) #+
 66 |     #geom_point( aes( x = "TraesCS6A02G189400.1", y=aln_type, colour = "red" ) )
 67 |   #ggsave(plot = plot, file = paste0(outfile_prefix, "_heatmap.png"), dpi = 600, height = 8, width = 18)
 68 |   return(plot)
 69 | }
 70 | 
 71 | plot_capture_clust_AGCT <- function(data, start, end, outfile_prefix){
 72 |    data_zoom <- data[(data$pos > start) & (data$pos < end),]
 73 |   #print(table(data_zoom$aln_type))
 74 |   zoom_mat <- dcast(data_zoom[,c("variable", "pos", "value")], variable ~ pos)
 75 |   zoom.matrix <- as.matrix(zoom_mat[,c(2:(ncol(zoom_mat)-1))])
 76 |   dim(zoom.matrix)
 77 |   rownames(zoom.matrix) <- zoom_mat$variable
 78 |   zoom.dendro <- as.dendrogram(hclust(d=dist(x = zoom.matrix)))
 79 |   #dendro.plot <- ggdendrogram(data = zoom.dendro, rotate = TRUE)
 80 |   #ggsave(plot = dendro.plot, file = paste0(outfile_prefix, "_dendrogram.png"), dpi = 600, height = 18, width = 8)
 81 |   zoom.order <- order.dendrogram(zoom.dendro)
 82 |   data_zoom$variable <- factor(x = data_zoom$variable, levels = zoom_mat$variable[zoom.order], ordered = TRUE)
 83 |   plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) +
 84 | 		geom_tile(aes(fill = data_zoom$value)) +
 85 | 		#scale_fill_distiller(limits=c(0,1), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "allele") +
 86 |     #scale_fill_continuous(type = "viridis") +
 87 |     #scale_fill_gradient(low = "#440154FF", high = "#1F968BFF", na.value = "grey50", aesthetics = "fill") +
 88 | 		theme(axis.text.x=element_text(angle=90, hjust=1)) #+
 89 |     #geom_point( aes( x = "TraesCS6A02G189400.1", y=aln_type, colour = "red" ) )
 90 |   #ggsave(plot = plot, file = paste0(outfile_prefix, "_heatmap.png"), dpi = 600, height = 8, width = 18)
 91 |   return(plot)
 92 | }
 93 | ```
 94 | 
 95 | ```{r}
 96 | summarised_data <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/recombinants/anova_adjusted_means_5_year_summary_6A_recs.csv",
 97 |                               sep = ",",
 98 |                               header = TRUE, 
 99 |                               stringsAsFactors = FALSE)
100 | 
101 | head(summarised_data)
102 | 
103 | means <- aggregate(grain_width ~ group + group_call, data = summarised_data, FUN = mean)
104 | 
105 | head(means)
106 | 
107 | means$sd <- aggregate(grain_width ~ group + group_call, data = summarised_data, FUN = sd)[,3]
108 | means$count <- aggregate(grain_width ~ group + group_call, data = summarised_data, FUN = length)[,3]
109 | 
110 | means$sterr <- means$sd/(sqrt(means$count))
111 | 
112 | groups <- rev(unique(summarised_data$group))
113 | means$group <- factor(means$group, levels = groups)
114 | ```
115 | 
116 | plot bar chart of values
117 | 
118 | ```{r}
119 | 
120 | group_colours <- c("S" = "white", "SR" = "gray63", "R" = "gray15")
121 | means_plot <- ggplot(means, aes(x = grain_width, y = group, fill = group_call)) +
122 |   geom_bar(stat="identity", colour = "black", width = 0.6) + 
123 |   theme_bw() +
124 |   coord_cartesian(xlim = c(3.4, 3.8)) +
125 |   geom_errorbar(aes(xmin=grain_width-sterr, xmax=grain_width+sterr), width=.2,
126 |                  position=position_dodge(.9)) +
127 |   scale_fill_manual(values = group_colours)
128 | 
129 | means_plot
130 | ```
131 | ```{r}
132 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/recombinants/anova_adjusted_means_5_year_summary_6A.pdf", height = 6, width = 7)
133 | means_plot
134 | dev.off()
135 | 
136 | ```
137 | 
138 | Also want to plots the 35K markers across this regions for the bottom panel of the figure
139 | 
140 | ```{r}
141 | markers_6A <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/H3_35K_markers_whole6A.csv",
142 |                          sep = ",",
143 |                          header = TRUE,
144 |                          stringsAsFactors = FALSE)
145 | 
146 | haplotype_alloc <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/UKRL_H3_plotting_order.csv",
147 |                          sep = ",",
148 |                          header = TRUE,
149 |                          stringsAsFactors = FALSE)
150 | 
151 | H3_vars <- haplotype_alloc$line
152 | 
153 | colnames(markers_6A)[1] <- "marker"
154 | 
155 | markers_6A_long <- melt(markers_6A, id.vars = c("marker", "pos"))
156 | 
157 | ```
158 | 
159 | ```{r}
160 | markers_H3_AGCT <- markers_6A_long[markers_6A_long$variable %in% H3_vars,]
161 | base_colours <- c("A" = "#7570B3", "C" = "#E7298A", "G" = "#1B9E77", "T" = "#E6AB02")
162 | 
163 | 
164 | data = markers_H3_AGCT
165 | start = 23602422
166 | end = 581841760
167 | data_zoom <- data[(data$pos > start) & (data$pos < end),]
168 | data_zoom$variable <- factor(data_zoom$variable, levels = rev(H3_vars))
169 | 
170 | 
171 | plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) +
172 | 		geom_tile(aes(fill = value)) +
173 | 		theme(axis.text.x=element_text(angle=90, hjust=1)) +
174 |     scale_fill_manual(values = base_colours) +
175 |   theme_bw()
176 | 
177 | plot
178 | 
179 | height= 4
180 | width = 10
181 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/H3_35K_6Aregion.pdf", height = height, width = width)
182 | plot
183 | dev.off()
184 | ```
185 | 
186 | 


--------------------------------------------------------------------------------
/haplotype-block-assignment/get_ref_coords.r:
--------------------------------------------------------------------------------
  1 | ## Get the position of the BLAST blocks with respect to the specific reference assemblies, rather than chinese spring RefSeq
  2 | library(stringr)
  3 | 
  4 | ## define functions here
  5 | get_ref_start <- function(data, ref){
  6 |   ref_start_data <- ref_gff_gene_only[ref_gff_gene_only$cs_id == data["start_transcript"], ]
  7 |   if (nrow(ref_start_data) > 1){
  8 |     ref_start <- ref_start_data[ref_start_data$score == max(ref_start_data$score), "start"]
  9 |     if (length(ref_start) > 1){
 10 |       temp <- ref_start_data[ref_start_data$score == max(ref_start_data$score), ]
 11 |       ref_start <- temp[temp$start == min(temp$start), "start"]
 12 |     }
 13 |   } else {
 14 |     ref_start <- ref_start_data[, "start"]
 15 |   }
 16 |   ref_start
 17 | }
 18 | 
 19 | get_ref_start_transcript <- function(data, ref){
 20 |   ref_start_transcript_data <- ref_gff_gene_only[ref_gff_gene_only$cs_id == data["start_transcript"],]
 21 |   if (nrow(ref_start_transcript_data) > 1){
 22 |     ref_start_transcript <- ref_start_transcript_data[ref_start_transcript_data$score == max(ref_start_transcript_data$score), "var_id"]
 23 |     if (length(ref_start_transcript) > 1){
 24 |       temp <- ref_start_transcript_data[ref_start_transcript_data$score == max(ref_start_transcript_data$score), ]
 25 |       ref_start_transcript <- temp[temp$start == min(temp$start), "var_id"]
 26 |     }
 27 |   } else {
 28 |     ref_start_transcript <- ref_start_transcript_data[, "var_id"]
 29 |   }
 30 | }
 31 | 
 32 | get_ref_end <- function(data, ref){
 33 |   ref_end_data <- ref_gff_gene_only[ref_gff_gene_only$cs_id == data["end_transcript"], ]
 34 |   if (nrow(ref_end_data) > 1){
 35 |     ref_end <- ref_end_data[ref_end_data$score == max(ref_end_data$score), "end"]
 36 |     if (length(ref_end) > 1){
 37 |       temp <- ref_end_data[ref_end_data$score == max(ref_end_data$score), ]
 38 |       ref_end <- temp[temp$start == min(temp$start), "end"]
 39 |     }
 40 |   } else {
 41 |     ref_end <- ref_end_data[, "end"]
 42 |   }
 43 |   ref_end
 44 | }
 45 | 
 46 | get_ref_end_transcript <- function(data, ref){
 47 |   ref_end_transcript_data <- ref_gff_gene_only[ref_gff_gene_only$cs_id == data["end_transcript"], ]
 48 |   if (nrow(ref_end_transcript_data) > 1){
 49 |     ref_end_transcript <- ref_end_transcript_data[ref_end_transcript_data$score == max(ref_end_transcript_data$score), "var_id"]
 50 |     if (length(ref_end_transcript) > 1){
 51 |       temp <- ref_end_transcript_data[ref_end_transcript_data$score == max(ref_end_transcript_data$score), ]
 52 |       ref_end_transcript <- temp[temp$start == min(temp$start), "var_id"]
 53 |     }
 54 |   } else {
 55 |     ref_end_transcript <- ref_end_transcript_data[, "var_id"]
 56 |   }
 57 |   ref_end_transcript
 58 | }
 59 | 
 60 | ## end of functions
 61 | data_dir <- "X:/brintonj/haplotype/whole_genome_blast/blocks/"
 62 | 
 63 | chromosomes <- list.files(data_dir)
 64 | 
 65 | for (chr in chromosomes){
 66 |   BLAST_path <- paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp.txt")
 67 | 
 68 |   BLAST <- read.table(BLAST_path,
 69 |                     sep = "\t", 
 70 |                     header = TRUE,
 71 |                     stringsAsFactors = FALSE)
 72 | 
 73 |   head(BLAST)
 74 | 
 75 |   BLAST$ref <- str_split_fixed(BLAST$aln_type, "->", 2)[,1]
 76 |   BLAST$query <- str_split_fixed(BLAST$aln_type, "->", 2)[,2]
 77 | 
 78 |   #make the opposite ref query scenario for the blast
 79 |   BLAST_copy <- BLAST
 80 |   BLAST_copy$ref <- BLAST$query
 81 |   BLAST_copy$query <- BLAST$ref
 82 | 
 83 |   BLAST_recip <- rbind(BLAST, BLAST_copy)
 84 | 
 85 |   EI_vars <- c("cadenza", "claire", "paragon", "robigus", "weebil")
 86 |   EI_blocks <- subset(BLAST_recip, ref %in% EI_vars)
 87 | 
 88 |   references <- unique(BLAST_recip$ref)
 89 |   references <- references[!(references %in% EI_vars)]
 90 | 
 91 |   ref_gff_dir <- "W:/assemblies/releasePGSBv2.0/gff/"
 92 | 
 93 |   BLAST_coords <- data.frame(block_no=numeric(),
 94 |                            block_start = numeric(),
 95 |                            block_end = numeric(),
 96 |                            aln_type = character(),
 97 |                            start_transcript = character(),
 98 |                            end_transcript = character(),
 99 |                            window = numeric(),
100 |                            ref = character(),
101 |                            query = character(),
102 |                            ref_start = numeric(),
103 |                            ref_start_transcript = character(),
104 |                            ref_end = numeric(),
105 |                            ref_end_transcript = character())
106 |   
107 |   for (ref in references) {
108 |   
109 |     ref_gff_path <- paste0(ref_gff_dir, ref, ".gff")
110 | 
111 |     ref_gff <- read.table(ref_gff_path, 
112 |                       sep = "\t", 
113 |                       header = FALSE,
114 |                       stringsAsFactors = FALSE)
115 | 
116 | 
117 |     ref_gff_gene_only <- ref_gff[ref_gff$V3 == "gene",]
118 | 
119 |     ref_gff_gene_only <- cbind(ref_gff_gene_only, str_split_fixed(ref_gff_gene_only$V9, ";", 4))
120 |     colnames(ref_gff_gene_only) <- c("chr", "annotation", "biotype", "start", "end", "score", "strand", "missing", "info", "ref_id", "cs_id", "frame", "note")
121 | 
122 |     ref_gff_gene_only$var_id <- gsub("ID=", "", ref_gff_gene_only$ref_id)
123 |     ref_gff_gene_only$cs_id <- gsub("srcmodel=", "", ref_gff_gene_only$cs_id)
124 |     ref_gff_gene_only$cs_id <- str_split_fixed(ref_gff_gene_only$cs_id, "\\.", 2)[,1]
125 | 
126 | 
127 |     ## this bit gets the reference blocks
128 |   ## we also need to make sure that we incorportate the EI varieties because we can't get their position (scaffolds) so we will use the query position for this too
129 |     BLAST_ref <- BLAST_recip[BLAST_recip$ref == ref,]
130 |     BLAST_ref <- rbind(BLAST_ref, EI_blocks[EI_blocks$query == ref,])
131 |     BLAST_ref <- BLAST_ref[complete.cases(BLAST_ref),]
132 | 
133 |     BLAST_ref$ref_start <- apply(BLAST_ref, 1, FUN=get_ref_start, ref = ref)
134 |     BLAST_ref$ref_start_transcript <- apply(BLAST_ref, 1, FUN=get_ref_start_transcript, ref = ref)
135 |     BLAST_ref$ref_end <- apply(BLAST_ref, 1, FUN=get_ref_end, ref = ref)
136 |     BLAST_ref$ref_end_transcript <- apply(BLAST_ref, 1, FUN=get_ref_end_transcript, ref = ref)
137 | 
138 |     BLAST_coords <- rbind(BLAST_coords, BLAST_ref)
139 |   }
140 | 
141 |   BLAST_coords$block_no <- as.character(BLAST_coords$block_no)
142 |   BLAST_coords$block_start <- as.numeric(as.character(BLAST_coords$block_start))
143 |   BLAST_coords$block_end <- as.numeric(as.character(BLAST_coords$block_end))
144 |   BLAST_coords$aln_type <- as.character(BLAST_coords$aln_type)
145 |   BLAST_coords$start_transcript <- as.character(BLAST_coords$start_transcript)
146 |   BLAST_coords$end_transcript <- as.character(BLAST_coords$end_transcript)
147 |   BLAST_coords$window <- as.numeric(as.character(BLAST_coords$window))
148 |   BLAST_coords$ref <- as.character(BLAST_coords$ref)
149 |   BLAST_coords$query <- as.character(BLAST_coords$query)
150 |   BLAST_coords$ref_start <- as.numeric(as.character(BLAST_coords$ref_start))
151 |   BLAST_coords$ref_start_transcript <- as.character(BLAST_coords$ref_start_transcript)
152 |   BLAST_coords$ref_end <- as.numeric(as.character(BLAST_coords$ref_end))
153 |   BLAST_coords$ref_end_transcript <- as.character(BLAST_coords$ref_end_transcript)
154 |   
155 |   write.table(BLAST_coords,
156 |               file = paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp_ref_coords.txt"),
157 |               sep = "\t",
158 |               col.names = TRUE,
159 |               row.names = FALSE,
160 |               quote = FALSE)
161 | }
162 | 
163 | ## Unfortunately this misses out EI EI comparisons, so we need to add them back
164 | for (chr in chromosomes){
165 |   BLAST_path <- paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp.txt")
166 |   
167 |   BLAST <- read.table(BLAST_path,
168 |                       sep = "\t", 
169 |                       header = TRUE,
170 |                       stringsAsFactors = FALSE)
171 | 
172 |   BLAST_coords_path <- paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp_ref_coords.txt")
173 |   
174 |   BLAST_coords <- read.table(BLAST_coords_path,
175 |                       sep = "\t", 
176 |                       header = TRUE,
177 |                       stringsAsFactors = FALSE)
178 |   
179 |   BLAST$ref <- str_split_fixed(BLAST$aln_type, "->", 2)[,1]
180 |   BLAST$query <- str_split_fixed(BLAST$aln_type, "->", 2)[,2]
181 | 
182 |   EI_BLAST <- BLAST[BLAST$ref %in% EI_vars & BLAST$query %in% EI_vars,]  
183 |   
184 |   EI_BLAST$ref_start <- EI_BLAST$block_start
185 |   EI_BLAST$ref_start_transcript <- EI_BLAST$start_transcript
186 |   EI_BLAST$ref_end <- EI_BLAST$block_end
187 |   EI_BLAST$ref_end_transcript <- EI_BLAST$end_transcript
188 |   
189 |   EI_recip <- EI_BLAST
190 |   EI_recip$ref <- EI_BLAST$query
191 |   EI_recip$query <- EI_BLAST$ref
192 |   EI_recip <- rbind(EI_BLAST, EI_recip)
193 |   
194 |   BLAST_coords <- rbind(BLAST_coords, EI_recip)
195 |   
196 |   write.table(BLAST_coords,
197 |               file = paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp_ref_coords.txt"),
198 |               sep = "\t",
199 |               col.names = TRUE,
200 |               row.names = FALSE,
201 |               quote = FALSE)
202 | }
203 | 


--------------------------------------------------------------------------------
/haplotype-block-assignment/combine_mummer_and_BLAST.r:
--------------------------------------------------------------------------------
  1 | ## Script written by Jemima Brinton 2019
  2 | ## Aim - combine haplotype blocks from nucmer and BLAST alignments. First use mummer/nucmer blocks, then check BLAST blocks - any not represented by the mummer blocks and >= bin size should be added to the final set 
  3 | 
  4 | library(GenomicRanges)
  5 | ##functions
  6 | 
  7 | check_block <- function(data, EI_vars, mummer_blocks){
  8 |   #print(data)
  9 |   if ((data["ref"] %in% EI_vars) & (data["query"] %in% EI_vars)){
 10 |     count_overlaps <- 0
 11 |   } else {
 12 |     if ((data["ref"] %in% EI_vars)){
 13 |       mummer_gdf <- mummer_blocks[mummer_blocks$query == data["ref"] & mummer_blocks$ref == data["query"], c("chrom", "block_start", "block_end")]
 14 |     } else {
 15 |       mummer_gdf <- mummer_blocks[mummer_blocks$query == data["query"] & mummer_blocks$ref == data["ref"], c("chrom", "block_start", "block_end")]
 16 |     }
 17 |     if(nrow(mummer_gdf)>0){
 18 |       colnames(mummer_gdf) <- c("chr", "start", "end")
 19 |       #print(mummer_gdf)
 20 |       mummer_gr <- makeGRangesFromDataFrame(mummer_gdf)
 21 |     
 22 |       to_check_gdf <- data.frame(chr = character(), start = numeric(), end = numeric(), stringsAsFactors = FALSE)
 23 |       to_check_gdf[1,"chr"] <- data["chrom"]
 24 |       to_check_gdf[1,"start"] <- data[("ref_start")]
 25 |       to_check_gdf[1,"end"] <- data[("ref_end")]
 26 |       #print(to_check_gdf)
 27 |     
 28 |       to_check_gr <- makeGRangesFromDataFrame(to_check_gdf)
 29 |     
 30 |       overlaps <- findOverlaps(to_check_gr, mummer_gr)
 31 |       count_overlaps <- length(unique(subjectHits(overlaps)))
 32 |     } else {
 33 |       count_overlaps <- 0
 34 |     }
 35 |   }
 36 |   return(count_overlaps)
 37 |   #print("finish")
 38 | }
 39 | 
 40 | ###
 41 | 
 42 | mummer_dir <- "X:/brintonj/haplotype/whole_genome_mummer/blocks/"
 43 | BLAST_dir <- "X:/brintonj/haplotype/whole_genome_blast/blocks/"
 44 | 
 45 | chromosomes <- list.files(mummer_dir)
 46 | 
 47 | bin_size <- 5000000
 48 | 
 49 | combined_blocks <- data.frame(ref = character(),
 50 |                               query = character(),
 51 |                               chrom = character(),
 52 |                               ref_start = numeric(),
 53 |                               ref_end = numeric(),
 54 |                               ref_start_transcript = character(),
 55 |                               ref_end_transcript = character(),
 56 |                               cs_start = numeric(),
 57 |                               cs_end = numeric(),
 58 |                               cs_start_transcript = character(),
 59 |                               cs_end_transcript = character(),
 60 |                               source = character())
 61 | 
 62 | for (chr in chromosomes){
 63 |   mummer_blocks <- read.table(paste0(mummer_dir, chr, "/mummer_blocks_chr", chr, ".min20000.5Mb_bins.txt"),
 64 |                               header = TRUE, 
 65 |                               sep = "\t",
 66 |                               stringsAsFactors = FALSE)
 67 |   
 68 |   BLAST_blocks <- read.table(paste0(BLAST_dir, "chr", chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp_ref_coords.txt"),
 69 |                              header = TRUE, 
 70 |                              sep = "\t",
 71 |                              stringsAsFactors = FALSE)
 72 |   
 73 |   #filter the BLAST for blocks only above the mummer bin size
 74 |   #remove spelta
 75 |   BLAST_blocks$chrom <- paste0("chr", chr)
 76 |   BLAST_blocks <- BLAST_blocks[!(BLAST_blocks$ref == "spelta" | BLAST_blocks$query == "spelta"),]
 77 |   BLAST_blocks$block_size <- BLAST_blocks$ref_end - BLAST_blocks$ref_start
 78 |   BLAST_bin <- BLAST_blocks[BLAST_blocks$block_size >= bin_size,]
 79 |   BLAST_bin <- BLAST_bin[complete.cases(BLAST_bin$block_start),]
 80 | 
 81 |   EI_vars <- c("cadenza", "claire", "paragon", "robigus", "weebil")
 82 |   BLAST_bin$overlap <- apply(BLAST_bin, 1, check_block, EI_vars = EI_vars, mummer_blocks = mummer_blocks)
 83 |   BLAST_bin$source <- "BLAST"
 84 |   BLAST_bin$in_reciprocal <- NA
 85 |   
 86 |   mummer_blocks$source <- "mummer"
 87 |   
 88 |   BLAST_to_add <- BLAST_bin[BLAST_bin$overlap == 0, c("ref", 
 89 |                                                       "query", 
 90 |                                                       "chrom", 
 91 |                                                       "ref_start", 
 92 |                                                       "ref_end", 
 93 |                                                       "ref_start_transcript", 
 94 |                                                       "ref_end_transcript",
 95 |                                                       "block_start", 
 96 |                                                       "block_end",
 97 |                                                       "start_transcript",
 98 |                                                       "end_transcript",
 99 |                                                       "source")]
100 |   
101 |   colnames(BLAST_to_add)[c(8:11)] <- c("cs_start", "cs_end", "cs_start_transcript", "cs_end_transcript")
102 |   
103 |   mummer_blocks$cs_start <- NA
104 |   mummer_blocks$cs_end <- NA
105 |   mummer_blocks$cs_start_transcript <- NA
106 |   mummer_blocks$cs_end_transcript <- NA
107 |   mummer_blocks$ref_start_transcript <- NA
108 |   mummer_blocks$ref_end_transcript <- NA
109 |   
110 |   mummer_to_add <- mummer_blocks[, c("ref", 
111 |                                      "query", 
112 |                                      "chrom", 
113 |                                      "block_start", 
114 |                                      "block_end", 
115 |                                      "ref_start_transcript", 
116 |                                      "ref_end_transcript",
117 |                                      "cs_start", 
118 |                                      "cs_end",
119 |                                      "cs_start_transcript",
120 |                                      "cs_end_transcript",
121 |                                      "source")]
122 | colnames(mummer_to_add)[c(4,5)] <- c("ref_start", "ref_end")
123 | 
124 | mummer_BLAST <- rbind(mummer_to_add, BLAST_to_add)
125 | 
126 | combined_blocks <- rbind(combined_blocks, mummer_BLAST)
127 | }
128 | 
129 | write.table(combined_blocks, file = "X:/brintonj/haplotype/whole_genome_mummer_BLAST_5mbp_blocks_combined.tsv", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
130 | 
131 | ref_coords_only <- combined_blocks[,c(1:5, 12)]
132 | 
133 | write.table(ref_coords_only, file = "X:/brintonj/haplotype/whole_genome_mummer_BLAST_5mbp_blocks_combined_ref_coords.tsv", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
134 | 
135 | ## we also want to format the dataframe with the blocks being given a unique identifier
136 | ## columns we want: assembly chromosome start end block_no assembly_coords
137 | 
138 | EI_vars <- c("cadenza", "claire", "paragon", "robigus", "weebil")
139 | 
140 | ref_coords_only_copy <- ref_coords_only
141 | 
142 | ref_coords_only_copy$block_no <- NA
143 | ref_coords_only_copy$assembly_coords <- NA
144 | 
145 | ref_coords_only_copy$block_size <- ref_coords_only_copy$ref_end - ref_coords_only_copy$ref_start
146 | ref_coords_only_copy <- ref_coords_only_copy[order(ref_coords_only_copy$block_size, decreasing = TRUE),]
147 | 
148 | block_no <- 1
149 | 
150 | 
151 | for(i in seq(1, nrow(ref_coords_only_copy))){
152 |   data <- ref_coords_only_copy[i,]
153 |   
154 |   if(is.na(ref_coords_only_copy[i, "block_no"])){
155 |     ref <- ref_coords_only_copy[i,"ref"]
156 |     query <- ref_coords_only_copy[i,"query"]
157 |     chrom <- ref_coords_only_copy[i,"chrom"]
158 |     
159 |     ref_coords_only_copy[i,"block_no"] <- block_no
160 |     
161 |     matching_rows <- which(((ref_coords_only_copy$ref == query) &
162 |                               (ref_coords_only_copy$query == ref) &
163 |                              (ref_coords_only_copy$chrom == chrom)) & (((ref_coords_only_copy$ref_start >= data$ref_start & ref_coords_only_copy$ref_start <= data$ref_end) |
164 |                             (ref_coords_only_copy$ref_end >= data$ref_start & ref_coords_only_copy$ref_end <= data$ref_end)) | 
165 |                             ((data$ref_start >= ref_coords_only_copy$ref_start & data$ref_start <= ref_coords_only_copy$ref_end) |
166 |                             (data$ref_end >= ref_coords_only_copy$ref_start & data$ref_end <= ref_coords_only_copy$ref_end))))
167 | 
168 |     
169 |     if(length(matching_rows) > 0){
170 |       ref_coords_only_copy[matching_rows, "block_no"] <- block_no
171 |     } else if (query %in% EI_vars) {
172 |       last_line <- nrow(ref_coords_only_copy)
173 |       ref_coords_only_copy <- rbind(ref_coords_only_copy, ref_coords_only_copy[i,])
174 |       ref_coords_only_copy[last_line+1,"block_no"] <- block_no
175 |       ref_coords_only_copy[last_line+1,"ref"] <- query
176 |       ref_coords_only_copy[last_line+1,"query"] <- ref
177 |       ref_coords_only_copy[last_line+1,"assembly_coords"] <- ref
178 |     }
179 |     if(query %in% EI_vars & ref %in% EI_vars){
180 |       ref_coords_only_copy[i,"assembly_coords"] <- "IWGSCv1.1"
181 |       ref_coords_only_copy[matching_rows,"assembly_coords"] <- "IWGSCv1.1"
182 |     } else if (query %in% EI_vars) {
183 |       ref_coords_only_copy[i,"assembly_coords"] <- ref
184 |       ref_coords_only_copy[matching_rows,"assembly_coords"] <- ref
185 |     } else if (ref %in% EI_vars) {
186 |       ref_coords_only_copy[i,"assembly_coords"] <- query
187 |       ref_coords_only_copy[matching_rows,"assembly_coords"] <- query
188 |     } else {
189 |       ref_coords_only_copy[i,"assembly_coords"] <- ref
190 |       ref_coords_only_copy[matching_rows,"assembly_coords"] <- query
191 |     }
192 |     block_no <- block_no + 1
193 |   } else {
194 |     print("sorted")
195 |   }
196 | }
197 | 
198 | ref_coords_only_copy <- ref_coords_only_copy[order(ref_coords_only_copy$chrom, ref_coords_only_copy$ref, ref_coords_only_copy$ref_start),]
199 | 
200 | #combine with the name of the ref chroms
201 | 
202 | ref_suffix <- read.table("X:/brintonj/haplotype/assemblies_suffix.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
203 | ref_coords_only_copy$assembly_chrom <- ref_coords_only_copy$chrom
204 | 
205 | for (i in seq(1, nrow(ref_coords_only_copy))){
206 |   assembly <- ref_coords_only_copy[i, "assembly_coords"]
207 |   chrom <- ref_coords_only_copy[i, "chrom"]
208 |   if (assembly == "IWGSCv1.1"){
209 |   } else {
210 |     ref_coords_only_copy[i, "assembly_chrom"] <- paste0(chrom, "__", ref_suffix[ref_suffix$name == assembly, "suffix"])
211 |   }
212 | }
213 | 
214 | colnames(ref_coords_only_copy)[8] <- "ref_assembly"
215 | colnames(ref_coords_only_copy)[10] <- "ref_chrom"
216 | 
217 | write.table(ref_coords_only_copy, file = "X:/brintonj/haplotype/whole_genome_mummer_BLAST_5mbp_blocks_combined_ref_coords_block_numbers.tsv", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
218 | 
219 | 


--------------------------------------------------------------------------------
/precision-recall-analysis/calculate_combined_precision_recall.r:
--------------------------------------------------------------------------------
  1 | #Jemima Brinton 2020
  2 | #Script to carry out the precision recall calculations for the mummer and blast blocks
  3 | 
  4 | library(GenomicRanges)
  5 | library(ggplot2)
  6 | ## Functions
  7 | 
  8 | read_BLAST_remove_EI <- function(blast_blocks_path, chrom, EI_vars, gene_window_sizes){
  9 |   blast_blocks <- data.frame(assembly = character(),
 10 |                              reference = character(),
 11 |                              chromosome	= character(),
 12 |                              start = numeric(),
 13 |                              end = numeric(),
 14 |                              block_no = numeric(),
 15 |                              chr_length = numeric(),
 16 |                              window_size = numeric(),
 17 |                              gene_window = numeric())
 18 |   
 19 |   for (gene_window in gene_window_sizes){
 20 |     temp <- read.table(paste0(blast_blocks_path, "coord_converted.chr", chrom, "_haplotype_blocks_aln_", gene_window, "_gene_window_drop", ceiling(gene_window*0.1), ".tsv"),
 21 |                        sep = "\t",
 22 |                        header = TRUE,
 23 |                        stringsAsFactors = FALSE)
 24 |     
 25 |     temp$gene_window <- gene_window
 26 |     temp$window_block <- paste0(temp$block_no, "_", gene_window, "gene")
 27 |     blast_blocks <- rbind(blast_blocks, temp)
 28 |   }
 29 |   
 30 |   #remove EI v EI comparisons as we can't compare with mummer
 31 |   
 32 |   block_numbers <- unique(blast_blocks$window_block)
 33 |   
 34 |   blocks_to_remove <- character()
 35 |   
 36 |   for (block_number in block_numbers){
 37 |     assemblies <- unique(blast_blocks[blast_blocks$window_block == block_number, "assembly"])
 38 |     if (length(assemblies) > 2){
 39 |       print("ERROR")
 40 |       print(block_number)
 41 |       print(assemblies)
 42 |     } else if((assemblies[1] %in% EI_vars) & (assemblies[2] %in% EI_vars)){
 43 |       blocks_to_remove <- c(blocks_to_remove, block_number)
 44 |       #print(block_number)
 45 |       #print(assemblies)
 46 |       
 47 |     }
 48 |   }
 49 |   
 50 |   blast_blocks_to_use <- blast_blocks[!(blast_blocks$window_block %in% blocks_to_remove),]
 51 |   blast_blocks_to_use[blast_blocks_to_use$window_size == "cds", "window_size"] <- "cdsbp"
 52 |   return(blast_blocks_to_use)
 53 | }
 54 | 
 55 | precision_recall_flanking_window_sizes <- function(overall_blocks_to_use, mummer){
 56 |   precision_recall <- data.frame(aln_type = character(), window_size = character(), BLAST_blocks = numeric(), BLAST_found = numeric(), mummer_blocks = numeric(), mummer_found = numeric(), precision = numeric(), recall = numeric())
 57 |   
 58 |   window_sizes <- unique(overall_blocks_to_use$window_size)
 59 |   alns <- unique(mummer$aln_type)
 60 |   
 61 |   for (window_size in window_sizes){
 62 |     window_overall_blocks <- overall_blocks_to_use[overall_blocks_to_use$window_size == window_size,]
 63 |     block_numbers_window <- unique(window_overall_blocks$block_no)
 64 |     for (aln in alns){
 65 |       mummer_aln <- mummer[mummer$aln_type == aln,]
 66 |       
 67 |       ref <- unique(mummer_aln$ref)
 68 |       query <- unique(mummer_aln$query)
 69 |       
 70 |       aln_blocks <- numeric()
 71 |       #get the corresponding blast block IDs
 72 |       for (block_number in block_numbers_window){
 73 |         assemblies <- unique(window_overall_blocks[window_overall_blocks$block_no == block_number, "assembly"])
 74 |         if((ref %in% assemblies) & (query %in% assemblies)){
 75 |           aln_blocks <- c(aln_blocks, block_number)
 76 |         }
 77 |       }
 78 |       
 79 |       window_aln <- window_overall_blocks[(window_overall_blocks$block_no %in% aln_blocks) & (window_overall_blocks$assembly == ref),]
 80 |       
 81 |       
 82 |       mummer_blocks <- nrow(mummer_aln)
 83 |       BLAST_blocks <- nrow(window_aln)
 84 |       
 85 |       if(mummer_blocks == 0 | BLAST_blocks == 0){
 86 |         BLAST_found <- 0
 87 |         mummer_found <- 0
 88 |       } else {
 89 |         mummer_gdf <- mummer_aln[,c("ref_chrom", "block_start", "block_end")]
 90 |         colnames(mummer_gdf) <- c("chr", "start", "end")
 91 |         
 92 |         window_gdf <- window_aln[,c("chromosome", "start", "end")]
 93 |         colnames(window_gdf) <- c("chr", "start", "end")
 94 |         
 95 |         # make genomic ranges objects for each
 96 |         mummer_gr <- makeGRangesFromDataFrame(mummer_gdf)
 97 |         window_gr <- makeGRangesFromDataFrame(window_gdf)
 98 |         
 99 |         overlaps <- findOverlaps(mummer_gr, window_gr)
100 |         #precision first - how many BLAST blocks found in the  mummer haplotype?
101 |         BLAST_found <- length(unique(subjectHits(overlaps)))
102 |         #recall - how many mummer blocks found in the  BLAST haplotype?
103 |         mummer_found <- length(unique(queryHits(overlaps)))
104 |       }
105 |       
106 |       precision <- BLAST_found/BLAST_blocks
107 |       recall <- mummer_found/mummer_blocks
108 |       precision_recall_aln <- data.frame(aln_type = aln, window_size, BLAST_blocks, BLAST_found, mummer_blocks, mummer_found, precision, recall)
109 |       precision_recall <- rbind(precision_recall, precision_recall_aln)
110 |     }
111 |   }
112 |   return(precision_recall)
113 | }
114 | 
115 | f1 <- function(precision, recall){
116 |   f1_score <- 2*((precision*recall)/(precision+recall))
117 |   return(f1_score)
118 | }
119 | 
120 | precision_recall_all_windows <- function(blast_blocks_to_use, gene_window_sizes){
121 |   precision_recall_all_blocks <- data.frame(aln_type = character(), 
122 |                                             window_size = character(), 
123 |                                             BLAST_blocks = numeric(), 
124 |                                             BLAST_found = numeric(), 
125 |                                             mummer_blocks = numeric(), 
126 |                                             mummer_found = numeric(), 
127 |                                             precision = numeric(), 
128 |                                             recall = numeric(),
129 |                                             gene_block = character())
130 |   
131 |   
132 |   
133 |   for (gene_window in gene_window_sizes){
134 |     gene_window_data <- blast_blocks_to_use[blast_blocks_to_use$gene_window == gene_window,]
135 |     
136 |     
137 |     precision_recall_block <- precision_recall_flanking_window_sizes(overall_blocks_to_use = gene_window_data, mummer = mummer_blocks)
138 |     
139 |     precision_recall_block$gene_block <- gene_window
140 |     precision_recall_all_blocks <- rbind(precision_recall_all_blocks, precision_recall_block)
141 |   }
142 |   
143 |   
144 |   
145 |   precision_recall_all_blocks$window_size <- factor(precision_recall_all_blocks$window_size, levels = c("cdsbp", "0bp", "1000bp", "2000bp", "5000bp"))
146 |   precision_recall_all_blocks$gene_block <- as.character(precision_recall_all_blocks$gene_block)
147 |   precision_recall_all_blocks$gene_block <- factor(precision_recall_all_blocks$gene_block, levels = c("10", "15", "20", "25", "30"))
148 |   
149 |   precision_recall_all_blocks$f1_score <- f1(precision = precision_recall_all_blocks$precision, recall = precision_recall_all_blocks$recall)
150 |   return(precision_recall_all_blocks)
151 | }
152 | 
153 | plots_precision_recall <- function(precision_recall_all_blocks, chrom, plot_dir){
154 |   precision <- ggplot(precision_recall_all_blocks, aes(x = window_size, y = precision, fill = gene_block)) +
155 |     geom_boxplot() +
156 |     scale_fill_brewer(palette = "Set1")
157 |   
158 |   ggsave(precision, file = paste0(plot_dir, "chr", chrom, "precision_window_sizes_varying_gene_blocks.png"), 
159 |          height = 3, width = 6)
160 |   
161 |   recall <- ggplot(precision_recall_all_blocks, aes(x = window_size, y = recall, fill = gene_block)) +
162 |     geom_boxplot() +
163 |     scale_fill_brewer(palette = "Set1")
164 |   
165 |   ggsave(recall, file = paste0(plot_dir, "chr", chrom, "recall_window_sizes_varying_gene_blocks.png"), 
166 |          height = 3, width = 6)
167 |   
168 |   f1 <- ggplot(precision_recall_all_blocks, aes(x = window_size, y = f1_score, fill = gene_block)) +
169 |     geom_boxplot() +
170 |     scale_fill_brewer(palette = "Set1")
171 |   
172 |   ggsave(f1, file = paste0(plot_dir, "chr", chrom, "f1_score_window_sizes_varying_gene_blocks.png"),
173 |          height = 3, width = 6)
174 |   
175 | }
176 | ## end of functions
177 | 
178 | base_dir <- "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/precision_recall/"
179 | data_dir <- paste0(base_dir, "all_chrom_data_to_use/")
180 | plot_dir <- paste0(base_dir, "plots/")
181 | 
182 | dir.create(plot_dir)
183 | 
184 | mummer_blocks_path <- "X:/brintonj/haplotype/whole_genome_mummer/blocks/"
185 | 
186 | EI_vars <- c("cadenza", "claire", "paragon", "robigus", "weebil")
187 | 
188 | mummer_bin_size <- 5000000
189 | 
190 | gene_window_sizes <- c(10, 15, 20, 25,30)
191 | 
192 | chromosomes <- c("1D", "2B", "3B", "4D", "6A", "7A")
193 | 
194 | precision_recall_all_blocks_all_chroms <- data.frame(aln_type = character(), 
195 |                                           window_size = character(), 
196 |                                           BLAST_blocks = numeric(), 
197 |                                           BLAST_found = numeric(), 
198 |                                           mummer_blocks = numeric(), 
199 |                                           mummer_found = numeric(), 
200 |                                           precision = numeric(), 
201 |                                           recall = numeric(),
202 |                                           gene_block = character(),
203 |                                           f1 = numeric(),
204 |                                           chrom = character())
205 | 
206 | for (chrom in chromosomes){
207 |   print(chrom)
208 |   #read in mummer blocks
209 |   mummer_blocks <- read.table(paste0(mummer_blocks_path, chrom, "/mummer_blocks_chr", chrom, ".min20000.5Mb_bins.txt"),
210 |                               header = TRUE, 
211 |                               stringsAsFactors = FALSE)
212 |   
213 |   mummer_blocks$aln_type <- paste0(mummer_blocks$ref, "->", mummer_blocks$query)
214 |   
215 |   #read in blast blocks
216 |   blast_blocks_to_use <- read_BLAST_remove_EI(blast_blocks_path = data_dir,
217 |                                               chrom = chrom,
218 |                                               EI_vars = EI_vars,
219 |                                               gene_window_sizes = gene_window_sizes)
220 |   
221 |   precision_recall_all_blocks <- precision_recall_all_windows(blast_blocks_to_use = blast_blocks_to_use,
222 |                                                               gene_window_sizes = gene_window_sizes)
223 |   
224 |   write.table(precision_recall_all_blocks, file = paste0(plot_dir, "chr", chrom, "precision_recall_f1_table.tsv"),
225 |               sep = "\t",
226 |               col.names = TRUE,
227 |               row.names = FALSE,
228 |               quote = FALSE)
229 |   
230 |   plots_precision_recall(precision_recall_all_blocks = precision_recall_all_blocks,
231 |                          chrom = chrom,
232 |                          plot_dir = plot_dir)
233 |   
234 |   precision_recall_all_blocks$chrom <- chrom
235 |   
236 |   precision_recall_all_blocks_all_chroms <- rbind(precision_recall_all_blocks_all_chroms, precision_recall_all_blocks)
237 |   
238 | }
239 | 
240 | write.table(precision_recall_all_blocks_all_chroms, file = paste0(plot_dir, "combined_precision_recall_f1_table.tsv"),
241 |             sep = "\t",
242 |             col.names = TRUE,
243 |             row.names = FALSE,
244 |             quote = FALSE)
245 | 
246 | plots_precision_recall(precision_recall_all_blocks = precision_recall_all_blocks_all_chroms,
247 |                        chrom = "combined",
248 |                        plot_dir = plot_dir)
249 | 
250 | 


--------------------------------------------------------------------------------
/precision-recall-analysis/calculate_BLAST_blocks_all_window_sizes_flanking_sequences.r:
--------------------------------------------------------------------------------
  1 | library(stringr)
  2 | library(plyr)
  3 | library(reshape2)
  4 | 
  5 | ## functions
  6 | 
  7 | read_pairwise_position <- function(blast_path_gz, gtf, outfile){
  8 |   all_comp <- read.table(gzfile(blast_path_gz), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  9 |   
 10 |   head(all_comp)
 11 |   
 12 |   #get the refseq position of the genes
 13 |   gtf$transcript_id <- str_split_fixed(gtf$V9, ";", 2)[,1]
 14 |   gtf$transcript_id <- gsub("transcript_id ", "", gtf$transcript_id)
 15 |   
 16 |   gene_only <- gtf[gtf$V3 == "gene",]
 17 |   
 18 |   gene_positions <- gene_only[,c("transcript_id", "V1", "V4",  "V5", "V7")]
 19 |   colnames(gene_positions) <- c("transcript", "chr", "start", "end", "strand")
 20 |   gene_positions$transcript <- gsub("ID=", "", gene_positions$transcript)
 21 |   
 22 |   #now add positions to the pairwise comparison file
 23 |   
 24 |   all_comp_positions <- merge(all_comp, gene_positions, all.x = TRUE, all.y = FALSE)
 25 |   head(all_comp_positions)
 26 |   
 27 |   write.table(all_comp_positions, file = outfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
 28 |   return(all_comp_positions)
 29 | }
 30 | 
 31 | separate_output_identities <- function(comp_data, outdir, suffix){
 32 |   varieties <- unique(comp_data$var_query)
 33 |   for (i in seq(1, length(varieties))){
 34 |     variety <- varieties[i]
 35 |     print(variety)
 36 |     variety_data <- comp_data[grep(variety, comp_data$aln_type),]
 37 |     write.table(variety_data, file = paste0(outdir, variety, suffix, ".tab"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
 38 |   }
 39 | }
 40 | 
 41 | read_pairwise_position_no_header <- function(blast_path_gz, gtf, outfile, col_names){
 42 |   all_comp <- read.table(gzfile(blast_path_gz), sep = "\t", header = FALSE, stringsAsFactors = FALSE)
 43 |   colnames(all_comp) <- col_names
 44 |   head(all_comp)
 45 |   
 46 |   #get the refseq position of the genes
 47 |   gtf$transcript_id <- str_split_fixed(gtf$V9, ";", 2)[,1]
 48 |   gtf$transcript_id <- gsub("transcript_id ", "", gtf$transcript_id)
 49 |   
 50 |   gene_only <- gtf[gtf$V3 == "gene",]
 51 |   
 52 |   gene_positions <- gene_only[,c("transcript_id", "V1", "V4",  "V5", "V7")]
 53 |   colnames(gene_positions) <- c("transcript", "chr", "start", "end", "strand")
 54 |   gene_positions$transcript <- gsub("ID=", "", gene_positions$transcript)
 55 |   
 56 |   #now add positions to the pairwise comparison file
 57 |   
 58 |   all_comp_positions <- merge(all_comp, gene_positions, all.x = TRUE, all.y = FALSE)
 59 |   head(all_comp_positions)
 60 |   
 61 |   write.table(all_comp_positions, file = outfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
 62 |   return(all_comp_positions)
 63 | }
 64 | 
 65 | separate_output_identities <- function(comp_data, outdir, suffix){
 66 |   varieties <- unique(comp_data$var_query)
 67 |   for (i in seq(1, length(varieties))){
 68 |     variety <- varieties[i]
 69 |     print(variety)
 70 |     variety_data <- comp_data[grep(variety, comp_data$aln_type),]
 71 |     write.table(variety_data, file = paste0(outdir, variety, suffix, ".tab"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
 72 |   }
 73 | }
 74 | calculate_pid_windows <- function(aln_data, window_size = 20, drop_no = 2){
 75 |   aln_data <- aln_data[order(aln_data$start),]
 76 |   aln <- unique(aln_data$aln_type)
 77 |   mean_pidents <- data.frame(start = numeric(), end = numeric(), start_position = numeric(), end_position = numeric(), pident_mean = numeric(), aln_type = character()) 
 78 |   
 79 |   for(j in seq(1, nrow(aln_data)-window_size)){
 80 |     window_start = j
 81 |     start_position = aln_data[window_start, "start"]
 82 |     window_end = (window_start + window_size) - 1
 83 |     end_position = aln_data[window_end, "end"]
 84 |     pidents <- aln_data[c(window_start:window_end), "pident"]
 85 |     pidents <- pidents[order(pidents, decreasing = TRUE)]
 86 |     pidents_sub <- pidents[c(1:(length(pidents)-drop_no))]
 87 |     pident_mean <- mean(pidents_sub)
 88 |     aln_to_add = data.frame(start = window_start, 
 89 |                             end = window_end, 
 90 |                             start_position = start_position, 
 91 |                             end_position = end_position, 
 92 |                             pident_mean = pident_mean, 
 93 |                             aln_type = aln)
 94 |     mean_pidents <- rbind(mean_pidents, aln_to_add)
 95 |   }
 96 |   return(mean_pidents)
 97 | }
 98 | 
 99 | 
100 | assign_blocks <-function(mean_pidents){
101 |   block_positions <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), aln_type = character())
102 |   
103 |   mean_pidents_copy <- mean_pidents
104 |   mean_pidents_copy$block_no <- NA
105 |   block_no = 1
106 |   new_block <- "no"
107 |   
108 |   for (i in seq(1, nrow(mean_pidents_copy))){
109 |     print(i)
110 |     if(mean_pidents_copy[i, "pident_mean"] < 100){
111 |       print("less than 100%")
112 |       mean_pidents_copy[i, "block_no"] <- NA
113 |     } else if (mean_pidents_copy[i, "pident_mean"] == 100){
114 |       if(new_block == "no"){
115 |         mean_pidents_copy[i, "block_no"] <- block_no
116 |       } else if (new_block == "yes"){
117 |         if(mean_pidents_copy[i, "start_position"] < end_prev_block){
118 |           block_no <- block_no-1
119 |           mean_pidents_copy[i, "block_no"] <- block_no
120 |         } else {
121 |           mean_pidents_copy[i, "block_no"] <- block_no
122 |         }
123 |       }
124 |       if(i == nrow(mean_pidents_copy)){
125 |         print("end")
126 |       } else if (mean_pidents_copy[i+1, "pident_mean"] == 100){
127 |         block_no <- block_no
128 |         new_block <- "no"
129 |         print("same block")
130 |       } else if (mean_pidents_copy[i+1, "pident_mean"] < 100){
131 |         block_no <- block_no+1
132 |         new_block <- "yes"
133 |         print("new_block")
134 |         end_prev_block <- mean_pidents_copy[i, "end_position"]
135 |       }
136 |     }
137 |   }
138 |   return(mean_pidents_copy)
139 | }
140 | 
141 | summarise_blocks <- function(mean_pidents_copy){
142 |   #now get the start and end of each block
143 |   blocks <- unique(mean_pidents_copy$block_no)
144 |   blocks <- blocks[complete.cases(blocks)]
145 |   
146 |   block_positions <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), aln_type = character())
147 |   
148 |   for(block in blocks){
149 |     block_data <- subset(mean_pidents_copy, block_no == block)
150 |     block_start <- min(block_data$start_position)
151 |     block_end <- max(block_data$end_position)
152 |     aln_type <- unique(block_data$aln_type)
153 |     to_add <- data.frame(block_no = block, block_start = block_start, block_end = block_end, aln_type = aln_type)
154 |     block_positions <- rbind(block_positions, to_add)
155 |   }
156 |   return(block_positions)
157 | }
158 | 
159 | calculate_blocks <- function (data_dir, window_sizes, gene_no_window, drop_no, chrom = "6A"){
160 |   
161 |   overall_blocks <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), aln_type = character(), window = character())
162 |   
163 |   for(window in window_sizes){
164 |     print(window)
165 |     window_dir <- paste0(data_dir, window)
166 |     id_data_path <- paste0(window_dir, "/varieties_", chrom, "_identities_", window, "_refseq_positions.tab")
167 |     id_data <- read.table(id_data_path, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
168 |     id_data_noN <- id_data[id_data$Ns_total == 0,]
169 |     
170 |     window_blocks <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), aln_type = character(), window_size = character())
171 |     
172 |     alns <- unique(id_data_noN$aln_type)
173 |     for(aln in alns){
174 |       print(aln)
175 |       aln_data <- id_data_noN[id_data_noN$aln_type == aln,]
176 |       #block_info <- calculate_hap_blocks(aln_data = aln_data)
177 |       
178 |       aln_mean_pidents <- calculate_pid_windows(aln_data = aln_data, window_size = gene_no_window, drop_no = drop_no)
179 |       aln_mean_pidents <- aln_mean_pidents[complete.cases(aln_mean_pidents),]
180 |       if(max(aln_mean_pidents$pident_mean) < 100){
181 |         block_info <- data.frame(block_no = NA, block_start = NA, block_end = NA, aln_type = aln, window_size = window)
182 |       } else {
183 |         aln_mean_pidents_copy <- assign_blocks(mean_pidents = aln_mean_pidents)
184 |         block_info <- summarise_blocks(mean_pidents_copy = aln_mean_pidents_copy)
185 |         block_info$window_size <- window
186 |       }
187 |       window_blocks <- rbind(window_blocks, block_info)
188 |     }
189 |     overall_blocks <- rbind(overall_blocks, window_blocks)
190 |   }
191 |   outfile <- paste0(data_dir, "haplotype_blocks_aln_", gene_no_window, "_gene_window_drop", drop_no, ".tsv")
192 |   write.table(overall_blocks, file = outfile, sep = "\t", row.names = FALSE, quote = FALSE)
193 |   return(overall_blocks)
194 | }
195 | 
196 | ## End of functions
197 | 
198 | ## calculate the blocks for 1D, 2B, 3B, 4D, 6A, 7A 
199 | 
200 | raw_data_dir <- "W:/ramirezr/SM1/final_pairwise_blast/"
201 | out_dir <- "X:/brintonj/haplotype/whole_genome_blast/precision_recall/varying_window_blocks/"
202 | 
203 | pairwise_header <- c("transcript", "query", "subject", "var_query", "var_subject", "aln_type", "length", "pident", "Ns_query", "Ns_subject", "Ns_total", "Flanking")
204 | #same gtf for all (using RefSeq order for now)
205 | HC_gtf <- read.table("W://WGAv1.0//annotation//IWGSC_v1.1_HC_20170706.gff3", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
206 | LC_gtf <- read.table("W://WGAv1.0//annotation//IWGSC_v1.1_LC_20170706.gff3", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
207 | ALL_gtf <- rbind(HC_gtf, LC_gtf)
208 | 
209 | chromosomes_to_calculate <- c("1D", "2B", "3B", "4D", "6A", "7A")
210 | 
211 | for (chrom in chromosomes_to_calculate){
212 |   files_to_run <- list.files(raw_data_dir)[grep(paste0("varieties_", chrom, "_identities"), list.files(raw_data_dir))]
213 |   
214 |   for (i in seq(1, length(files_to_run))){
215 |     file <- files_to_run[i]
216 |     window <- gsub(paste0("varieties_", chrom, "_identities_"), "", file)
217 |     window <- gsub("\\.tab.gz", "", window)
218 |     window_dir <- paste0(out_dir, window, "/")
219 |     
220 |     dir.create(window_dir)
221 |    
222 |     all_comp_positions <- read_pairwise_position_no_header(blast_path_gz = paste0(raw_data_dir, file), gtf = ALL_gtf, outfile = paste0(window_dir, "/varieties_", chrom, "_identities_", window, "_refseq_positions.tab"), col_names = pairwise_header)
223 |     
224 |     #separate_output_identities(comp_data = all_comp_positions, outdir = window_dir, suffix = paste0( "_", chrom, "_identities_refseq_positions_", window))
225 |   }
226 |   
227 |   ## ok now we need to call the blocks for all the different window sizes and criteria
228 |   
229 |   window_sizes <- c("cds", "0bp", "1000bp", "2000bp", "5000bp")
230 |   
231 |   #OK now we want to calculate the blocks for all the different comparisons for the different window sizes
232 |   
233 |   gene_no_window <- 10
234 |   
235 |   window_10_blocks <- calculate_blocks(data_dir = out_dir, 
236 |                                        window_sizes = window_sizes, 
237 |                                        gene_no_window = gene_no_window, 
238 |                                        drop_no = ceiling(gene_no_window*0.1),
239 |                                        chrom = chrom)
240 |   
241 |   window_10_blocks$block_size <- window_10_blocks$block_end-window_10_blocks$block_start
242 |   window_10_blocks$chr <- paste0("chr", chrom)
243 |   window_10_blocks$gene_block <- gene_no_window
244 |   
245 |   gene_no_window <- 15
246 |   
247 |   window_15_blocks <- calculate_blocks(data_dir = out_dir, 
248 |                                        window_sizes = window_sizes, 
249 |                                        gene_no_window = gene_no_window, 
250 |                                        drop_no = ceiling(gene_no_window*0.1),
251 |                                        chrom = chrom)
252 |   
253 |   window_15_blocks$block_size <- window_15_blocks$block_end-window_15_blocks$block_start
254 |   window_15_blocks$chr <- paste0("chr", chrom)
255 |   window_15_blocks$gene_block <- gene_no_window
256 |   
257 |   gene_no_window <- 25
258 |   
259 |   window_25_blocks <- calculate_blocks(data_dir = out_dir, 
260 |                                        window_sizes = window_sizes, 
261 |                                        gene_no_window = gene_no_window, 
262 |                                        drop_no = ceiling(gene_no_window*0.1),
263 |                                        chrom = chrom)
264 |   
265 |   window_25_blocks$block_size <- window_25_blocks$block_end-window_25_blocks$block_start
266 |   window_25_blocks$chr <- paste0("chr", chrom)
267 |   window_25_blocks$gene_block <- gene_no_window
268 |   
269 |   gene_no_window <- 30
270 |   
271 |   window_30_blocks <- calculate_blocks(data_dir = out_dir, 
272 |                                        window_sizes = window_sizes, 
273 |                                        gene_no_window = gene_no_window, 
274 |                                        drop_no = ceiling(gene_no_window*0.1),
275 |                                        chrom = chrom)
276 |   
277 |   window_30_blocks$block_size <- window_30_blocks$block_end-window_30_blocks$block_start
278 |   window_30_blocks$chr <- paste0("chr", chrom)
279 |   window_30_blocks$gene_block <- gene_no_window
280 |   
281 |   gene_no_window <- 20
282 |   
283 |   window_20_blocks <- calculate_blocks(data_dir = out_dir, 
284 |                                        window_sizes = window_sizes, 
285 |                                        gene_no_window = gene_no_window, 
286 |                                        drop_no = ceiling(gene_no_window*0.1),
287 |                                        chrom = chrom)
288 |   
289 |   window_20_blocks$block_size <- window_20_blocks$block_end-window_20_blocks$block_start
290 |   window_20_blocks$chr <- paste0("chr", chrom)
291 |   window_20_blocks$gene_block <- gene_no_window
292 |   
293 |   all_window_blocks <- rbind(window_10_blocks, window_15_blocks, window_20_blocks, window_25_blocks, window_30_blocks)
294 |   
295 |   write.table(all_window_blocks, file = paste0(out_dir, "combined_", chrom, "_all_flanking_all_windown_sizes_BLAST_blocks.tsv"),
296 |               sep = "\t", 
297 |               col.names = TRUE, 
298 |               row.names = FALSE, 
299 |               quote = FALSE)
300 | }
301 | 
302 | #####
303 | 


--------------------------------------------------------------------------------
/haplotype-block-assignment/assign_BLAST_blocks_whole_genome.r:
--------------------------------------------------------------------------------
  1 | ## Script written by Jemima Brinton 2019
  2 | ## Aim - define haplotype blocks across whole wheat genome based on pairwise BLAST alignments 
  3 | 
  4 | 
  5 | ##DEFINE FUNCTIONS
  6 | library(stringr)
  7 | library(ggplot2)
  8 | library("ggdendro")
  9 | library(plyr)
 10 | library(reshape2)
 11 | library("grid")
 12 | 
 13 | #read in BLAST alignments and get positions 
 14 | read_pairwise_position <- function(blast_path_gz, gtf, outfile, write_table = TRUE){
 15 |   all_comp <- read.table(gzfile(blast_path_gz), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
 16 |   
 17 |   head(all_comp)
 18 |   
 19 |   #get the refseq position of the genes
 20 |   gtf$transcript_id <- str_split_fixed(gtf$V9, ";", 2)[,1]
 21 |   gtf$transcript_id <- gsub("transcript_id ", "", gtf$transcript_id)
 22 |   
 23 |   gene_only <- gtf[gtf$V3 == "gene",]
 24 |   
 25 |   gene_positions <- gene_only[,c("transcript_id", "V1", "V4",  "V5", "V7")]
 26 |   colnames(gene_positions) <- c("transcript", "chr", "start", "end", "strand")
 27 |   gene_positions$transcript <- gsub("ID=", "", gene_positions$transcript)
 28 |   
 29 |   #now add positions to the pairwise comparison file
 30 |   
 31 |   all_comp_positions <- merge(all_comp, gene_positions, all.x = TRUE, all.y = FALSE)
 32 |   head(all_comp_positions)
 33 |   
 34 |   if(write_table == TRUE){
 35 |   write.table(all_comp_positions, file = outfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
 36 |   }
 37 |   return(all_comp_positions)
 38 | }
 39 | 
 40 | #cap percentage identity at a certain value
 41 | cap_data <- function(all_comp_positions, cap_value){
 42 |   all_comp_positions$capped_ident <- all_comp_positions$pident
 43 |   all_comp_positions[all_comp_positions$capped_ident < cap_value, "capped_ident"] <- 0
 44 |   return(all_comp_positions)
 45 | }
 46 | 
 47 | #filter for the number of missing alignments
 48 | filter_aln_no <- function(comp_data, missing_limit){
 49 |   #total_number of alignments
 50 |   total_aln <- length(unique(comp_data$aln_type))
 51 |   aln_limit <- total_aln - missing_limit
 52 |   
 53 |   transcript_counts<- data.frame(table(comp_data$transcript))
 54 |   transcripts_to_keep <- transcript_counts[transcript_counts$Freq >= aln_limit, "Var1"]
 55 |   
 56 |   comp_data_filt <- subset(comp_data, transcript %in% transcripts_to_keep)
 57 |   
 58 |   nrow(comp_data)
 59 |   nrow(comp_data_filt)
 60 |   length(unique(comp_data$transcript))
 61 |   length(unique(comp_data_filt$transcript))
 62 |   
 63 |   return(comp_data_filt)
 64 | }
 65 | 
 66 | 
 67 | #this zooms in on an region and plots the 100% identity and orders alignments via heirachical clustering
 68 | plot_zoom_100_clust <- function(data, start, end, outfile){
 69 |   data <- data[order(data$start),]
 70 |   data_zoom <- data[(data$end > start) & (data$start < end),]
 71 |   #print(table(data_zoom$aln_type))
 72 |   zoom_mat <- dcast(data_zoom[,c("aln_type", "transcript", "capped_ident")], aln_type ~ transcript)
 73 |   zoom.matrix <- as.matrix(zoom_mat[,c(2:(ncol(zoom_mat)-1))])
 74 |   dim(zoom.matrix)
 75 |   rownames(zoom.matrix) <- zoom_mat$aln_type
 76 |   zoom.dendro <- as.dendrogram(hclust(d=dist(x = zoom.matrix)))
 77 |   dendro.plot <- ggdendrogram(data = zoom.dendro, rotate = TRUE)
 78 |   zoom.order <- order.dendrogram(zoom.dendro)
 79 |   data_zoom$aln_type <- factor(x = data_zoom$aln_type, levels = zoom_mat$aln_type[zoom.order], ordered = TRUE)
 80 |   plot <- ggplot(data_zoom , aes(x=factor(transcript),y=aln_type)) +
 81 |     geom_tile(aes(fill = data_zoom$capped_ident)) +
 82 |     scale_fill_distiller(limits=c(0,100), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "% ID") +
 83 |     theme(axis.text.x=element_text(angle=90, hjust=1)) 
 84 |   ggsave(plot = plot, file =outfile, dpi = 600, height = 8, width = 18)
 85 |   return(plot)
 86 | }
 87 | 
 88 | #calculate sliding windows of percentage identity based on a certain number of consective transcripts (window_size) and dropping a specific numer with the lowest identity before calculating the mean % id
 89 | calculate_pid_windows <- function(aln_data, window_size = 20, drop_no = 2){
 90 |   aln_data <- aln_data[order(aln_data$start),]
 91 |   aln <- unique(aln_data$aln_type)
 92 |   mean_pidents <- data.frame(start = numeric(), 
 93 |                              end = numeric(), 
 94 |                              start_position = numeric(), 
 95 |                              end_position = numeric(), 
 96 |                              pident_mean = numeric(), 
 97 |                              aln_type = character(),
 98 |                              start_transcript = character(),
 99 |                              end_transcript = character()) 
100 |   
101 |   for(j in seq(1, nrow(aln_data)-window_size)){
102 |     window_start = j
103 |     start_position = aln_data[window_start, "start"]
104 |     start_transcript = aln_data[window_start, "transcript"]
105 |     window_end = (window_start + window_size) - 1
106 |     end_position = aln_data[window_end, "end"]
107 |     end_transcript = aln_data[window_end, "transcript"]
108 |     pidents <- aln_data[c(window_start:window_end), "pident"]
109 |     pidents <- pidents[order(pidents, decreasing = TRUE)]
110 |     pidents_sub <- pidents[c(1:(length(pidents)-drop_no))]
111 |     pident_mean <- mean(pidents_sub)
112 |     aln_to_add = data.frame(start = window_start, 
113 |                             end = window_end, 
114 |                             start_position = start_position, 
115 |                             end_position = end_position, 
116 |                             pident_mean = pident_mean, 
117 |                             aln_type = aln, 
118 |                             start_transcript = start_transcript,
119 |                             end_transcript = end_transcript)
120 |     mean_pidents <- rbind(mean_pidents, aln_to_add)
121 |   }
122 |   return(mean_pidents)
123 | }
124 | 
125 | #assign haplotype blocks based on the sliding window mean of percentage identity
126 | assign_blocks <-function(mean_pidents){
127 |   mean_pidents_copy <- mean_pidents
128 |   mean_pidents_copy$block_no <- NA
129 |   block_no = 1
130 |   new_block <- "no"
131 |   
132 |   for (i in seq(1, nrow(mean_pidents_copy))){
133 |     #print(i)
134 |     if(mean_pidents_copy[i, "pident_mean"] < 100){
135 |       print("less than 100%")
136 |       mean_pidents_copy[i, "block_no"] <- NA
137 |     } else if (mean_pidents_copy[i, "pident_mean"] == 100){
138 |       if(new_block == "no"){
139 |         mean_pidents_copy[i, "block_no"] <- block_no
140 |       } else if (new_block == "yes"){
141 |         if(mean_pidents_copy[i, "start_position"] < end_prev_block){
142 |           block_no <- block_no-1
143 |           mean_pidents_copy[i, "block_no"] <- block_no
144 |         } else {
145 |           mean_pidents_copy[i, "block_no"] <- block_no
146 |         }
147 |       }
148 |       if(i == nrow(mean_pidents_copy)){
149 |         print("end")
150 |       } else if (mean_pidents_copy[i+1, "pident_mean"] == 100){
151 |         block_no <- block_no
152 |         new_block <- "no"
153 |         print("same block")
154 |       } else if (mean_pidents_copy[i+1, "pident_mean"] < 100){
155 |         block_no <- block_no+1
156 |         new_block <- "yes"
157 |         print("new_block")
158 |         end_prev_block <- mean_pidents_copy[i, "end_position"]
159 |       }
160 |     }
161 |   }
162 |   return(mean_pidents_copy)
163 | }
164 | 
165 | #summarise adjacted haplotype blocks calculated from sliding window mean
166 | summarise_blocks <- function(mean_pidents_copy){
167 |   #now get the start and end of each block
168 |   blocks <- unique(mean_pidents_copy$block_no)
169 |   blocks <- blocks[complete.cases(blocks)]
170 |   
171 |   block_positions <- data.frame(block_no = numeric(), 
172 |                                 block_start = numeric(), 
173 |                                 block_end = numeric(), 
174 |                                 aln_type = character(),
175 |                                 start_transcript = character(),
176 |                                 end_transcript = character())
177 |   
178 |   for(block in blocks){
179 |     block_data <- subset(mean_pidents_copy, block_no == block)
180 |     block_start <- min(block_data$start_position)
181 |     start_transcript <- unique(block_data[block_data$start_position == block_start, "start_transcript"])
182 |     block_end <- max(block_data$end_position)
183 |     end_transcript <- unique(block_data[block_data$end_position == block_end, "end_transcript"])
184 |     aln_type <- unique(block_data$aln_type)
185 |     to_add <- data.frame(block_no = block, 
186 |                          block_start = block_start, 
187 |                          block_end = block_end, 
188 |                          aln_type = aln_type,
189 |                          start_transcript = start_transcript,
190 |                          end_transcript = end_transcript)
191 |     
192 |     block_positions <- rbind(block_positions, to_add)
193 |   }
194 |   return(block_positions)
195 | }
196 | 
197 | calculate_blocks_single_window <- function (data, gene_no_window, drop_no, window, out_dir, chrom){
198 |   
199 |   overall_blocks <- data.frame(block_no = numeric(), 
200 |                                block_start = numeric(), 
201 |                                block_end = numeric(), 
202 |                                aln_type = character(),
203 |                                start_transcript = character(),
204 |                                end_transcript = character(),
205 |                                window = character())
206 |   
207 |   alns <- unique(data$aln_type)
208 |     for(aln in alns){
209 |       print(aln)
210 |       aln_data <- data[data$aln_type == aln,]
211 |       #block_info <- calculate_hap_blocks(aln_data = aln_data)
212 |       
213 |       aln_mean_pidents <- calculate_pid_windows(aln_data = aln_data, window_size = gene_no_window, drop_no = drop_no)
214 |       aln_mean_pidents <- aln_mean_pidents[complete.cases(aln_mean_pidents),]
215 |       if(max(aln_mean_pidents$pident_mean) < 100){
216 |         block_info <- data.frame(block_no = NA, 
217 |                                  block_start = NA, 
218 |                                  block_end = NA, 
219 |                                  aln_type = aln, 
220 |                                  start_transcript = NA,
221 |                                  end_transcript = NA,
222 |                                  window = window)
223 |       } else {
224 |         aln_mean_pidents_copy <- assign_blocks(mean_pidents = aln_mean_pidents)
225 |         block_info <- summarise_blocks(mean_pidents_copy = aln_mean_pidents_copy)
226 |         block_info$window <- window
227 |       }
228 |     overall_blocks <- rbind(overall_blocks, block_info)
229 |     }
230 |   
231 |   #outfile <- paste0(out_dir, "haplotype_blocks_aln_", gene_no_window, "_gene_window_drop", drop_no, ".tsv")
232 |   #write.table(overall_blocks, file = outfile, sep = "\t", row.names = FALSE, quote = FALSE)
233 |   return(overall_blocks)
234 | }
235 | 
236 | ## END OF FUNCTIONS
237 | 
238 | raw_data_path <- "W:/ramirezr/SM1/pairwise_blast_nov_2019/varieties_all_identities_2000bp.tab.gz"
239 | out_dir <- "X:/brintonj/haplotype/whole_genome_blast/"
240 | plot_dir <- paste0(out_dir, "plots/")
241 | blocks_dir <- paste0(out_dir, "blocks/")
242 | 
243 | dir.create(plot_dir)
244 | dir.create(blocks_dir)
245 | 
246 | #same gtf for all (using RefSeq order for now)
247 | HC_gtf <- read.table("W://WGAv1.0//annotation//IWGSC_v1.1_HC_20170706.gff3", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
248 | LC_gtf <- read.table("W://WGAv1.0//annotation//IWGSC_v1.1_LC_20170706.gff3", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
249 | ALL_gtf <- rbind(HC_gtf, LC_gtf)
250 | 
251 | raw_data <- read_pairwise_position(blast_path_gz = raw_data_path, 
252 |                                    gtf = ALL_gtf, 
253 |                                    write_table = FALSE)
254 | 
255 | #make a capped column in the all positions data_frame
256 | all_comp_positions <- cap_data(all_comp_positions = raw_data, cap_value = 100)
257 | 
258 | #filter for Ns
259 | all_comp_positions_noN <- all_comp_positions[all_comp_positions$Ns_total == 0,]
260 | 
261 | #plot how many Ns we lose for each chromosome
262 | gene_counts_all <- data.frame(table(all_comp_positions[,c("aln_type", "chr")]))
263 | gene_counts_all$filter <- "all"
264 | gene_counts_noN <- data.frame(table(all_comp_positions_noN[,c("aln_type", "chr")]))
265 | gene_counts_noN$filter <- "noN"
266 | 
267 | gene_counts_filter <- rbind(gene_counts_all, gene_counts_noN)
268 | 
269 | ggplot(gene_counts_filter, aes(x = chr, y = Freq, fill = filter)) +
270 |   geom_boxplot() +
271 |   theme(axis.text.x = element_text(angle = 90)) +
272 |   ylab("number of genes")
273 | 
274 | ggsave(file = paste0(out_dir, "gene_counts_pairwise_blast_whole_genome_all_v_noN.png"), height = 5, width = 6, dpi = 300)
275 | 
276 | chromosomes <- unique(all_comp_positions_noN$chr)
277 | #get rid of chrUn - we need physical order
278 | chromosomes <- chromosomes[!(chromosomes == "chrUn")]
279 | 
280 | window <- unique(all_comp_positions_noN$Flanking)
281 | 
282 | #use a sliding window size of 25 consecutive transcripts
283 | gene_no_window <- 25
284 | 
285 | for (chr in chromosomes){
286 |   chr_data <- all_comp_positions_noN[all_comp_positions_noN$chr == chr,]
287 |   chrom_plot_dir <- paste0(plot_dir, chr, "/")
288 |   dir.create(chrom_plot_dir, recursive = TRUE)
289 |   
290 |   chrom_blocks_dir <- paste0(blocks_dir, chr, "/")
291 |   dir.create(chrom_blocks_dir, recursive = TRUE)
292 |   
293 |   varieties <- unique(chr_data$var_query)
294 | 
295 |     for (i in seq(1, length(varieties))){
296 |       variety <- varieties[i]
297 |     
298 |       variety_data <- chr_data[grep(variety, chr_data$aln_type),]
299 |     
300 |       #now filter for those in all comparisons
301 |       variety_data_all <- filter_aln_no(comp_data = variety_data, missing_limit = 0)
302 |     
303 |       #exclude those transcripts missing more than two comparisons
304 |       variety_data_2 <- filter_aln_no(comp_data = variety_data, missing_limit = 2)
305 |     
306 |       #plot heatmap with all data
307 |       all_plot <- plot_zoom_100_clust(variety_data, 
308 |                                     start = 0, 
309 |                                     end = ceiling(max(variety_data$end)), 
310 |                                     outfile = paste(chrom_plot_dir, variety, "_", window, "_noN_100cap_clustered.png", sep = ""))
311 |     
312 |       #plot heatmap with no missing data
313 |       all_plot <- plot_zoom_100_clust(variety_data_all, 
314 |                                     start = 0, 
315 |                                     end = ceiling(max(variety_data$end)), 
316 |                                     outfile = paste(chrom_plot_dir, variety, "_", window, "_0missing_noN_100cap_clustered.png", sep = ""))
317 |     
318 |     #plot heatmap with up to 2 missing data points
319 |     all_plot <- plot_zoom_100_clust(variety_data_2, 
320 |                                     start = 0, 
321 |                                     end = ceiling(max(variety_data$end)), 
322 |                                     outfile = paste(chrom_plot_dir, variety, "_", window, "_max2missing_noN_100cap_clustered.png", sep = ""))
323 |     }
324 | 
325 |   #now do the block calculation
326 |   chrom_blocks <- calculate_blocks_single_window(data = chr_data,
327 |                                                  gene_no_window = gene_no_window, 
328 |                                                  drop_no = ceiling(gene_no_window*0.1), 
329 |                                                  window = window, 
330 |                                                  out_dir = chrom_blocks_dir, 
331 |                                                  chrom = chr)
332 |   
333 |   write.table(chrom_blocks, file = paste0(chrom_blocks_dir, "haplotype_blocks_BLAST_", gene_no_window, "_gene_window_", window, "bp.txt"),
334 |               sep = "\t", 
335 |               col.names = TRUE, row.names = FALSE, quote = FALSE)
336 | 
337 |   
338 | }
339 | 


--------------------------------------------------------------------------------
/haplotype-block-assignment/assign_mummer_blocks_whole_genome.r:
--------------------------------------------------------------------------------
  1 | ## Script written by Jemima Brinton 2019
  2 | ## Aim - define haplotype blocks across whole wheat genome based on pairwise nucmer alignments between genome assemblies
  3 | 
  4 | ###FUNCTIONS
  5 | library(dplyr)
  6 | library(magrittr)
  7 | library(GenomicRanges)
  8 | library(ggplot2)
  9 | library(tidyr)
 10 | library(viridis)
 11 | library(stringr)
 12 | 
 13 | 
 14 | #Read Delta functions and plotting functions (adapted) used are from https://jmonlong.github.io/Hippocamplus/2017/09/19/mummerplots-with-ggplot2/
 15 | #Reads in delta file output from nucmer into R and summarises into dataframe
 16 | readDelta <- function(deltafile){
 17 |   lines = scan(deltafile, 'a', sep='\n', quiet=TRUE)
 18 |   lines = lines[-1]
 19 |   lines.l = strsplit(lines, ' ')
 20 |   lines.len = lapply(lines.l, length) %>% as.numeric
 21 |   lines.l = lines.l[lines.len != 1]
 22 |   lines.len = lines.len[lines.len != 1]
 23 |   head.pos = which(lines.len == 4)
 24 |   head.id = rep(head.pos, c(head.pos[-1], length(lines.l)+1)-head.pos)
 25 |   mat = matrix(as.numeric(unlist(lines.l[lines.len==7])), 7)
 26 |   res = as.data.frame(t(mat[1:5,]))
 27 |   colnames(res) = c('rs','re','qs','qe','error')
 28 |   res$qid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 2))
 29 |   res$rid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 1)) %>% gsub('^>', '', .)
 30 |   res$strand = ifelse(res$qe-res$qs > 0, '+', '-')
 31 |   res
 32 | }
 33 | 
 34 | #calculates percentage identity and mid point for each mummer alignment
 35 | calculate_perc_id_mid_points <- function(data){
 36 |   data$r_length <- (data$re - data$rs)
 37 |   data$perc_id <- ((data$r_length - data$error)/data$r_length)*100
 38 |   data$perc_id_factor <- data$perc_id
 39 |   data[data$perc_id < 100, "perc_id_factor"] <- "<100"
 40 |   data$perc_id_factor <- as.factor(data$perc_id_factor)
 41 |   data$r_mid <- (data$rs + data$re)/2
 42 |   data$q_mid <- (data$qs + data$qe)/2
 43 |   return(data)
 44 | }
 45 | 
 46 | #reads in delta file, calculates percentage ID and filters for a minimum alignment size
 47 | pre_plot_analysis <- function(delta_path, min_size = 20000){
 48 |   data = readDelta(delta_path)
 49 |   data <- calculate_perc_id_mid_points(data)
 50 |   data_filt <- data[data$r_length >= min_size,]
 51 |   return(data_filt)
 52 | }
 53 | 
 54 | # Plots diagonal scatter plot of a nucmer pairwise alignment with points coloured by percentage identity of alignment - x axis is reference, y axis is query
 55 | plot_by_perc_id_cap <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){
 56 |   ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=rs, xend=re, y=qs, yend=qe, colour=perc_id, shape = strand)) + geom_segment() +
 57 |     geom_point(alpha=.5) +
 58 |     theme_bw() + 
 59 |     theme(strip.text.y=element_text(angle=180, size=5),
 60 |           strip.background=element_blank()) +
 61 |     xlab(unique(data$rid)) + ylab(unique(data$qid)) +
 62 |     scale_colour_viridis(limits = c(cap_lower, cap_upper))
 63 | }
 64 | 
 65 | #scatter plot against reference position, y axis is percentage identity colour is size of alignment
 66 | plot_perc_id_v_ref <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100){
 67 |   ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id, colour=r_length)) +
 68 |     theme_bw() + xlab(data$rid) + ylab(paste0('percentage ID v ', data$qid)) +
 69 |     geom_point(alpha=.5) +
 70 |     ylim(ymin, ymax) +
 71 |     scale_colour_viridis()
 72 | }
 73 | 
 74 | #bins alignment dataframe by reference position
 75 | bin_data <- function(data, bin_size, max_chrom_size){
 76 |   bins <- seq(0, max_chrom_size, by = bin_size)
 77 |   data$bin <- NA
 78 |   for (i in bins){
 79 |     data$bin <- ifelse(((data$r_mid > (i-bin_size)) & (data$r_mid < (i-1))), i, data$bin)
 80 |   }
 81 |   return(data)
 82 | }
 83 | 
 84 | #plots median percentage identity of each bin against reference position, colour of point is based on if the median percentage identity passes the identity threshold
 85 | plot_medians_line_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){
 86 |   comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
 87 |   comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
 88 |   colnames(comparison_medians)[2] <- "perc_id_median"
 89 |   comparison_medians$cut_off <- NA
 90 |   comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
 91 |   comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
 92 |   
 93 |   ggplot(comparison_medians, aes(x=bin, y = perc_id_median, colour = cut_off)) +
 94 |     geom_line(colour = "grey") + 
 95 |     geom_point(size = 1)  + 
 96 |     ylim(ymin,ymax) +
 97 |     scale_colour_manual(values = c("#73D055FF", "#440154FF")) +
 98 |     labs(colour = "% id") +
 99 |     #scale_colour_viridis() +
100 |     xlab(data$rid) +
101 |     
102 |     ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off))
103 | }
104 | 
105 | #plots boxplots of percentage identity of each bin against reference position, colour of boxplot is based on if the median percentage identity passes the identity threshold
106 | plot_boxplots_median_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){
107 |   comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
108 |   comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
109 |   colnames(comparison_medians)[2] <- "perc_id_median"
110 |   comparison_medians$cut_off <- NA
111 |   comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
112 |   comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
113 |   
114 |   comparison_to_plot <- merge(comparison_filt_bin, comparison_medians)
115 |   
116 |   ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) +
117 |     geom_boxplot(outlier.shape = NA) +
118 |     ylim(ymin,ymax) +
119 |     scale_fill_manual(values = c("#73D055FF", "#440154FF")) +
120 |     labs(fill = "% id") +
121 |     #scale_colour_viridis() +
122 |     xlab(data$rid) +
123 |     ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off))
124 | }
125 | 
126 | #assigns haplotype blocks based on bins of alignments exceeding the 99.99 % id threshold
127 | assign_blocks_mummer <-function(median_cutoffs){
128 |   median_cutoffs_copy <- median_cutoffs
129 |   median_cutoffs_copy$block_no <- NA
130 |   block_no = 1
131 |   
132 |   for (i in seq(1, nrow(median_cutoffs_copy))){
133 |     print(i)
134 |     if(median_cutoffs_copy[i, "perc_id_median"] < 99.99){
135 |       median_cutoffs_copy[i, "block_no"] <- NA
136 |     } else if (median_cutoffs_copy[i, "perc_id_median"] >= 99.99){
137 |       median_cutoffs_copy[i, "block_no"] <- block_no
138 |       if (i > (nrow(median_cutoffs_copy)-3)){
139 |         print("coming to end")
140 |       } else if ((median_cutoffs_copy[i+1, "perc_id_median"] < 99.99) & (median_cutoffs_copy[i+2, "perc_id_median"] < 99.99) & (median_cutoffs_copy[i+3, "perc_id_median"] < 99.99)){
141 |         block_no <- block_no + 1
142 |       }
143 |     }
144 |   }
145 |   return(median_cutoffs_copy)
146 | }
147 | 
148 | #summarises adjacent bins into haplotype blocks
149 | summarise_blocks <- function(median_cutoffs_copy, bin_size){
150 |   #now get the start and end of each block
151 |   blocks <- unique(median_cutoffs_copy$block_no)
152 |   blocks <- blocks[complete.cases(blocks)]
153 |   
154 |   block_positions <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), ref = character(), query = character())
155 |   
156 |   for(block in blocks){
157 |     block_data <- subset(median_cutoffs_copy, block_no == block)
158 |     block_start <- min(block_data$bin) - bin_size
159 |     block_end <- max(block_data$bin)
160 |     ref <- unique(block_data$ref)
161 |     query <- unique(block_data$query)
162 |     to_add <- data.frame(block_no = block, block_start = block_start, block_end = block_end, ref = ref, query = query)
163 |     block_positions <- rbind(block_positions, to_add)
164 |   }
165 |   return(block_positions)
166 | }
167 | 
168 | #checks if haplotype blocks are also called in the reciprocal mummer alignment
169 | check_reciprocal <- function(data, all_ref_query_coords, chrom){
170 |   ref <- data["ref"]
171 |   query <- data["query"]
172 |   
173 |   ref_vars <- as.character(unique(all_ref_query_coords$ref))
174 |   
175 |   if(query %in% ref_vars){
176 |     recip_table <- all_ref_query_coords[(all_ref_query_coords$ref == query) & (all_ref_query_coords$query == ref), ]
177 |     if (nrow(recip_table) == 0){
178 |       in_reciprocal <- "N"
179 |     } else {
180 |       
181 |       ref_gdf <- data.frame(chr=character(), start=numeric(), end=numeric())
182 |       to_add <- c(data["chrom"], data["block_start"], data["block_end"])
183 |       ref_gdf <- rbind(ref_gdf, to_add)
184 |       colnames(ref_gdf) <- c("chr", "start", "end")
185 |       
186 |       
187 |       query_gdf <- recip_table[, c("chrom", "block_start", "block_end")]
188 |       colnames(query_gdf) <- c("chr", "start", "end")
189 |       
190 |       # make genomic ranges objects for each
191 |       ref_gr <- makeGRangesFromDataFrame(ref_gdf)
192 |       query_gr <- makeGRangesFromDataFrame(query_gdf)
193 |       
194 |       overlaps <- findOverlaps(ref_gr, query_gr)
195 |       #precision first - how many BLAST blocks found in the  mummer haplotype?
196 |       count_overlaps <- length(unique(subjectHits(overlaps)))
197 |       if (count_overlaps > 0){
198 |         in_reciprocal <- "Y"
199 |         
200 |       } else {
201 |         in_reciprocal <- "N"
202 |       }
203 |     }
204 |   } else {
205 |     in_reciprocal <- NA
206 |   }
207 |   return(in_reciprocal)
208 | }
209 | 
210 | ##END OF FUNCTIONS
211 | 
212 | ## plots the median cut offs and call the blocks for each chromosome
213 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/"
214 | 
215 | plot_base <- paste0(base_dir, "plots")
216 | blocks_base <- paste0(base_dir, "blocks")
217 | 
218 | dir.create(plot_base)
219 | dir.create(blocks_base)
220 | 
221 | #read in the fasta indexes so we can set the max chromosome length
222 | chrom_lengths <- read.table("W:/assemblies/releasePGSBv2.0/genome/combined_fai.txt", sep = "\t", header = FALSE, stringsAsFactors = FALSE)
223 | #remove spelta and synthetic from this table as we are not including them
224 | chrom_lengths <- chrom_lengths[!grepl("ash", chrom_lengths[,1]),]
225 | chrom_lengths <- chrom_lengths[!grepl("tsp", chrom_lengths[,1]),]
226 | 
227 | #get a list of all the varieties to plot as references (chromosome level as references only)
228 | varieties_to_plot <- list.files(paste0(base_dir, "aln"))
229 | chromosomes_to_plot <- list.files(paste0(base_dir, "aln/", varieties_to_plot[1]))
230 | 
231 | #we will only use the 20kb filter here (this is already filtered using mummer)
232 | min_size <- 20000
233 | cut_off <- 99.99
234 | bin_size <- 5000000
235 | 
236 | for (i in seq(1, length(chromosomes_to_plot))){
237 |   chrom <- paste0("chr", chromosomes_to_plot[i])
238 |   print(chrom)
239 |   max_chrom_size <- max(chrom_lengths[grepl(chrom, chrom_lengths[,1]),2])
240 |   
241 |   blocks_chrom_dir <- paste0(blocks_base, "/", chromosomes_to_plot[i])
242 |   plots_chrom_dir <- paste0(plot_base, "/", chromosomes_to_plot[i])
243 |   
244 |   dir.create(blocks_chrom_dir)
245 |   dir.create(plots_chrom_dir)
246 |   
247 |   chrom_blocks_coords <- data.frame(block_no = numeric(), 
248 |                                     block_start = numeric(), 
249 |                                     block_end = numeric(), 
250 |                                     ref = character(), 
251 |                                     query = character(),
252 |                                     chrom = character(),
253 |                                     ref_chrom = character())
254 | 
255 |   
256 |   for (variety in varieties_to_plot){
257 |     print(variety)
258 |     data_dir <- paste0(base_dir, "aln/", variety, "/", chromosomes_to_plot[i], "/")
259 |     plot_dir <- paste0(plots_chrom_dir, "/", variety, "/")
260 |     dir.create(plot_dir)
261 |     
262 |     all_files <- list.files(data_dir)
263 |     #This gets just the filtered deltas 
264 |     filtered_delta <- all_files[grep("_L20Kb_rq.delta", all_files)]
265 |     
266 |     for (comparison in filtered_delta){
267 |       comparison_delta_path <- paste0(data_dir, "/", comparison)
268 |       comparison_id <- str_split_fixed(comparison, "\\.", 3)[1]
269 |       
270 |       ref <- variety
271 |       if (ref == "sy_mattis"){
272 |         query <- str_split_fixed(comparison_id, "_", 4)[4]
273 |       } else {
274 |         query <- str_split_fixed(comparison_id, "_", 3)[3]
275 |       }
276 |       
277 |       if (query == "sy"){
278 |         query <- "sy_mattis"
279 |       }
280 |     
281 |       if(file.info(comparison_delta_path)$size == 0){
282 |         print("file empty")
283 |       } else {
284 |         #read and filter for the minimum size
285 |         comparison_filt <- pre_plot_analysis(delta_path = comparison_delta_path, min_size = min_size)
286 |         print("data read in and filtered")
287 |         
288 |         #plot the diagonal dot plots
289 |         diagonal_dot_plot <- plot_by_perc_id_cap(comparison_filt, cap_lower = 97, cap_upper = 100)
290 |         ggsave(diagonal_dot_plot, file = paste0(plot_dir, comparison_id, ".", chrom, ".diagonal_dot.min", min_size, ".png"), dpi = 300, height = 5, width = 9)
291 |         
292 |         #plot the percentage dotplot capped at 97%
293 |         perc_dot_plot_97 <- plot_perc_id_v_ref(comparison_filt, ymin = 97)
294 |         ggsave(perc_dot_plot_97, file = paste0(plot_dir, comparison_id, ".", chrom, ".percentage_dot.min", min_size, "_cap97.png"), dpi = 300, height = 5, width = 9)
295 |        
296 |         #now plot the median cut offs
297 |         perc_line_5M <- plot_medians_line_colour_cut_off(comparison_filt, bin_size = bin_size, max_chrom_size = max_chrom_size)
298 |         ggsave(perc_line_5M, file = paste0(plot_dir, comparison_id, ".", chrom, ".percentage_line.min", min_size, "_5Mb_bin_99.99median.png"), dpi = 300, height = 5, width = 9)
299 |         
300 |         perc_boxplot_5M <- plot_boxplots_median_colour_cut_off(comparison_filt, bin_size = bin_size, max_chrom_size = max_chrom_size)
301 |         ggsave(perc_boxplot_5M, file = paste0(plot_dir, comparison_id, ".", chrom, ".percentage_boxplot.min", min_size, "_5Mb_bin_99.99median.png"), dpi = 300, height = 5, width = 9)
302 |         
303 |         #also output table with the medians of the bins
304 |         
305 |         comparison_filt_bin <- bin_data(comparison_filt, bin_size = bin_size, max_chrom_size = max_chrom_size)
306 |         comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
307 |         colnames(comparison_medians)[2] <- "perc_id_median"
308 |         comparison_medians$cut_off <- NA
309 |         comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
310 |         comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
311 |         
312 |         comparison_medians$ref <- ref
313 |         comparison_medians$query <- query
314 |         
315 |         write.table(comparison_medians, file = paste0(blocks_chrom_dir, "/", comparison_id, ".", chrom, ".percentage_medians.min", min_size, "_5Mb_bin.txt"), col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
316 |         
317 |         #now assign the blocks
318 |         ref_query_blocks <- assign_blocks_mummer(comparison_medians)
319 |         ref_query_coords <- summarise_blocks(ref_query_blocks, bin_size = bin_size)
320 |         if (nrow(ref_query_coords) > 0){
321 |           ref_query_coords$chrom <- chrom
322 |           ref_query_coords$ref_chrom <- unique(comparison_filt$rid)
323 |         
324 |           chrom_blocks_coords <- rbind(chrom_blocks_coords, ref_query_coords)
325 |         }
326 |       }
327 |     }
328 |   }
329 |   #now check if blocks are in reciprocals
330 |   chrom_blocks_coords$in_reciprocal <- apply(chrom_blocks_coords, 1, check_reciprocal, all_ref_query_coords = chrom_blocks_coords)
331 |   
332 |   write.table(chrom_blocks_coords, 
333 |               file = paste0(blocks_chrom_dir, "/mummer_blocks_", chrom, ".min", min_size, ".", bin_size, "_bins.txt"), 
334 |               sep = "\t",
335 |               col.names = TRUE, 
336 |               row.names = FALSE, 
337 |               quote = FALSE)
338 | }
339 | 
340 | 
341 | 
342 | 


--------------------------------------------------------------------------------
/sequence_complexity/calculate_seq_complexity_stats.r:
--------------------------------------------------------------------------------
  1 | library(dplyr)
  2 | library(ggplot2)
  3 | ##calculate for 6A
  4 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/"
  5 | 
  6 | seq_files <- list.files(base_dir, pattern = "tsv")
  7 | 
  8 | length(seq_files)
  9 | sample_no <- 10000
 10 | hap_blocks <- data.frame(matrix( nrow = length(seq_files)*sample_no, ncol = 6))
 11 | colnames(hap_blocks) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX")
 12 | not_block <- data.frame(matrix(nrow = length(seq_files)*sample_no, ncol = 6))
 13 | colnames(not_block) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX")
 14 | 
 15 | stats <- data.frame(matrix(nrow = length(seq_files), ncol = 13))
 16 | colnames(stats) <- c("FILE", 
 17 |                      "N_HAP", 
 18 |                      "N_NOT", 
 19 |                      "MEDIAN_HAP", 
 20 |                      "MEDIAN_NOT", 
 21 |                      "MEAN_HAP", 
 22 |                      "MEAN_NOT", 
 23 |                      "P_VALUE", 
 24 |                      "MEDIAN_HAP_SUB", 
 25 |                      "MEDIAN_NOT_SUB", 
 26 |                      "MEAN_HAP_SUB", 
 27 |                      "MEAN_NOT_SUB", 
 28 |                      "P_VALUE_SUB")
 29 | 
 30 | row_start <- 1
 31 | 
 32 | for (i in seq(1, length(seq_files))){
 33 |   file <- seq_files[i]
 34 |   name <- gsub("_snps_ref_complexity.tsv", "", file)
 35 |   seqs <- read.table(file = paste0(base_dir, file),
 36 |                      sep = "\t",
 37 |                      header = TRUE, 
 38 |                      stringsAsFactors = FALSE)
 39 |   
 40 |   seqs$hap_block <- factor(seqs$hap_block, levels = c("Y", "N"))
 41 |   
 42 |   distr <- ggplot(seqs, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) +
 43 |             geom_density(alpha = 0.5) +
 44 |             ggtitle(paste0(file, "\nN_hap: ", nrow(seqs[seqs$hap_block == "Y",]), "; N_not: ", nrow(seqs[seqs$hap_block == "N",]) ))
 45 |   
 46 | 
 47 |   pdf(paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/", name, "_complexity_distribution.pdf"), height = 3, width = 3.5)
 48 |   print(distr)
 49 |   dev.off()
 50 |   
 51 |   head(seqs)
 52 |   nrow(seqs)
 53 |   counts <- data.frame(table(seqs$hap_block))
 54 |   
 55 |   medians <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = median))
 56 |   means <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = mean))
 57 |   
 58 |   stats[i,"FILE"] <- file
 59 |   stats[i,"N_HAP"] <- counts[counts$Var1 == "Y", "Freq"]
 60 |   stats[i,"N_NOT"] <- counts[counts$Var1 == "N", "Freq"]
 61 |   
 62 |   if (nrow(medians[medians$hap_block == "Y",]) > 0){
 63 |   stats[i,"MEDIAN_HAP"] <- medians[medians$hap_block == "Y","REF_CPLX"]
 64 |   stats[i,"MEAN_HAP"] <- means[means$hap_block == "Y","REF_CPLX"]
 65 |   }
 66 |   if (nrow(medians[medians$hap_block == "N",]) > 0){
 67 |   stats[i,"MEDIAN_NOT"] <- medians[medians$hap_block == "N","REF_CPLX"]
 68 |   stats[i,"MEAN_NOT"] <- means[means$hap_block == "N","REF_CPLX"]
 69 |   }
 70 |   
 71 |   if ((nrow(medians[medians$hap_block == "N",]) > 0) & (nrow(medians[medians$hap_block == "Y",]) > 0)){
 72 |   stats[i,"P_VALUE"] <- (pairwise.wilcox.test(seqs$REF_CPLX,seqs$hap_block,  p.adjust.method = "BH"))$p.value
 73 |   }
 74 |   
 75 |   row_end <- sample_no*i
 76 |   if((nrow(seqs[seqs$hap_block == "Y",]) < sample_no) | (nrow(seqs[seqs$hap_block == "N",]) < sample_no)){
 77 |     print("not enough samples")
 78 |     hap_blocks[c(row_start:row_end),"REF"] <- file
 79 |     not_block[c(row_start:row_end),"REF"] <- file
 80 |   } else {
 81 |     seqs$hap_block <- as.character(seqs$hap_block)
 82 |     hap_samples <- sample_n(seqs[seqs$hap_block == "Y",], sample_no)
 83 |     hap_blocks[c(row_start:row_end),] <- hap_samples
 84 |     not_samples <- sample_n(seqs[seqs$hap_block == "N",], sample_no)
 85 |     not_block[c(row_start:row_end),] <- not_samples
 86 |     
 87 |     #also want to calculate the subsampled stats per alignment
 88 |     subsamples <- rbind(hap_samples, not_samples)
 89 |     
 90 |     medians_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = median))
 91 |     means_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = mean))
 92 |     
 93 |     stats[i,"MEDIAN_HAP_SUB"] <- medians_sub[medians_sub$hap_block == "Y","REF_CPLX"]
 94 |     stats[i,"MEAN_HAP_SUB"] <- means_sub[means_sub$hap_block == "Y","REF_CPLX"]
 95 |     
 96 |     stats[i,"MEDIAN_NOT_SUB"] <- medians_sub[medians_sub$hap_block == "N","REF_CPLX"]
 97 |     stats[i,"MEAN_NOT_SUB"] <- means_sub[means_sub$hap_block == "N","REF_CPLX"]
 98 |     
 99 |     stats[i,"P_VALUE_SUB"] <- (pairwise.wilcox.test(subsamples$REF_CPLX, subsamples$hap_block,  p.adjust.method = "BH"))$p.value
100 |   }
101 |   row_start <- row_end + 1
102 | }
103 | 
104 | write.table(stats, file = paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/stats_seq_complexity_per_chrom", sample_no, "random.txt"),
105 |             sep = "\t", 
106 |             col.names = TRUE, 
107 |             row.names = FALSE,
108 |             quote = FALSE)
109 | 
110 | head(hap_blocks)
111 | head(not_block)
112 | 
113 | haps_noNA <- hap_blocks[complete.cases(hap_blocks$P1),]
114 | haps_noNA$hap_block <- "Y"
115 | not_noNA <- not_block[complete.cases(not_block$P1),]
116 | not_noNA$hap_block <- "N"
117 | 
118 | combined <- rbind(haps_noNA, not_noNA)
119 | sink(paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/stats_seq_overall_", sample_no, "random.txt"))
120 | median(haps_noNA$REF_CPLX)
121 | median(not_noNA$REF_CPLX)
122 | 
123 | pairwise.wilcox.test(combined$REF_CPLX, combined$hap_block,  p.adjust.method = "BH")
124 | 
125 | sink()
126 | 
127 | ggplot(combined, aes(x = hap_block, y = REF_CPLX, group = hap_block)) +
128 |   geom_boxplot()
129 | 
130 | ggsave(paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/stats_seq_overall_", sample_no, "random_boxplot.png"))
131 | 
132 | #plot density distribution
133 | combined$hap_block <- factor(combined$hap_block, levels = c("Y", "N"))
134 | 
135 | distr_dens <- ggplot(combined, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) +
136 | geom_density(alpha = 0.5) +
137 |   ggtitle(paste0("6A random ", sample_no, "\nN_hap: ", nrow(combined[combined$hap_block == "Y",]), "; N_not: ", nrow(combined[combined$hap_block == "N",]) ))
138 | 
139 | pdf(paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/stats_seq_overall_", sample_no, "random_density.pdf"), height = 3, width = 3.5)
140 | print(distr_dens)
141 | dev.off()
142 | 
143 | ##now calculate for 2B
144 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/2B/complexity/"
145 | out_dir <- paste0(base_dir, "stats/")
146 | dir.create(out_dir)
147 | chrom <- "2B"
148 | seq_files <- list.files(base_dir, pattern = "tsv")
149 | 
150 | length(seq_files)
151 | sample_no <- 10000
152 | hap_blocks <- data.frame(matrix( nrow = length(seq_files)*sample_no, ncol = 6))
153 | colnames(hap_blocks) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX")
154 | not_block <- data.frame(matrix(nrow = length(seq_files)*sample_no, ncol = 6))
155 | colnames(not_block) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX")
156 | 
157 | stats <- data.frame(matrix(nrow = length(seq_files), ncol = 13))
158 | colnames(stats) <- c("FILE", 
159 |                      "N_HAP", 
160 |                      "N_NOT", 
161 |                      "MEDIAN_HAP", 
162 |                      "MEDIAN_NOT", 
163 |                      "MEAN_HAP", 
164 |                      "MEAN_NOT", 
165 |                      "P_VALUE", 
166 |                      "MEDIAN_HAP_SUB", 
167 |                      "MEDIAN_NOT_SUB", 
168 |                      "MEAN_HAP_SUB", 
169 |                      "MEAN_NOT_SUB", 
170 |                      "P_VALUE_SUB")
171 | 
172 | row_start <- 1
173 | 
174 | for (i in seq(1, length(seq_files))){
175 |   file <- seq_files[i]
176 |   name <- gsub("_snps_ref_complexity.tsv", "", file)
177 |   seqs <- read.table(file = paste0(base_dir, file),
178 |                      sep = "\t",
179 |                      header = TRUE, 
180 |                      stringsAsFactors = FALSE)
181 |   
182 |     seqs$hap_block <- factor(seqs$hap_block, levels = c("Y", "N"))
183 |   
184 |   distr <- ggplot(seqs, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) +
185 |     geom_density(alpha = 0.5) +
186 |     ggtitle(paste0(file, "\nN_hap: ", nrow(seqs[seqs$hap_block == "Y",]), "; N_not: ", nrow(seqs[seqs$hap_block == "N",]) ))
187 |   
188 |   pdf(paste0(out_dir, name, "_", chrom, "_complexity_distribution.pdf"), height = 3, width = 3.5)
189 |   print(distr)
190 |   dev.off()
191 |   
192 |   #ggsave(distr, file = paste0(out_dir, name, "_", chrom, "_complexity_distribution.png"), height = 5, width = 6)
193 |   
194 |   head(seqs)
195 |   nrow(seqs)
196 |   counts <- data.frame(table(seqs$hap_block))
197 |   
198 |   medians <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = median))
199 |   means <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = mean))
200 |   
201 |   stats[i,"FILE"] <- file
202 |   stats[i,"N_HAP"] <- counts[counts$Var1 == "Y", "Freq"]
203 |   stats[i,"N_NOT"] <- counts[counts$Var1 == "N", "Freq"]
204 |   
205 |   if (nrow(medians[medians$hap_block == "Y",]) > 0){
206 |     stats[i,"MEDIAN_HAP"] <- medians[medians$hap_block == "Y","REF_CPLX"]
207 |     stats[i,"MEAN_HAP"] <- means[means$hap_block == "Y","REF_CPLX"]
208 |   }
209 |   if (nrow(medians[medians$hap_block == "N",]) > 0){
210 |     stats[i,"MEDIAN_NOT"] <- medians[medians$hap_block == "N","REF_CPLX"]
211 |     stats[i,"MEAN_NOT"] <- means[means$hap_block == "N","REF_CPLX"]
212 |   }
213 |   
214 |   if ((nrow(medians[medians$hap_block == "N",]) > 0) & (nrow(medians[medians$hap_block == "Y",]) > 0)){
215 |     stats[i,"P_VALUE"] <- (pairwise.wilcox.test(seqs$REF_CPLX,seqs$hap_block,  p.adjust.method = "BH"))$p.value
216 |   }
217 |   
218 |   row_end <- sample_no*i
219 |   if((nrow(seqs[seqs$hap_block == "Y",]) < sample_no) | (nrow(seqs[seqs$hap_block == "N",]) < sample_no)){
220 |     print("not enough samples")
221 |     hap_blocks[c(row_start:row_end),"REF"] <- file
222 |     not_block[c(row_start:row_end),"REF"] <- file
223 |   } else {
224 |     seqs$hap_block <- as.character(seqs$hap_block)
225 |     hap_samples <- sample_n(seqs[seqs$hap_block == "Y",], sample_no)
226 |     hap_blocks[c(row_start:row_end),] <- hap_samples
227 |     not_samples <- sample_n(seqs[seqs$hap_block == "N",], sample_no)
228 |     not_block[c(row_start:row_end),] <- not_samples
229 |     
230 |     #also want to calculate the subsampled stats per alignment
231 |     subsamples <- rbind(hap_samples, not_samples)
232 |     
233 |     medians_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = median))
234 |     means_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = mean))
235 |     
236 |     stats[i,"MEDIAN_HAP_SUB"] <- medians_sub[medians_sub$hap_block == "Y","REF_CPLX"]
237 |     stats[i,"MEAN_HAP_SUB"] <- means_sub[means_sub$hap_block == "Y","REF_CPLX"]
238 |     
239 |     stats[i,"MEDIAN_NOT_SUB"] <- medians_sub[medians_sub$hap_block == "N","REF_CPLX"]
240 |     stats[i,"MEAN_NOT_SUB"] <- means_sub[means_sub$hap_block == "N","REF_CPLX"]
241 |     
242 |     stats[i,"P_VALUE_SUB"] <- (pairwise.wilcox.test(subsamples$REF_CPLX, subsamples$hap_block,  p.adjust.method = "BH"))$p.value
243 |   }
244 |   row_start <- row_end + 1
245 | }
246 | 
247 | write.table(stats, file = paste0(out_dir, "stats_seq_complexity_per_chrom_", chrom, sample_no, "random.txt"),
248 |             sep = "\t", 
249 |             col.names = TRUE, 
250 |             row.names = FALSE,
251 |             quote = FALSE)
252 | 
253 | head(hap_blocks)
254 | head(not_block)
255 | 
256 | haps_noNA <- hap_blocks[complete.cases(hap_blocks$P1),]
257 | haps_noNA$hap_block <- "Y"
258 | not_noNA <- not_block[complete.cases(not_block$P1),]
259 | not_noNA$hap_block <- "N"
260 | 
261 | combined <- rbind(haps_noNA, not_noNA)
262 | sink(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random.txt"))
263 | median(haps_noNA$REF_CPLX)
264 | median(not_noNA$REF_CPLX)
265 | 
266 | pairwise.wilcox.test(combined$REF_CPLX, combined$hap_block,  p.adjust.method = "BH")
267 | 
268 | sink()
269 | 
270 | ggplot(combined, aes(x = hap_block, y = REF_CPLX, group = hap_block)) +
271 |   geom_boxplot()
272 | 
273 | ggsave(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random_boxplot.png"))
274 | combined$hap_block <- factor(combined$hap_block, levels = c("Y", "N"))
275 | 
276 | distr_dens <- ggplot(combined, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) +
277 |   geom_density(alpha = 0.5) +
278 |   ggtitle(paste0(file, "\nN_hap: ", nrow(combined[combined$hap_block == "Y",]), "; N_not: ", nrow(combined[combined$hap_block == "N",]) ))
279 | 
280 | pdf(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random_density.pdf"), height = 3, width = 3.5)
281 | print(distr_dens)
282 | dev.off()
283 | 
284 | 
285 | 
286 | #Also want to remove any sequences that have any Ns in them from the calculations
287 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/2B/complexity/"
288 | out_dir <- paste0(base_dir, "stats/noN_unique/")
289 | dir.create(out_dir)
290 | chrom <- "2B"
291 | seq_files <- list.files(base_dir, pattern = "tsv")
292 | 
293 | length(seq_files)
294 | sample_no <- 10000
295 | hap_blocks <- data.frame(matrix( nrow = length(seq_files)*sample_no, ncol = 6))
296 | colnames(hap_blocks) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX")
297 | not_block <- data.frame(matrix(nrow = length(seq_files)*sample_no, ncol = 6))
298 | colnames(not_block) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX")
299 | 
300 | stats <- data.frame(matrix(nrow = length(seq_files), ncol = 13))
301 | colnames(stats) <- c("FILE", 
302 |                      "N_HAP", 
303 |                      "N_NOT", 
304 |                      "MEDIAN_HAP", 
305 |                      "MEDIAN_NOT", 
306 |                      "MEAN_HAP", 
307 |                      "MEAN_NOT", 
308 |                      "P_VALUE", 
309 |                      "MEDIAN_HAP_SUB", 
310 |                      "MEDIAN_NOT_SUB", 
311 |                      "MEAN_HAP_SUB", 
312 |                      "MEAN_NOT_SUB", 
313 |                      "P_VALUE_SUB")
314 | 
315 | row_start <- 1
316 | 
317 | for (i in seq(1, length(seq_files))){
318 |   file <- seq_files[i]
319 |   name <- gsub("_snps_ref_complexity.tsv", "", file)
320 |   seqs_raw <- read.table(file = paste0(base_dir, file),
321 |                      sep = "\t",
322 |                      header = TRUE, 
323 |                      stringsAsFactors = FALSE)
324 |   
325 |   seqs_noN <- seqs_raw[grep("N", seqs_raw$REF_SEQ),]
326 | 
327 |   seqs <- unique(seqs_noN)
328 |   
329 |   seqs$hap_block <- factor(seqs$hap_block, levels = c("Y", "N"))
330 |   
331 |   distr <- ggplot(seqs, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) +
332 |     geom_density(alpha = 0.5) +
333 |     ggtitle(paste0(file, "\nN_hap: ", nrow(seqs[seqs$hap_block == "Y",]), "; N_not: ", nrow(seqs[seqs$hap_block == "N",]) ))
334 |   
335 |   
336 |   ggsave(distr, file = paste0(out_dir, name, "_", chrom, "_complexity_distribution_noN_unique.png"), height = 5, width = 6)
337 |   
338 |   head(seqs)
339 |   nrow(seqs)
340 |   counts <- data.frame(table(seqs$hap_block))
341 |   
342 |   medians <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = median))
343 |   means <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = mean))
344 |   
345 |   stats[i,"FILE"] <- file
346 |   stats[i,"N_HAP"] <- counts[counts$Var1 == "Y", "Freq"]
347 |   stats[i,"N_NOT"] <- counts[counts$Var1 == "N", "Freq"]
348 |   
349 |   if (nrow(medians[medians$hap_block == "Y",]) > 0){
350 |     stats[i,"MEDIAN_HAP"] <- medians[medians$hap_block == "Y","REF_CPLX"]
351 |     stats[i,"MEAN_HAP"] <- means[means$hap_block == "Y","REF_CPLX"]
352 |   }
353 |   if (nrow(medians[medians$hap_block == "N",]) > 0){
354 |     stats[i,"MEDIAN_NOT"] <- medians[medians$hap_block == "N","REF_CPLX"]
355 |     stats[i,"MEAN_NOT"] <- means[means$hap_block == "N","REF_CPLX"]
356 |   }
357 |   
358 |   if ((nrow(medians[medians$hap_block == "N",]) > 0) & (nrow(medians[medians$hap_block == "Y",]) > 0)){
359 |     stats[i,"P_VALUE"] <- (pairwise.wilcox.test(seqs$REF_CPLX,seqs$hap_block,  p.adjust.method = "BH"))$p.value
360 |   }
361 |   
362 |   row_end <- sample_no*i
363 |   if((nrow(seqs[seqs$hap_block == "Y",]) < sample_no) | (nrow(seqs[seqs$hap_block == "N",]) < sample_no)){
364 |     print("not enough samples")
365 |     hap_blocks[c(row_start:row_end),"REF"] <- file
366 |     not_block[c(row_start:row_end),"REF"] <- file
367 |   } else {
368 |     seqs$hap_block <- as.character(seqs$hap_block)
369 |     hap_samples <- sample_n(seqs[seqs$hap_block == "Y",], sample_no)
370 |     hap_blocks[c(row_start:row_end),] <- hap_samples
371 |     not_samples <- sample_n(seqs[seqs$hap_block == "N",], sample_no)
372 |     not_block[c(row_start:row_end),] <- not_samples
373 |     
374 |     #also want to calculate the subsampled stats per alignment
375 |     subsamples <- rbind(hap_samples, not_samples)
376 |     
377 |     medians_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = median))
378 |     means_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = mean))
379 |     
380 |     stats[i,"MEDIAN_HAP_SUB"] <- medians_sub[medians_sub$hap_block == "Y","REF_CPLX"]
381 |     stats[i,"MEAN_HAP_SUB"] <- means_sub[means_sub$hap_block == "Y","REF_CPLX"]
382 |     
383 |     stats[i,"MEDIAN_NOT_SUB"] <- medians_sub[medians_sub$hap_block == "N","REF_CPLX"]
384 |     stats[i,"MEAN_NOT_SUB"] <- means_sub[means_sub$hap_block == "N","REF_CPLX"]
385 |     
386 |     stats[i,"P_VALUE_SUB"] <- (pairwise.wilcox.test(subsamples$REF_CPLX, subsamples$hap_block,  p.adjust.method = "BH"))$p.value
387 |   }
388 |   row_start <- row_end + 1
389 | }
390 | 
391 | write.table(stats, file = paste0(out_dir, "stats_seq_complexity_per_chrom_", chrom, sample_no, "random_noN_unique.txt"),
392 |             sep = "\t", 
393 |             col.names = TRUE, 
394 |             row.names = FALSE,
395 |             quote = FALSE)
396 | 
397 | head(hap_blocks)
398 | head(not_block)
399 | 
400 | haps_noNA <- hap_blocks[complete.cases(hap_blocks$P1),]
401 | haps_noNA$hap_block <- "Y"
402 | not_noNA <- not_block[complete.cases(not_block$P1),]
403 | not_noNA$hap_block <- "N"
404 | 
405 | combined <- rbind(haps_noNA, not_noNA)
406 | sink(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random_noN_unique.txt"))
407 | median(haps_noNA$REF_CPLX)
408 | median(not_noNA$REF_CPLX)
409 | 
410 | pairwise.wilcox.test(combined$REF_CPLX, combined$hap_block,  p.adjust.method = "BH")
411 | 
412 | sink()
413 | 
414 | 
415 | ggplot(combined, aes(x = hap_block, y = REF_CPLX, group = hap_block)) +
416 |   geom_boxplot()
417 | 
418 | ggsave(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random_boxplot_noN_unique.png"))
419 | 


--------------------------------------------------------------------------------
/figures/plot_extended_data_figure_1.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Supplemental_plots"
  3 | author: "Jemima Brinton"
  4 | date: "20/04/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | 
 11 | library(dplyr)
 12 | library(magrittr)
 13 | library(GenomicRanges)
 14 | library(ggplot2)
 15 | library(tidyr)
 16 | library(viridis)
 17 | library(stringr)
 18 | library(gridExtra)
 19 | 
 20 | 
 21 | #Read Delta functions and plotting functions (adapted) used are from https://jmonlong.github.io/Hippocamplus/2017/09/19/mummerplots-with-ggplot2/
 22 | #Reads in delta file output from nucmer into R and summarises into dataframe
 23 | readDelta <- function(deltafile){
 24 |   lines = scan(deltafile, 'a', sep='\n', quiet=TRUE)
 25 |   lines = lines[-1]
 26 |   lines.l = strsplit(lines, ' ')
 27 |   lines.len = lapply(lines.l, length) %>% as.numeric
 28 |   lines.l = lines.l[lines.len != 1]
 29 |   lines.len = lines.len[lines.len != 1]
 30 |   head.pos = which(lines.len == 4)
 31 |   head.id = rep(head.pos, c(head.pos[-1], length(lines.l)+1)-head.pos)
 32 |   mat = matrix(as.numeric(unlist(lines.l[lines.len==7])), 7)
 33 |   res = as.data.frame(t(mat[1:5,]))
 34 |   colnames(res) = c('rs','re','qs','qe','error')
 35 |   res$qid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 2))
 36 |   res$rid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 1)) %>% gsub('^>', '', .)
 37 |   res$strand = ifelse(res$qe-res$qs > 0, '+', '-')
 38 |   res
 39 | }
 40 | 
 41 | #calculates percentage identity and mid point for each mummer alignment
 42 | calculate_perc_id_mid_points <- function(data){
 43 |   data$r_length <- (data$re - data$rs)
 44 |   data$perc_id <- ((data$r_length - data$error)/data$r_length)*100
 45 |   data$perc_id_factor <- data$perc_id
 46 |   data[data$perc_id < 100, "perc_id_factor"] <- "<100"
 47 |   data$perc_id_factor <- as.factor(data$perc_id_factor)
 48 |   data$r_mid <- (data$rs + data$re)/2
 49 |   data$q_mid <- (data$qs + data$qe)/2
 50 |   return(data)
 51 | }
 52 | 
 53 | #reads in delta file, calculates percentage ID and filters for a minimum alignment size
 54 | pre_plot_analysis <- function(delta_path, min_size = 20000){
 55 |   data = readDelta(delta_path)
 56 |   data <- calculate_perc_id_mid_points(data)
 57 |   data_filt <- data[data$r_length >= min_size,]
 58 |   return(data_filt)
 59 | }
 60 | 
 61 | # Plots diagonal scatter plot of a nucmer pairwise alignment with points coloured by percentage identity of alignment - x axis is reference, y axis is query
 62 | plot_by_perc_id_cap <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){
 63 |   #order data by % ID to help with overplotting
 64 |   data <- data[order(data$perc_id),]
 65 |   ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=rs, xend=re, y=qs, yend=qe, colour=perc_id)) + geom_segment() +
 66 |     geom_point(alpha=.5) +
 67 |     theme_bw() + 
 68 |     theme(strip.text.y=element_text(angle=180, size=5),
 69 |           strip.background=element_blank()) +
 70 |     xlab(unique(data$rid)) + ylab(unique(data$qid)) +
 71 |     scale_colour_viridis(limits = c(cap_lower, cap_upper)) +
 72 |     scale_x_continuous(breaks=seq(xmin,xmax,100000000)) +
 73 |     scale_y_continuous(breaks=seq(0,max(data$qe),100000000))
 74 | }
 75 | 
 76 | plot_by_perc_id_cap_midpoint <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){
 77 |   #order data by % ID to help with overplotting
 78 |   data <- data[order(data$perc_id),]
 79 |   ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=r_mid, y=q_mid, colour=perc_id)) +
 80 |     geom_point(size = 0.3) +
 81 |     theme_bw() + 
 82 |     theme(strip.text.y=element_text(angle=180, size=5),
 83 |           strip.background=element_blank(),
 84 |           legend.position = "none",
 85 |           axis.title.x=element_blank(),
 86 |           axis.text.x=element_blank(),
 87 |           axis.title.y=element_blank(),
 88 |           axis.text.y=element_blank(),) +
 89 |     #xlab(unique(data$rid)) + ylab(unique(data$qid)) +
 90 |     scale_colour_viridis(limits = c(cap_lower, cap_upper)) +
 91 |     scale_x_continuous(breaks=seq(xmin,xmax,100000000)) +
 92 |     scale_y_continuous(breaks=seq(0,max(data$qe),100000000))+
 93 |     coord_cartesian(xlim=c(xmin,xmax))
 94 | }
 95 | 
 96 | 
 97 | plot_perc_id_v_ref <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100){
 98 |   ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id, colour=r_length)) +
 99 |     theme_bw() + xlab(data$rid) + ylab(paste0('percentage ID v ', data$qid)) +
100 |     geom_point(alpha=.5) +
101 |     ylim(ymin, ymax) +
102 |     scale_colour_viridis()
103 | }
104 | 
105 | plot_perc_id_v_ref_no_length <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100, breaks = 0.5){
106 |   ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id)) +
107 |     theme_bw() + 
108 |     #xlab(data$rid) + 
109 |    #ylab(paste0('percentage ID v ', data$qid)) +
110 |     theme(strip.text.y=element_text(angle=180, size=5),
111 |           strip.background=element_blank(),
112 |           legend.position = "none",
113 |           axis.title.x=element_blank(),
114 |           axis.text.x=element_blank(),
115 |           axis.title.y=element_blank(),
116 |           axis.text.y=element_blank(),) +
117 |     geom_point(alpha=.5, size = 0.5, colour = "gray25") +
118 |     scale_x_continuous(breaks=seq(xmin,xmax,100000000)) +
119 |     scale_y_continuous(limits = c(ymin, ymax), breaks=seq(ymin,ymax,breaks)) +
120 |     coord_cartesian(xlim=c(xmin,xmax))
121 | }
122 | 
123 | #bins alignment dataframe by reference position
124 | bin_data <- function(data, bin_size, max_chrom_size){
125 |   bins <- seq(0, max_chrom_size, by = bin_size)
126 |   data$bin <- NA
127 |   for (i in bins){
128 |     data$bin <- ifelse(((data$r_mid > (i-bin_size)) & (data$r_mid < (i-1))), i, data$bin)
129 |   }
130 |   return(data)
131 | }
132 | 
133 | #plots median percentage identity of each bin against reference position, colour of point is based on if the median percentage identity passes the identity threshold
134 | plot_medians_line_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){
135 |   comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
136 |   comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
137 |   colnames(comparison_medians)[2] <- "perc_id_median"
138 |   comparison_medians$cut_off <- NA
139 |   comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
140 |   comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
141 |   
142 |   ggplot(comparison_medians, aes(x=bin, y = perc_id_median, colour = cut_off)) +
143 |     geom_line(colour = "grey") + 
144 |     geom_point(size = 1)  + 
145 |     ylim(ymin,ymax) +
146 |     scale_colour_manual(values = c("#73D055FF", "#440154FF")) +
147 |     labs(colour = "% id") +
148 |     theme_bw() +
149 |     xlab(data$rid) +
150 |     
151 |     ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off))
152 | }
153 | 
154 | #plots boxplots of percentage identity of each bin against reference position, colour of boxplot is based on if the median percentage identity passes the identity threshold
155 | plot_boxplots_median_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){
156 |   comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
157 |   comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
158 |   colnames(comparison_medians)[2] <- "perc_id_median"
159 |   comparison_medians$cut_off <- NA
160 |   comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
161 |   comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
162 |   
163 |   comparison_to_plot <- merge(comparison_filt_bin, comparison_medians)
164 |   
165 |   ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) +
166 |     geom_boxplot(outlier.shape = NA, lwd = 0.3) +
167 |     scale_fill_manual(values = c("darkorchid4", "gold")) +
168 |     #labs(fill = "% id") +
169 |     #scale_colour_viridis() +
170 |     #xlab(data$rid) +
171 |     theme_bw() +
172 |     theme(strip.text.y=element_text(angle=180, size=5),
173 |           strip.background=element_blank(),
174 |           legend.position = "none",
175 |           axis.title.x=element_blank(),
176 |           axis.text.x=element_blank(),
177 |           axis.title.y=element_blank(),
178 |           axis.text.y=element_blank(),) +
179 |     #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) +
180 |     scale_x_continuous(breaks=seq(0,max(data$re),100000000)) +
181 |     scale_y_continuous(limits = c(ymin,ymax),breaks=seq(ymin,ymax,0.5))
182 | }
183 | 
184 | 
185 | ## END OF FUNCTIONS
186 | 
187 | ```
188 | 
189 | We want to zoom in on the junction in mace and stanley with the smaller window sizes
190 | 
191 | Read in the raw data
192 | 
193 | ```{r}
194 | 
195 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/"
196 | 
197 | min_size <- 20000
198 | cut_off <- 99.99
199 | 
200 | chr <- "6A"
201 | chrom <- paste0("chr", chr)
202 | 
203 | variety <- "mace"
204 | 
205 | data_dir_6A <- "X:/brintonj/6A_region/mummer/whole_6A_aln/"
206 | data_dir <- paste0(data_dir_6A, variety, "/")
207 | 
208 | all_files <- list.files(data_dir)
209 | 
210 | filtered_delta <- all_files[grep("_L20Kb_rq.delta", all_files)]
211 | 
212 | comparison <- "mace_v_stanley.all_6A_filtered_L20Kb_rq.delta"
213 | 
214 | comparison_delta_path <- paste0(data_dir, "/", comparison)
215 | comparison_id <- str_split_fixed(comparison, "\\.", 3)[1]
216 | 
217 | ref <- variety
218 | 
219 | if (ref == "sy_mattis"){
220 |   query <- str_split_fixed(comparison_id, "_", 4)[4]
221 | } else {
222 |   query <- str_split_fixed(comparison_id, "_", 3)[3]
223 | }
224 | 
225 | #read and filter for the minimum size
226 | comparison_filt <- pre_plot_analysis(delta_path = comparison_delta_path, min_size = min_size)
227 | print("data read in and filtered")
228 | head(comparison_filt)
229 | min(comparison_filt$perc_id)
230 | 
231 | x_max <- 610000000
232 | ```
233 | 
234 | Also read in the mummer blocks
235 | 
236 | ```{r}
237 | #read in the mummer_block coordinates
238 | mummer <- read.table("X:/brintonj/haplotype/blocks_updated_coords/converted_10g/corrected/whole_genome_mummer_BLAST_5000000_blocks_combined_updated_ref_coords_10g_corrected.tsv",
239 |                      header = TRUE, 
240 |                      stringsAsFactors = FALSE)
241 | 
242 | blocks_to_use <- mummer[(mummer$ref == "mace") & (mummer$query == "stanley") & (mummer$chrom == "chr6A") & (mummer$source == "mummer"),]
243 | 
244 | blocks_to_use
245 | ```
246 | 
247 | We are interested in the 1.85e+08	2.80e+08 block, the left hand junction at 1.85e+08
248 | 
249 | One level at a time...step down to the 2.5mbp blocks
250 | 
251 | ```{r}
252 | #read in the mummer_block coordinates
253 | 
254 | mummer_25mbp <- read.table("X:/brintonj/haplotype/blocks_updated_coords/converted_10g/corrected/no_spelta_2gap/whole_genome_mummer_BLAST_2500000_blocks_combined_updated_ref_coords_10g_corrected_2gap_no_spelta.tsv",
255 |                      header = TRUE, 
256 |                      stringsAsFactors = FALSE)
257 | 
258 | blocks_to_use_25m <- mummer_25mbp[(mummer_25mbp$ref == "mace") & (mummer_25mbp$query == "stanley") & (mummer_25mbp$chrom == "chr6A") & (mummer_25mbp$source == "mummer"),]
259 | 
260 | blocks_to_use_25m
261 | 
262 | head(mummer_25mbp)
263 | 
264 | ```
265 | 
266 | The 1.85e+08	2.80e+08	block is still there - lets try zooming into .150-2.50
267 | 
268 | 5Mbp bins first
269 | ```{r}
270 | 
271 | width = 4
272 | height = 3
273 | 
274 | data = comparison_filt
275 | bin_size = 5000000
276 | max_chrom_size = max(comparison_filt$re)
277 | alpha = 0.5
278 | fill = "#de87cd"
279 | cut_off = 99.99
280 | ymin = 99.9
281 | ymax = 100
282 | xmin = 150000000
283 | xmax = 250000000
284 | 
285 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
286 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
287 | colnames(comparison_medians)[2] <- "perc_id_median"
288 | comparison_medians$cut_off <- NA
289 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
290 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
291 |   
292 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians)
293 |   
294 | perc_boxplot_5M_shade_zoom <- ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) +
295 |   annotate("rect", xmin = blocks_to_use$ref_start-(bin_size*0.5), xmax = blocks_to_use$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) +
296 |     geom_boxplot(outlier.shape = NA, lwd = 0.3) +
297 |     scale_fill_manual(values = c("white", "gray50")) +
298 |     #labs(fill = "% id") +
299 |     #scale_colour_viridis() +
300 |     #xlab(data$rid) +
301 |     theme_bw() +
302 |     theme(strip.text.y=element_text(angle=180, size=5),
303 |           axis.text.x = element_text(angle = 90),
304 |           legend.position = "none")+#,
305 |           #strip.background=element_blank(),
306 |           #legend.position = "none",
307 |           #axis.title.x=element_blank(),
308 |           #axis.text.x=element_blank(),
309 |           #axis.title.y=element_blank(),
310 |           #axis.text.y=element_blank(),) +
311 |     #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) +
312 |     scale_x_continuous(breaks=seq(xmin, xmax, 10000000)) +
313 |     scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) +
314 |     coord_cartesian(ylim=c(ymin,ymax), xlim = c(xmin, xmax))
315 | 
316 | perc_boxplot_5M_shade_zoom
317 | 
318 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/transition_zone_Ex1_", bin_size, ".pdf"), width = width, height = height)
319 | print(perc_boxplot_5M_shade_zoom)
320 | dev.off()
321 | ```
322 | 
323 | ```{r}
324 | 
325 | 
326 | data = comparison_filt
327 | bin_size = 2500000
328 | max_chrom_size = max(comparison_filt$re)
329 | alpha = 0.5
330 | fill = "#de87cd"
331 | cut_off = 99.99
332 | ymin = 99.9
333 | ymax = 100
334 | xmin = 150000000
335 | xmax = 250000000
336 | 
337 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
338 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
339 | colnames(comparison_medians)[2] <- "perc_id_median"
340 | comparison_medians$cut_off <- NA
341 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
342 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
343 |   
344 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians)
345 |   
346 | perc_boxplot_5M_shade_zoom_25mbp <- ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) +
347 |   annotate("rect", xmin = blocks_to_use_25m$ref_start-(bin_size*0.5), xmax = blocks_to_use_25m$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) +
348 |     geom_boxplot(outlier.shape = NA, lwd = 0.3) +
349 |     scale_fill_manual(values = c("white", "gray50")) +
350 |     #labs(fill = "% id") +
351 |     #scale_colour_viridis() +
352 |     #xlab(data$rid) +
353 |     theme_bw() +
354 |     theme(strip.text.y=element_text(angle=180, size=5),
355 |           axis.text.x = element_text(angle = 90),
356 |           legend.position = "none")+#,
357 |           #strip.background=element_blank(),
358 |           #legend.position = "none",
359 |           #axis.title.x=element_blank(),
360 |           #axis.text.x=element_blank(),
361 |           #axis.title.y=element_blank(),
362 |           #axis.text.y=element_blank(),) +
363 |     #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) +
364 |     scale_x_continuous(breaks=seq(xmin, xmax, 10000000)) +
365 |     scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) +
366 |     coord_cartesian(ylim=c(ymin,ymax), xlim = c(xmin, xmax))
367 | 
368 | perc_boxplot_5M_shade_zoom_25mbp
369 | 
370 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/transition_zone_Ex1_", bin_size, ".pdf"), width = width, height = height)
371 | print(perc_boxplot_5M_shade_zoom_25mbp)
372 | dev.off()
373 | ```
374 | 
375 | 
376 | 
377 | 1mbp now
378 | 
379 | 
380 | 
381 | ```{r}
382 | #read in the mummer_block coordinates
383 | mummer_1mbp <- read.table("X:/brintonj/haplotype/blocks_updated_coords/converted_10g/corrected/no_spelta_2gap/whole_genome_mummer_BLAST_1e+06_blocks_combined_updated_ref_coords_10g_corrected_2gap_no_spelta.tsv",
384 |                      header = TRUE, 
385 |                      stringsAsFactors = FALSE)
386 | 
387 | blocks_to_use_1m <- mummer_1mbp[(mummer_1mbp$ref == "mace") & (mummer_1mbp$query == "stanley") & (mummer_1mbp$chrom == "chr6A") & (mummer_1mbp$source == "mummer"),]
388 | 
389 | blocks_to_use_1m
390 | 
391 | 
392 | 
393 | ```
394 | 
395 | zooming into .150-2.50
396 | 
397 | ```{r}
398 | 
399 | data = comparison_filt
400 | bin_size = 1000000
401 | max_chrom_size = max(comparison_filt$re)
402 | alpha = 0.5
403 | fill = "#de87cd"
404 | cut_off = 99.99
405 | ymin = 99.9
406 | ymax = 100
407 | xmin = 165000000
408 | xmax = 215000000
409 | 
410 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
411 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
412 | colnames(comparison_medians)[2] <- "perc_id_median"
413 | comparison_medians$cut_off <- NA
414 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
415 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
416 |   
417 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians)
418 |   
419 | perc_boxplot_5M_shade_zoom_1mbp <- ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) +
420 |   annotate("rect", xmin = blocks_to_use_1m$ref_start-(bin_size*0.5), xmax = blocks_to_use_1m$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) +
421 |     geom_boxplot(outlier.shape = NA, lwd = 0.3) +
422 |     scale_fill_manual(values = c("white", "gray50")) +
423 |     #labs(fill = "% id") +
424 |     #scale_colour_viridis() +
425 |     #xlab(data$rid) +
426 |     theme_bw() +
427 |     theme(strip.text.y=element_text(angle=180, size=5),
428 |           axis.text.x = element_text(angle = 90),
429 |           legend.position = "none")+#,
430 |           #strip.background=element_blank(),
431 |           #legend.position = "none",
432 |           #axis.title.x=element_blank(),
433 |           #axis.text.x=element_blank(),
434 |           #axis.title.y=element_blank(),
435 |           #axis.text.y=element_blank(),) +
436 |     #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) +
437 |     scale_x_continuous(breaks=seq(xmin, xmax, 5000000)) +
438 |     scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) +
439 |     coord_cartesian(ylim=c(ymin,ymax), xlim = c(xmin, xmax))
440 | 
441 | perc_boxplot_5M_shade_zoom_1mbp
442 | 
443 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/transition_zone_Ex1_", bin_size, ".pdf"), width = width, height = height)
444 | print(perc_boxplot_5M_shade_zoom_1mbp)
445 | dev.off()
446 | ```
447 | 
448 | 
449 | 
450 | Try plotting the alignments in this border region with some indication of percentage ID
451 | ```{r}
452 | data = comparison_filt
453 | bin_size = 250000
454 | max_chrom_size = max(comparison_filt$re)
455 | alpha = 0.5
456 | fill = "#de87cd"
457 | cut_off = 99.99
458 | ymin = 99.9
459 | ymax = 100
460 | xmin = 186000000
461 | xmax = 188000000
462 | 
463 | aln_plot <- ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=rs, xend=re, y=perc_id, yend=perc_id)) +
464 |   annotate("rect", xmin = blocks_to_use_1m$ref_start-(bin_size*0.5), xmax = blocks_to_use_1m$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) +
465 |   geom_segment() +
466 |     #geom_point(alpha=.5) +
467 |     theme_bw() + 
468 |     scale_x_continuous(breaks=seq(xmin, xmax, 100000)) +
469 |     scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) +
470 |   coord_cartesian(ylim=c(ymin,ymax), xlim = c(xmin, xmax)) +
471 |   theme(axis.text.x = element_text(angle = 90))
472 | 
473 | aln_plot
474 | 
475 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/transition_zone_Ex1_alignments.pdf"), width = width, height = height)
476 | print(aln_plot)
477 | dev.off()
478 | 
479 | ```
480 | 
481 | 


--------------------------------------------------------------------------------
/figures/figure_1_plots.rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Plot figure 1 plots"
  3 | author: "Jemima Brinton"
  4 | date: "08/04/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | 
 11 | ## generate plots for figure 1
 12 | ## using the mace/stanley 6A alignment example
 13 | 
 14 | ## FUNCTIONS
 15 | library(dplyr)
 16 | library(magrittr)
 17 | library(GenomicRanges)
 18 | library(ggplot2)
 19 | library(tidyr)
 20 | library(viridis)
 21 | library(stringr)
 22 | library(gridExtra)
 23 | 
 24 | 
 25 | #Read Delta functions and plotting functions (adapted) used are from https://jmonlong.github.io/Hippocamplus/2017/09/19/mummerplots-with-ggplot2/
 26 | #Reads in delta file output from nucmer into R and summarises into dataframe
 27 | readDelta <- function(deltafile){
 28 |   lines = scan(deltafile, 'a', sep='\n', quiet=TRUE)
 29 |   lines = lines[-1]
 30 |   lines.l = strsplit(lines, ' ')
 31 |   lines.len = lapply(lines.l, length) %>% as.numeric
 32 |   lines.l = lines.l[lines.len != 1]
 33 |   lines.len = lines.len[lines.len != 1]
 34 |   head.pos = which(lines.len == 4)
 35 |   head.id = rep(head.pos, c(head.pos[-1], length(lines.l)+1)-head.pos)
 36 |   mat = matrix(as.numeric(unlist(lines.l[lines.len==7])), 7)
 37 |   res = as.data.frame(t(mat[1:5,]))
 38 |   colnames(res) = c('rs','re','qs','qe','error')
 39 |   res$qid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 2))
 40 |   res$rid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 1)) %>% gsub('^>', '', .)
 41 |   res$strand = ifelse(res$qe-res$qs > 0, '+', '-')
 42 |   res
 43 | }
 44 | 
 45 | #calculates percentage identity and mid point for each mummer alignment
 46 | calculate_perc_id_mid_points <- function(data){
 47 |   data$r_length <- (data$re - data$rs)
 48 |   data$perc_id <- ((data$r_length - data$error)/data$r_length)*100
 49 |   data$perc_id_factor <- data$perc_id
 50 |   data[data$perc_id < 100, "perc_id_factor"] <- "<100"
 51 |   data$perc_id_factor <- as.factor(data$perc_id_factor)
 52 |   data$r_mid <- (data$rs + data$re)/2
 53 |   data$q_mid <- (data$qs + data$qe)/2
 54 |   return(data)
 55 | }
 56 | 
 57 | #reads in delta file, calculates percentage ID and filters for a minimum alignment size
 58 | pre_plot_analysis <- function(delta_path, min_size = 20000){
 59 |   data = readDelta(delta_path)
 60 |   data <- calculate_perc_id_mid_points(data)
 61 |   data_filt <- data[data$r_length >= min_size,]
 62 |   return(data_filt)
 63 | }
 64 | 
 65 | # Plots diagonal scatter plot of a nucmer pairwise alignment with points coloured by percentage identity of alignment - x axis is reference, y axis is query
 66 | plot_by_perc_id_cap <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){
 67 |   #order data by % ID to help with overplotting
 68 |   data <- data[order(data$perc_id),]
 69 |   ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=rs, xend=re, y=qs, yend=qe, colour=perc_id)) + geom_segment() +
 70 |     geom_point(alpha=.5) +
 71 |     theme_bw() + 
 72 |     theme(strip.text.y=element_text(angle=180, size=5),
 73 |           strip.background=element_blank()) +
 74 |     xlab(unique(data$rid)) + ylab(unique(data$qid)) +
 75 |     scale_colour_viridis(limits = c(cap_lower, cap_upper)) +
 76 |     scale_x_continuous(breaks=seq(xmin,xmax,100000000)) +
 77 |     scale_y_continuous(breaks=seq(0,max(data$qe),100000000))
 78 | }
 79 | 
 80 | plot_by_perc_id_cap_midpoint <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){
 81 |   #order data by % ID to help with overplotting
 82 |   data <- data[order(data$perc_id),]
 83 |   ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=r_mid, y=q_mid, colour=perc_id)) +
 84 |     geom_point(size = 0.3) +
 85 |     theme_bw() + 
 86 |     theme(strip.text.y=element_text(angle=180, size=5),
 87 |           strip.background=element_blank(),
 88 |           legend.position = "none",
 89 |           axis.title.x=element_blank(),
 90 |           axis.text.x=element_blank(),
 91 |           axis.title.y=element_blank(),
 92 |           axis.text.y=element_blank(),) +
 93 |     #xlab(unique(data$rid)) + ylab(unique(data$qid)) +
 94 |     scale_colour_viridis(limits = c(cap_lower, cap_upper)) +
 95 |     scale_x_continuous(breaks=seq(xmin,xmax,100000000)) +
 96 |     scale_y_continuous(breaks=seq(0,max(data$qe),100000000))+
 97 |     coord_cartesian(xlim=c(xmin,xmax))
 98 | }
 99 | 
100 | 
101 | plot_perc_id_v_ref <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100){
102 |   ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id, colour=r_length)) +
103 |     theme_bw() + xlab(data$rid) + ylab(paste0('percentage ID v ', data$qid)) +
104 |     geom_point(alpha=.5) +
105 |     ylim(ymin, ymax) +
106 |     scale_colour_viridis()
107 | }
108 | 
109 | plot_perc_id_v_ref_no_length <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100, breaks = 0.5){
110 |   ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id)) +
111 |     theme_bw() + 
112 |     #xlab(data$rid) + 
113 |    #ylab(paste0('percentage ID v ', data$qid)) +
114 |     theme(strip.text.y=element_text(angle=180, size=5),
115 |           strip.background=element_blank(),
116 |           legend.position = "none",
117 |           axis.title.x=element_blank(),
118 |           axis.text.x=element_blank(),
119 |           axis.title.y=element_blank(),
120 |           axis.text.y=element_blank(),) +
121 |     geom_point(alpha=.5, size = 0.5, colour = "gray25") +
122 |     scale_x_continuous(breaks=seq(xmin,xmax,100000000)) +
123 |     scale_y_continuous(limits = c(ymin, ymax), breaks=seq(ymin,ymax,breaks)) +
124 |     coord_cartesian(xlim=c(xmin,xmax))
125 | }
126 | 
127 | #bins alignment dataframe by reference position
128 | bin_data <- function(data, bin_size, max_chrom_size){
129 |   bins <- seq(0, max_chrom_size, by = bin_size)
130 |   data$bin <- NA
131 |   for (i in bins){
132 |     data$bin <- ifelse(((data$r_mid > (i-bin_size)) & (data$r_mid < (i-1))), i, data$bin)
133 |   }
134 |   return(data)
135 | }
136 | 
137 | #plots median percentage identity of each bin against reference position, colour of point is based on if the median percentage identity passes the identity threshold
138 | plot_medians_line_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){
139 |   comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
140 |   comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
141 |   colnames(comparison_medians)[2] <- "perc_id_median"
142 |   comparison_medians$cut_off <- NA
143 |   comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
144 |   comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
145 |   
146 |   ggplot(comparison_medians, aes(x=bin, y = perc_id_median, colour = cut_off)) +
147 |     geom_line(colour = "grey") + 
148 |     geom_point(size = 1)  + 
149 |     ylim(ymin,ymax) +
150 |     scale_colour_manual(values = c("#73D055FF", "#440154FF")) +
151 |     labs(colour = "% id") +
152 |     theme_bw() +
153 |     xlab(data$rid) +
154 |     
155 |     ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off))
156 | }
157 | 
158 | #plots boxplots of percentage identity of each bin against reference position, colour of boxplot is based on if the median percentage identity passes the identity threshold
159 | plot_boxplots_median_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){
160 |   comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
161 |   comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
162 |   colnames(comparison_medians)[2] <- "perc_id_median"
163 |   comparison_medians$cut_off <- NA
164 |   comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
165 |   comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
166 |   
167 |   comparison_to_plot <- merge(comparison_filt_bin, comparison_medians)
168 |   
169 |   ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) +
170 |     geom_boxplot(outlier.shape = NA, lwd = 0.3) +
171 |     scale_fill_manual(values = c("darkorchid4", "gold")) +
172 |     #labs(fill = "% id") +
173 |     #scale_colour_viridis() +
174 |     #xlab(data$rid) +
175 |     theme_bw() +
176 |     theme(strip.text.y=element_text(angle=180, size=5),
177 |           strip.background=element_blank(),
178 |           legend.position = "none",
179 |           axis.title.x=element_blank(),
180 |           axis.text.x=element_blank(),
181 |           axis.title.y=element_blank(),
182 |           axis.text.y=element_blank(),) +
183 |     #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) +
184 |     scale_x_continuous(breaks=seq(0,max(data$re),100000000)) +
185 |     scale_y_continuous(limits = c(ymin,ymax),breaks=seq(ymin,ymax,0.5))
186 | }
187 | 
188 | 
189 | ## END OF FUNCTIONS
190 | ```
191 | 
192 | ```{r}
193 | 
194 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/"
195 | 
196 | min_size <- 20000
197 | cut_off <- 99.99
198 | 
199 | chr <- "6A"
200 | chrom <- paste0("chr", chr)
201 | 
202 | variety <- "mace"
203 | 
204 | data_dir_6A <- "X:/brintonj/6A_region/mummer/whole_6A_aln/"
205 | data_dir <- paste0(data_dir_6A, variety, "/")
206 | 
207 | all_files <- list.files(data_dir)
208 | 
209 | filtered_delta <- all_files[grep("_L20Kb_rq.delta", all_files)]
210 | 
211 | comparison <- "mace_v_stanley.all_6A_filtered_L20Kb_rq.delta"
212 | 
213 | comparison_delta_path <- paste0(data_dir, "/", comparison)
214 | comparison_id <- str_split_fixed(comparison, "\\.", 3)[1]
215 | 
216 | ref <- variety
217 | 
218 | if (ref == "sy_mattis"){
219 |   query <- str_split_fixed(comparison_id, "_", 4)[4]
220 | } else {
221 |   query <- str_split_fixed(comparison_id, "_", 3)[3]
222 | }
223 | 
224 | #read and filter for the minimum size
225 | comparison_filt <- pre_plot_analysis(delta_path = comparison_delta_path, min_size = min_size)
226 | print("data read in and filtered")
227 | head(comparison_filt)
228 | min(comparison_filt$perc_id)
229 | 
230 | x_max <- 610000000
231 | ```
232 | 
233 | 
234 | 
235 | ```{r}
236 | diagonal_dot_plot_mid <- plot_by_perc_id_cap_midpoint(comparison_filt, cap_lower = 97, cap_upper = 100, xmax = x_max)
237 | diagonal_dot_plot_mid
238 | ```
239 | 
240 | ```{r}
241 | diagonal_dot_plot_mid_legend <- plot_by_perc_id_cap(comparison_filt, cap_lower = 97, cap_upper = 100, xmax = x_max)
242 | diagonal_dot_plot_mid_legend
243 | 
244 | ```
245 | 
246 | 
247 | 
248 | ```{r}
249 | perc_dot_plot_no_length <- plot_perc_id_v_ref_no_length(comparison_filt, ymin = 97, xmax = x_max)
250 |         
251 | perc_dot_plot_no_length
252 | ```
253 | ```{r}
254 | perc_boxplot_5M <- plot_boxplots_median_colour_cut_off(comparison_filt, bin_size = 5000000, max_chrom_size = max(comparison_filt$re))
255 | perc_boxplot_5M
256 | ```
257 | 
258 | 
259 | Add some shading in the background of the boxplots where haplotype blocks are called
260 | 
261 | ```{r}
262 | #read in the mummer_block coordinates
263 | mummer <- read.table("X:/brintonj/haplotype/blocks_updated_coords/converted_10g/corrected/whole_genome_mummer_BLAST_5000000_blocks_combined_updated_ref_coords_10g_corrected_2gap_no_spelta.tsv",
264 |                      header = TRUE, 
265 |                      stringsAsFactors = FALSE)
266 | 
267 | blocks_to_use <- mummer[(mummer$ref == "mace") & (mummer$query == "stanley") & (mummer$chrom == "chr6A") & (mummer$source == "mummer"),]
268 | 
269 | blocks_to_use
270 | ```
271 | 
272 | 
273 | Focus just on the 99.9 - 100 % points
274 | 
275 | Dot plot
276 | 
277 | ```{r}
278 | perc_dot_plot_no_length_zoom <- plot_perc_id_v_ref_no_length(comparison_filt, ymin = 99.9, breaks = 0.01, xmax = x_max)
279 |         
280 | perc_dot_plot_no_length_zoom
281 | ```
282 | 
283 | Boxplots
284 | ```{r}
285 | 
286 | data = comparison_filt
287 | bin_size = 5000000
288 | max_chrom_size = max(comparison_filt$re)
289 | alpha = 0.5
290 | fill = "#de87cd"
291 | cut_off = 99.99
292 | ymin = 99.9
293 | ymax = 100
294 | 
295 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size)
296 |   comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median)
297 |   colnames(comparison_medians)[2] <- "perc_id_median"
298 |   comparison_medians$cut_off <- NA
299 |   comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off))
300 |   comparison_medians$cut_off <- as.factor(comparison_medians$cut_off)
301 |   
302 |   comparison_to_plot <- merge(comparison_filt_bin, comparison_medians)
303 |   
304 | perc_boxplot_5M_shade_zoom <- ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) +
305 |     #geom_rect(aes(xmin=blocks_to_use[1,"ref_start"]+bin_size,xmax=blocks_to_use[1,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) +
306 |   #geom_rect(aes(xmin=blocks_to_use[2,"ref_start"]+bin_size,xmax = blocks_to_use[2,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) +
307 |   #geom_rect(aes(xmin=blocks_to_use[3,"ref_start"]+bin_size,xmax = blocks_to_use[3,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) +
308 |   #geom_rect(aes(xmin=blocks_to_use[4,"ref_start"]+bin_size,xmax = blocks_to_use[4,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) +
309 |   #geom_rect(aes(xmin=blocks_to_use[5,"ref_start"]+bin_size,xmax = blocks_to_use[5,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) +
310 |   annotate("rect", xmin = blocks_to_use$ref_start+(bin_size*0.5), xmax = blocks_to_use$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) +
311 |     geom_boxplot(outlier.shape = NA, lwd = 0.3) +
312 |     scale_fill_manual(values = c("white", "gray50")) +
313 |     #labs(fill = "% id") +
314 |     #scale_colour_viridis() +
315 |     #xlab(data$rid) +
316 |     theme_bw() +
317 |     theme(strip.text.y=element_text(angle=180, size=5),
318 |           strip.background=element_blank(),
319 |           legend.position = "none",
320 |           axis.title.x=element_blank(),
321 |           axis.text.x=element_blank(),
322 |           axis.title.y=element_blank(),
323 |           axis.text.y=element_blank(),) +
324 |     #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) +
325 |     scale_x_continuous(breaks=seq(0,x_max,100000000)) +
326 |     scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) +
327 |     coord_cartesian(ylim=c(ymin,ymax), xlim = c(0,x_max))
328 | 
329 | perc_boxplot_5M_shade_zoom
330 | ```
331 | ```{r}
332 | #ggsave(combined, "Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots.png", dpi = 600, height = 10, width = 10)
333 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_shade.png", res = 1500, width = 4.3, height = 3.8, units = "in")
334 | grid.arrange(diagonal_dot_plot_mid, perc_dot_plot_no_length, perc_boxplot_5M_shade, nrow=3)
335 | dev.off()
336 | 
337 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_shade_wide.png", res = 1500, width = 6.5, height = 4, units = "in")
338 | grid.arrange(diagonal_dot_plot_mid, perc_dot_plot_no_length, perc_boxplot_5M_shade, nrow=3)
339 | dev.off()
340 | 
341 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_shade_zoomed_wide.png", res = 1500, width = 6.5, height = (4/3)*4, units = "in")
342 | grid.arrange(diagonal_dot_plot_mid, perc_dot_plot_no_length, perc_dot_plot_no_length_zoom, perc_boxplot_5M_shade_zoom, nrow=4)
343 | dev.off()
344 | 
345 | ## also want to save them separately
346 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_diagonal.png", res = 1500, width = 6.5, height = (4/3), units = "in")
347 | print(diagonal_dot_plot_mid)
348 | dev.off()
349 | 
350 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_perc_dot.png", res = 1500, width = 6.5, height = (4/3), units = "in")
351 | print(perc_dot_plot_no_length)
352 | dev.off()
353 | 
354 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_perc_dot_zoom.png", res = 1500, width = 6.5, height = (4/3), units = "in")
355 | print(perc_dot_plot_no_length_zoom)
356 | dev.off()
357 | 
358 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_boxplot_zoom.png", res = 1500, width = 6.5, height = (4/3), units = "in")
359 | print(perc_boxplot_5M_shade_zoom)
360 | dev.off()
361 | 
362 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_diagonal_with_legend.png", res = 1500, width = 6.5, height = 4, units = "in")
363 | print(diagonal_dot_plot_mid_legend)
364 | dev.off()
365 | 
366 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_diagonal_with_legend_alt.png", res = 1500, width = 6.5, height = 4, units = "in")
367 | print(diagonal_dot_plot_mid_legend)
368 | dev.off()
369 | 
370 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_diagonal_short.png", res = 1500, width = 6.5, height = 1, units = "in")
371 | print(diagonal_dot_plot_mid)
372 | dev.off()
373 | ```
374 | 
375 | Next we want to generate the heatmaps from the BLAST with the different amounts of flanking sequence
376 | 
377 | ```{r}
378 | blast_data_dir <- "W:/ramirezr/SM1/pairwise_blast_oct_2019/"
379 | files_to_run <- list.files(blast_data_dir, pattern = "varieties_6A_identites")
380 | 
381 | bp0 <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_0bp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
382 | bp1000 <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_1000bp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
383 | bp2000 <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_2000bp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
384 | bp5000 <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_5000bp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
385 | cds <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_cdsbp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
386 | 
387 | all_flank <- rbind(cds, bp5000, bp2000, bp1000, bp0)
388 | 
389 | mace_stanley <- all_flank[all_flank$aln_type == "mace->stanley",]
390 | mace_stanley <- mace_stanley[order(mace_stanley$transcript),]
391 | #add the capped identity
392 | mace_stanley$capped_ident <- mace_stanley$pident
393 | mace_stanley[mace_stanley$capped_ident < 100, "capped_ident"] <- 0
394 | mace_stanley$Flanking <- factor(mace_stanley$Flanking, levels = c("5000", "2000", "1000", "0", "cds"))
395 | mace_stanley$capped_ident <- factor(as.character(mace_stanley$capped_ident), levels = c("0", "100"))
396 | mace_stanley$capped_ident_noN <- mace_stanley$capped_ident
397 | mace_stanley[mace_stanley$Ns_total > 0, "capped_ident_noN"] <- NA
398 | mace_stanley_expand <- data.frame(complete(mace_stanley, transcript, Flanking))
399 | 
400 | 
401 | mace_stanley_noN <- mace_stanley[mace_stanley$Ns_total == 0,]
402 | 
403 | table(mace_stanley_noN$Flanking)
404 | mace_stanley_noN_expand <- data.frame(complete(mace_stanley_noN, transcript, Flanking))
405 | ```
406 | 
407 | Try plotting all together this will show which transcripts get removed with the Ns
408 | 
409 | ```{r}
410 | colours = c("0" = "gray", "100" = "black")
411 | na_col <- "white"
412 | combined_heatmap <- ggplot(mace_stanley_expand , aes(x=factor(transcript),y=Flanking)) +
413 | 		geom_tile(aes(fill = capped_ident_noN)) +
414 | 		scale_fill_manual(values = colours, na.value = na_col) +
415 |     theme_bw() + 
416 |     theme(strip.text.y=element_text(angle=180, size=5),
417 |           strip.background=element_blank(),
418 |           legend.position = "none",
419 |           axis.title.x=element_blank(),
420 |           axis.text.x=element_blank(),
421 |           axis.title.y=element_blank(),
422 |           axis.text.y=element_blank())
423 |           
424 | 
425 | combined_heatmap 
426 | 
427 | png("Y:/Publications/Haplotypes/Figures/figure_1_combined_heatmaps.png", res = 1500, width = 6.5, height = 2, units = "in")
428 | print(combined_heatmap)
429 | dev.off()
430 | 
431 | combined_heatmap_axes <- ggplot(mace_stanley_expand , aes(x=factor(transcript),y=Flanking)) +
432 | 		geom_tile(aes(fill = capped_ident_noN)) +
433 | 		scale_fill_manual(values = colours, na.value = na_col) +
434 |     theme_bw() + 
435 |     theme(strip.text.y=element_text(angle=180, size=5),
436 |           strip.background=element_blank(),
437 |           #legend.position = "none",
438 |           axis.title.x=element_blank(),
439 |           axis.text.x=element_blank(),
440 |           axis.title.y=element_blank())
441 |           #axis.text.y=element_blank())
442 |           
443 | 
444 | combined_heatmap_axes 
445 | 
446 | png("Y:/Publications/Haplotypes/Figures/figure_1_combined_heatmaps_axes.png", res = 1500, width = 6.5, height = 2, units = "in")
447 | print(combined_heatmap_axes)
448 | dev.off()
449 | ```
450 | 
451 | Try plotting separately
452 | 
453 | ```{r}
454 | flanks <- unique(mace_stanley_expand$Flanking)
455 | 
456 | for (flank in flanks){
457 |   flank_data <- mace_stanley_expand[mace_stanley_expand$Flanking == flank, ]
458 |   
459 |   flank_data_noN <- flank_data[flank_data$Ns_total == 0,]
460 |   flank_data_noN <- flank_data_noN[complete.cases(flank_data_noN$Ns_total),]
461 |   
462 |   heatmap_axes <- ggplot(flank_data_noN , aes(x=factor(transcript),y=Flanking)) +
463 | 		geom_tile(aes(fill = capped_ident_noN)) +
464 | 		scale_fill_manual(values = colours, na.value = na_col) +
465 |     theme_bw() + 
466 |     theme(strip.text.y=element_text(angle=180, size=5),
467 |           strip.background=element_blank(),
468 |           #legend.position = "none",
469 |           axis.title.x=element_blank(),
470 |           axis.text.x=element_blank(),
471 |           axis.title.y=element_blank())
472 |           #axis.text.y=element_blank())
473 |           
474 | 
475 | png(paste0("Y:/Publications/Haplotypes/Figures/figure_1_", flank, "_heatmap_axes.png"), res = 1500, width = 6.5, height = 2, units = "in")
476 | print(heatmap_axes)
477 | dev.off()
478 | 
479 |   heatmap <- ggplot(flank_data_noN , aes(x=factor(transcript),y=Flanking)) +
480 | 		geom_tile(aes(fill = capped_ident_noN)) +
481 | 		scale_fill_manual(values = colours, na.value = na_col) +
482 |     theme_bw() + 
483 |     theme(strip.text.y=element_text(angle=180, size=5),
484 |           strip.background=element_blank(),
485 |           legend.position = "none",
486 |           axis.title.x=element_blank(),
487 |           axis.text.x=element_blank(),
488 |           axis.title.y=element_blank(),
489 |           axis.text.y=element_blank())
490 |           
491 | 
492 | png(paste0("Y:/Publications/Haplotypes/Figures/figure_1_", flank, "_heatmap.png"), res = 1500, width = 6.5, height = 0.75, units = "in")
493 | print(heatmap)
494 | dev.off()
495 | }
496 | ```
497 | 
498 | 
499 | 


--------------------------------------------------------------------------------
/figures/figure_4_plots.rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "figure_3_plots"
  3 | author: "Jemima Brinton"
  4 | date: "04/05/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | 
 11 | library(stringr)
 12 | library(ggplot2)
 13 | library(GenomicRanges)
 14 | library(ggbio)
 15 | library(viridis)
 16 | library(reshape2)
 17 | library("ggalluvial")
 18 | library(RColorBrewer)
 19 | 
 20 | 
 21 | convert_long_01 <- function(data){
 22 |   converted <- data.frame(marker=character(), variable = character(),  value = numeric(),pos = numeric())
 23 |   markers <- unique(data$marker)
 24 | 
 25 |   for (i in seq(1, length(markers))){
 26 |     marker <- markers[i]
 27 |     marker_data <- data[data$marker == marker,]
 28 |     snp_calls <- unique(marker_data$value)
 29 |     alleles <- snp_calls[(snp_calls %in% c("A", "C", "T", "G"))]
 30 |     #make sure chinese spring allele is always 0
 31 |     CS_allele <- marker_data[marker_data$variable == "CHINESE", "value"]
 32 |     alt_allele <- alleles[!(alleles %in% CS_allele)]
 33 |     marker_data$value <- gsub(CS_allele, "0", marker_data$value)
 34 |     marker_data$value <- gsub(alt_allele, "1", marker_data$value)
 35 |     converted <- rbind(converted, marker_data)
 36 |   }
 37 |   converted$value <- gsub("N", "NA", converted$value)
 38 |   converted$value <- as.numeric(converted$value)
 39 |   return(converted)
 40 | }
 41 | 
 42 | plot_capture_specified_order_small <- function(data, start, end, outfile_prefix, order_list){
 43 |   data_zoom <- data[(data$pos > start) & (data$pos < end),]
 44 |   data_zoom$variable <- factor(x = data_zoom$variable, levels = order_list, ordered = TRUE)
 45 |   plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) +
 46 | 		geom_tile(aes(fill = value), color = "gray") +
 47 | 		#scale_fill_distiller(limits=c(0,1), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "allele") +
 48 |     #scale_fill_continuous(type = "viridis") +
 49 |     scale_fill_gradient(low = "grey70", high = "grey20", na.value = "white", aesthetics = "fill") +
 50 |     theme_bw() +
 51 | 		theme(axis.text.x=element_text(angle=90, hjust=1)) #+
 52 |     #geom_point( aes( x = "TraesCS6A02G189400.1", y=aln_type, colour = "red" ) )
 53 |   #ggsave(plot = plot, file = paste0(outfile_prefix, "_heatmap.png"), dpi = 600, height = 6, width = 6)
 54 |   return(plot)
 55 | }
 56 | ```
 57 | 
 58 | 
 59 | Ok now lets make the plots for the different genotyping platforms
 60 | 
 61 | ```{r}
 62 | cerealsdb_manual <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/figures/35K_cerealsdb_hap_examples.csv", 
 63 |                            sep = ",", 
 64 |                            header = TRUE, 
 65 |                            stringsAsFactors = FALSE)
 66 | 
 67 | cerealsdb_manual_long <- melt(cerealsdb_manual, id.vars = c("marker", "pos"))
 68 | 
 69 | cerealsdb_manual_01 <- convert_long_01(cerealsdb_manual_long)
 70 | 
 71 | cerealsdb_order <- c("CHINESE" , "CADENZA", "PARAGON", "NORIN61", "LANCER", "CLAIRE", "JAGGER", "SY", "ROBIGUS", "ARINA", "JULIUS", "WEEBIL", "LANDMARK", "MACE", "STANLEY")
 72 | 
 73 | to_plot_subset <- subset(cerealsdb_manual_01, variable %in% cerealsdb_order)
 74 | 
 75 | 
 76 | plot_manual_35k <- plot_capture_specified_order_small(to_plot_subset, 187000000, 445000000, outfile_prefix = "test", order_list = rev(cerealsdb_order))
 77 | plot_manual_35k
 78 | 
 79 | plot_manual_35k_no_axes <- plot_manual_35k +
 80 |   theme(strip.text.y=element_text(angle=180, size=5),
 81 |           strip.background=element_blank(),
 82 |           legend.position = "none",
 83 |           axis.title.x=element_blank(),
 84 |           axis.text.x=element_blank(),
 85 |           axis.title.y=element_blank(),
 86 |           axis.text.y=element_blank())
 87 | 
 88 | plot_manual_35k_no_axes
 89 | ```
 90 | 
 91 | Do this for the vos fells as well
 92 | 
 93 | ```{r}
 94 | VF_manual <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/figures/voss_fells_hap_examples.csv", 
 95 |                            sep = ",", 
 96 |                            header = TRUE, 
 97 |                            stringsAsFactors = FALSE)
 98 | 
 99 | VF_manual_long <- melt(VF_manual, id.vars = c("marker", "pos"))
100 | 
101 | VF_manual_01 <- convert_long_01(VF_manual_long)
102 | 
103 | VF_order <- c("CHINESE" , "CADENZA", "PARAGON", "NORIN61", "LANCER", "CLAIRE", "JAGGER", "SY", "ROBIGUS", "ARINA", "JULIUS", "WEEBIL", "LANDMARK", "MACE", "STANLEY", "X9K1")
104 | 
105 | to_plot_subset <- subset(VF_manual_01, variable %in% VF_order)
106 | 
107 | plot_manual_VF <- plot_capture_specified_order_small(to_plot_subset, 187000000, 445000000, outfile_prefix = "test", order_list = rev(VF_order))
108 | plot_manual_VF
109 | 
110 | plot_manual_VF_no_axes <- plot_manual_VF +
111 |   theme(strip.text.y=element_text(angle=180, size=5),
112 |           strip.background=element_blank(),
113 |           legend.position = "none",
114 |           axis.title.x=element_blank(),
115 |           axis.text.x=element_blank(),
116 |           axis.title.y=element_blank(),
117 |           axis.text.y=element_blank())
118 | 
119 | plot_manual_VF_no_axes
120 | ```
121 | 
122 | Now the akhunov data
123 | 
124 | ```{r}
125 | akhunov_manual <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/figures/akhunov_hap_examples.csv", 
126 |                            sep = ",", 
127 |                            header = TRUE, 
128 |                            stringsAsFactors = FALSE)
129 | 
130 | akhunov_manual_long <- melt(akhunov_manual, id.vars = c("marker", "pos"))
131 | 
132 | akhunov_order <- c("CHINESE" , "CADENZA", "PARAGON", "NORIN61", "LANCER", "CLAIRE", "JAGGER", "SY", "ROBIGUS", "ARINA", "JULIUS", "WEEBIL", "LANDMARK", "MACE", "STANLEY", "Ex1", "Ex2", "Ex3")
133 | 
134 | to_plot_subset <- subset(akhunov_manual_long, variable %in% akhunov_order)
135 | 
136 | plot_manual_akhunov <- plot_capture_specified_order_small(to_plot_subset, 187000000, 445000000, outfile_prefix = "test", order_list = rev(akhunov_order))
137 | plot_manual_akhunov
138 | 
139 | plot_manual_akhunov_no_axes <- plot_manual_akhunov +
140 |   theme(strip.text.y=element_text(angle=180, size=5),
141 |           strip.background=element_blank(),
142 |           legend.position = "none",
143 |           axis.title.x=element_blank(),
144 |           axis.text.x=element_blank(),
145 |           axis.title.y=element_blank(),
146 |           axis.text.y=element_blank())
147 | 
148 | plot_manual_akhunov_no_axes
149 | ```
150 | 
151 | Now the SNP distributions
152 | ```{r}
153 | akhunov_pos <- read.table("X:/brintonj/6A_region/akhunov_capture/all.GP08_mm75_het3_publication01142019.vcf.gz_all6A_HCLC_GT_only.GT.FORMAT_position_only.tsv",
154 |                        sep = "\t",
155 |                        header = TRUE, 
156 |                        stringsAsFactors = FALSE)
157 | head(akhunov_pos)
158 | nrow(akhunov_pos)
159 | nrow(unique(akhunov_pos))
160 | 
161 | akhunov_unfiltered_pos <- read.table("X:/brintonj/6A_region/akhunov_capture/all.GP08_mm75_het3_publication01142019.vcf.gz_all6A_GT_only.GT.FORMAT_position_only.tsv",
162 |                        sep = "\t",
163 |                        header = TRUE, 
164 |                        stringsAsFactors = FALSE)
165 | head(akhunov_unfiltered_pos)
166 | nrow(akhunov_unfiltered_pos)
167 | nrow(unique(akhunov_unfiltered_pos))
168 | 
169 | breeders <- read.table("Y:/Jemima/6A/haplotype/35K/35k_probe_set_IWGSCv1.csv",
170 |                        sep = ",",
171 |                        header = TRUE, 
172 |                        stringsAsFactors = FALSE)
173 | head(breeders)
174 | 
175 | breeders_6A_pos <- breeders[breeders$IWGSC_v1_Chromosome == "chr6A", c(2,3)]
176 | colnames(breeders_6A_pos) <- colnames(akhunov_pos)
177 | 
178 | nrow(breeders_6A_pos)
179 | nrow(unique(breeders_6A_pos))
180 | breeders_6A_pos$POS <- as.numeric(breeders_6A_pos$POS)
181 | head(breeders_6A_pos)
182 | 
183 | inifinium <- read.table("Y:/Jemima/6A/haplotype/Vos-Fels et al 2019/voss_fells_marker_positions_6A.csv",
184 |                        sep = ",",
185 |                        header = TRUE, 
186 |                        stringsAsFactors = FALSE)
187 | 
188 | head(inifinium)
189 | 
190 | inifinium_pos <- inifinium[,c("chrom", "pos")]
191 | colnames(inifinium_pos) <- colnames(akhunov_pos)
192 | 
193 | nrow(inifinium_pos)
194 | nrow(unique(inifinium_pos))
195 | inifinium_pos$POS <- as.numeric(inifinium_pos$POS)
196 | head(inifinium_pos)
197 | 
198 | pangenome_snps <- read.table("Y:/Publications/Haplotypes/Figures/figure_3/pangenome_snp_dist_6A_CS_ref.tsv",
199 |                              sep = "\t",
200 |                              header = TRUE,
201 |                              stringsAsFactors = FALSE)
202 | 
203 | nrow(pangenome_snps)
204 | nrow(unique(pangenome_snps))
205 | colnames(pangenome_snps) <- colnames(akhunov_pos)
206 | pangenome_snps$POS <- as.numeric(pangenome_snps$POS)
207 | head(pangenome_snps)
208 | ```
209 | 
210 | Plots 
211 | ```{r}
212 | 
213 | 
214 | pangenome_snps <- ggplot(pangenome_snps, aes(x = POS)) +
215 |   geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) +
216 |   theme_bw() +
217 |   scale_x_continuous(breaks=seq(0,CS_6A_len,100000000))
218 | 
219 | 
220 | pangenome_snps
221 | 
222 | pangenome_snps_no_axes <- pangenome_snps +
223 |   theme(strip.text.y=element_text(angle=180, size=5),
224 |           strip.background=element_blank(),
225 |           legend.position = "none",
226 |           axis.title.x=element_blank(),
227 |           axis.text.x=element_blank(),
228 |           axis.title.y=element_blank(),
229 |           axis.text.y=element_blank())
230 | 
231 | pangenome_snps_no_axes
232 | 
233 | ```
234 | 
235 | ```{r}
236 | 
237 | 
238 | akhunov_snps <- ggplot(akhunov_pos, aes(x = POS)) +
239 |   geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) +
240 |   theme_bw() +
241 |   scale_x_continuous(breaks=seq(0,CS_6A_len,100000000))
242 | 
243 | akhunov_snps
244 | 
245 | akhunov_snps_no_axes <- akhunov_snps +
246 |   theme(strip.text.y=element_text(angle=180, size=5),
247 |           strip.background=element_blank(),
248 |           legend.position = "none",
249 |           axis.title.x=element_blank(),
250 |           axis.text.x=element_blank(),
251 |           axis.title.y=element_blank(),
252 |           axis.text.y=element_blank())
253 | 
254 | akhunov_snps_no_axes
255 | 
256 | ```
257 | 
258 | ```{r}
259 | 
260 | 
261 | akhunov_unfilt_snps <- ggplot(akhunov_unfiltered_pos, aes(x = POS)) +
262 |   geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) +
263 |   theme_bw() +
264 |   scale_x_continuous(breaks=seq(0,CS_6A_len,100000000))
265 | 
266 | akhunov_unfilt_snps
267 | 
268 | akhunov_unfilt_snps_no_axes <- akhunov_unfilt_snps +
269 |   theme(strip.text.y=element_text(angle=180, size=5),
270 |           strip.background=element_blank(),
271 |           legend.position = "none",
272 |           axis.title.x=element_blank(),
273 |           axis.text.x=element_blank(),
274 |           axis.title.y=element_blank(),
275 |           axis.text.y=element_blank())
276 | 
277 | akhunov_unfilt_snps_no_axes
278 | 
279 | ```
280 | 
281 | ```{r}
282 | breeders_snps <- ggplot(breeders_6A_pos, aes(x = POS)) +
283 |   geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) +
284 |   theme_bw() +
285 |   scale_x_continuous(breaks=seq(0,CS_6A_len,100000000))
286 | breeders_snps
287 | 
288 | breeders_snps_no_axes <- breeders_snps +
289 |   theme(strip.text.y=element_text(angle=180, size=5),
290 |           strip.background=element_blank(),
291 |           legend.position = "none",
292 |           axis.title.x=element_blank(),
293 |           axis.text.x=element_blank(),
294 |           axis.title.y=element_blank(),
295 |           axis.text.y=element_blank())
296 | 
297 | breeders_snps_no_axes
298 | ```
299 | 
300 | ```{r}
301 | 
302 | inifinium_snps <- ggplot(inifinium_pos, aes(x = POS)) +
303 |   geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) +
304 |   theme_bw() +
305 |   scale_x_continuous(breaks=seq(0,CS_6A_len,100000000))
306 | 
307 | print(inifinium_snps)
308 | 
309 | inifinium_snps_no_axes <- inifinium_snps +
310 |   theme(strip.text.y=element_text(angle=180, size=5),
311 |           strip.background=element_blank(),
312 |           legend.position = "none",
313 |           axis.title.x=element_blank(),
314 |           axis.text.x=element_blank(),
315 |           axis.title.y=element_blank(),
316 |           axis.text.y=element_blank())
317 | 
318 | inifinium_snps_no_axes
319 | 
320 | ```
321 | 
322 | Now we want to plots the genotypes of the panels
323 | 
324 | ```{r}
325 | #calls coming from the public datasets
326 | haplotype_calls <- read.table(file = "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/pangenome_haplotype_allocations_35K_all_AK_USA_capture_vossfells_updated_May2020_FINAL_SET_TO_INCLUDE.csv",
327 |                               sep = ",",
328 |                               header = TRUE)
329 | 
330 | 
331 | ##calls with the haplotype-specific markers
332 | marker_calls <- read.table(file = "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/watkins_analysis/final_haplotypes_watkins_UKRL.csv",
333 |                               sep = ",",
334 |                               header = TRUE)
335 | 
336 | head(marker_calls)
337 | 
338 | ```
339 | 
340 | ```{r}
341 | 
342 | haplotype_colours <- c("#000080",
343 |                        "#5FC6BC",
344 |                        "#008080", 
345 |                        "#FF7F2a",
346 |                        "#5F5Fd3", 
347 |                        "#55DDFF", 
348 |                        "#A793AC", 
349 |                        "#DE87CD",
350 |                        "#C287DE", 
351 |                        "#D9AEED", 
352 |                        "#FFE680", 
353 |                        "#FF5555", 
354 |                        "#B3DE69",
355 |                        "#B3B3B3")
356 | 
357 | 
358 | haplotype_names <- c("H1",
359 |                      "H1/2",
360 |                      "H2",
361 |                      "H3",
362 |                      "H4",
363 |                      "H5",
364 |                      "H6",
365 |                      "H7",
366 |                      "H5/6/7",
367 |                      "H4/5/6/7",
368 |                      "35K1",
369 |                      "35K2",
370 |                      "wat_specific",
371 |                      "other")
372 | 
373 | names(haplotype_colours) <- haplotype_names
374 | 
375 | haplotype_calls_complete <- haplotype_calls[!(haplotype_calls$nonPG_simple_hap == "poor_data"),]
376 | 
377 | haplotype_calls_complete$nonPG_simple_hap <- factor(haplotype_calls_complete$nonPG_simple_hap, levels = haplotype_names)
378 | haplotype_calls_complete$group <- factor(haplotype_calls_complete$group, levels = c("UK_RL", "voss_fells", "Australia", "USA","CIMCOG", "Watkins"))
379 | 
380 | head(haplotype_calls_complete)
381 | 
382 | ```
383 | 
384 | Try plotting using proportions rather than absolute values
385 | ```{r}
386 | 
387 | haplotype_calls_complete <- haplotype_calls_complete[complete.cases(haplotype_calls_complete$nonPG_simple_hap),]
388 | hap_counts <- data.frame(table(haplotype_calls_complete[,c("group", "nonPG_simple_hap")]))
389 | 
390 | group_counts <- data.frame(table(haplotype_calls_complete$group))
391 | colnames(group_counts) <- c("group", "total_var")
392 | hap_counts <- merge(hap_counts, group_counts, all.x = TRUE, all.y = FALSE)
393 | hap_counts$perc <- (hap_counts$Freq/hap_counts$total_var)*100
394 | 
395 | group_counts
396 | 
397 | hap_counts$group <- factor(hap_counts$group, levels = c("UK_RL", "voss_fells", "Australia", "USA","CIMCOG", "Watkins"))
398 | 
399 | plot <- ggplot(hap_counts, aes(x=group, y = perc, fill = nonPG_simple_hap)) + 
400 |         geom_bar(stat = "identity", width = 0.7) +
401 |         ylab("% of varieties") +
402 |         scale_x_discrete(labels = c(paste0("UK RL \n (n=", group_counts[group_counts$group == "UK_RL","total_var"], ")"),
403 |                                     paste0("European \n (n=", group_counts[group_counts$group == "voss_fells","total_var"], ")"),
404 |                                     paste0("Australia \n (n=", group_counts[group_counts$group == "Australia","total_var"], ")"),
405 |                                     paste0("USA \n (n=", group_counts[group_counts$group == "USA","total_var"], ")"),
406 |                                     paste0("CIMCOG \n (n=", group_counts[group_counts$group == "CIMCOG","total_var"], ")"),
407 |                                     paste0("Watkins \n (n=", group_counts[group_counts$group == "Watkins","total_var"], ")"))) +
408 |         scale_fill_manual(name = "Haplotype", values = haplotype_colours) + theme_bw()
409 | plot
410 | 
411 | plot_no_watkins <- ggplot(hap_counts[!(hap_counts$group == "Watkins"),], aes(x=group, y = perc, fill = nonPG_simple_hap)) + 
412 |         geom_bar(stat = "identity", width = 0.7) +
413 |         ylab("% of varieties") +
414 |         scale_x_discrete(labels = c(paste0("UK RL \n (n=", group_counts[group_counts$group == "UK_RL","total_var"], ")"),
415 |                                     paste0("European \n (n=", group_counts[group_counts$group == "voss_fells","total_var"], ")"),
416 |                                     paste0("Australia \n (n=", group_counts[group_counts$group == "Australia","total_var"], ")"),
417 |                                     paste0("USA \n (n=", group_counts[group_counts$group == "USA","total_var"], ")"),
418 |                                     paste0("CIMCOG \n (n=", group_counts[group_counts$group == "CIMCOG","total_var"], ")"))) +
419 |         scale_fill_manual(name = "Haplotype", values = haplotype_colours) + theme_bw()
420 | 
421 | plot_no_watkins
422 | 
423 | plot_no_watkins_no_axes <- plot_no_watkins +
424 |   theme(strip.text.y=element_text(angle=180, size=5),
425 |           strip.background=element_blank(),
426 |           legend.position = "none",
427 |           axis.title.x=element_blank(),
428 |           axis.text.x=element_blank(),
429 |           axis.title.y=element_blank(),
430 |           axis.text.y=element_blank())
431 | 
432 | plot_no_watkins_no_axes
433 | 
434 | ```
435 | 
436 | Now the watkins alluvial plots
437 | 
438 | ```{r}
439 | #melt the dataframe
440 | #remove_poor_data in both  data sets
441 | marker_calls_no_missing <- marker_calls[!((marker_calls$previous_summary == "poor_data") | (marker_calls$marker_summary == "poor_data") | (marker_calls$previous_summary == "no_data")), ]
442 | marker_melt <- melt(marker_calls_no_missing, id.vars = c("line", "group"))
443 | 
444 | 
445 | head(marker_melt)
446 | tail(marker_melt)
447 | 
448 | 
449 | names(haplotype_colours) <- haplotype_names
450 | 
451 | 
452 | marker_wat_summary <- marker_melt[(marker_melt$group == "watkins") & (marker_melt$variable == "previous_summary" |  marker_melt$variable == "marker_summary"),]
453 | marker_wat_summary$value <- factor(marker_wat_summary$value, levels = names(haplotype_colours))
454 | 
455 | unique(marker_wat_summary$value)
456 | watkins_old_to_new <- ggplot(marker_wat_summary, aes(x = variable, stratum = value, alluvium = line, fill = value)) +
457 |   geom_flow() +
458 |   geom_stratum() +
459 |   scale_fill_manual(name = "Haplotype", values = haplotype_colours) +
460 |   theme_bw()
461 | 
462 | watkins_old_to_new
463 | 
464 | watkins_old_to_new_no_axes <- watkins_old_to_new +
465 |   theme(strip.text.y=element_text(angle=180, size=5),
466 |           strip.background=element_blank(),
467 |           legend.position = "none",
468 |           axis.title.x=element_blank(),
469 |           axis.text.x=element_blank(),
470 |           axis.title.y=element_blank(),
471 |           axis.text.y=element_blank())
472 | 
473 | watkins_old_to_new_no_axes
474 | ```
475 | 
476 | Now just plot the watkins specific
477 | 
478 | Plot european data over time
479 | ```{r}
480 | head(haplotype_calls_complete)
481 | 
482 | voss_fells <- subset(haplotype_calls_complete, group %in% "voss_fells")
483 | 
484 | plot_over_time <- ggplot(voss_fells, aes(x = year, fill = nonPG_simple_hap)) + 
485 |   geom_histogram(position = "stack", binwidth=5, col = "black") +
486 |   xlab("Year of release (5 year bins)") +
487 |   ylab("Number of varieties") +
488 |   scale_fill_manual(name = "Haplotype", values = haplotype_colours, labels = c("H1/2", "H3", "H4", "H5", "H6", "H7", "other")) + theme_bw()
489 | plot_over_time
490 | 
491 | ```
492 | 
493 | ```{r}
494 | width = 10
495 | height = 1
496 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/summarised_conserved_regions_cap5_6A.pdf", width = width, height = height)
497 | print(summarised_conserved)
498 | dev.off()
499 | 
500 | width = 4
501 | height = 4
502 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_35K_axes.pdf", width = width, height = height)
503 | print(plot_manual_35k)
504 | dev.off()
505 | 
506 | width = 4
507 | height = 4
508 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_35K.pdf", width = width, height = height)
509 | print(plot_manual_35k_no_axes)
510 | dev.off()
511 | 
512 | width = 4
513 | height = 4
514 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_VF_axes.pdf", width = width, height = height)
515 | print(plot_manual_VF)
516 | dev.off()
517 | 
518 | width = 4
519 | height = 4
520 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_VF.pdf", width = width, height = height)
521 | print(plot_manual_VF_no_axes)
522 | dev.off()
523 | 
524 | width = 4
525 | height = 4
526 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_akhunov_axes.pdf", width = width, height = height)
527 | print(plot_manual_akhunov)
528 | dev.off()
529 | 
530 | width = 4
531 | height = 4
532 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_akhunov.pdf", width = width, height = height)
533 | print(plot_manual_akhunov_no_axes)
534 | dev.off()
535 | 
536 | width = 8
537 | height = 2
538 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/akhunov_snp_positions_5mbpbin.pdf", width = width, height = height)
539 | print(akhunov_snps)
540 | dev.off()
541 | 
542 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/akhunov_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height)
543 | print(akhunov_snps_no_axes)
544 | dev.off()
545 | 
546 | width = 8
547 | height = 2
548 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/akhunov_unfiltered_snp_positions_5mbpbin.pdf", width = width, height = height)
549 | print(akhunov_unfilt_snps)
550 | dev.off()
551 | 
552 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/akhunov_unfiltered_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height)
553 | print(akhunov_unfilt_snps_no_axes)
554 | dev.off()
555 | 
556 | 
557 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/9K_snp_positions_5mbpbin.pdf", width = width, height = height)
558 | print(inifinium_snps)
559 | dev.off()
560 | 
561 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/9K_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height)
562 | print(inifinium_snps_no_axes)
563 | dev.off()
564 | 
565 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/35K_snp_positions_5mbpbin.pdf", width = width, height = height)
566 | print(breeders_snps)
567 | dev.off()
568 | 
569 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/35K_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height)
570 | print(breeders_snps_no_axes)
571 | dev.off()
572 | 
573 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/pangenome_snp_positions_5mbpbin.pdf", width = width, height = height)
574 | print(pangenome_snps)
575 | dev.off()
576 | 
577 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/pangenome_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height)
578 | print(pangenome_snps_no_axes)
579 | dev.off()
580 | 
581 | width = 6
582 | height = 4
583 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/panels_public_datasets_calls_no_watkins_no_axes.pdf", width = width, height = height)
584 | print(plot_no_watkins_no_axes)
585 | dev.off()
586 | 
587 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/panels_public_datasets_calls_no_watkins.pdf", width = width, height = height)
588 | print(plot_no_watkins)
589 | dev.off()
590 | 
591 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/hap_specific_markers_watkins_summary_no_axes.pdf", width = width, height = height)
592 | print(watkins_old_to_new_no_axes)
593 | dev.off()
594 | 
595 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/hap_specific_markers_watkins_summary.pdf", width = width, height = height)
596 | print(watkins_old_to_new)
597 | dev.off()
598 | 
599 | 
600 | width = 1.5
601 | height = 4
602 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/hap_specific_markers_watkins_specific_no_axes.pdf", width = width, height = height)
603 | print(watkins_specific_no_axes)
604 | dev.off()
605 | 
606 | width = 5
607 | height = 4
608 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/european_data_over_time.pdf", width = width, height = height)
609 | print(plot_over_time)
610 | dev.off()
611 | 
612 | ```
613 | 


--------------------------------------------------------------------------------