├── example ├── example_create_CNVR │ ├── results │ │ ├── cnvr_create.txt │ │ ├── cnvr_boundary.txt │ │ ├── cnvr_clean.txt │ │ ├── cnv.penncnv.txt │ │ ├── cnv.ipattern.txt │ │ └── cnv.quantisnp.txt │ ├── data │ │ ├── centromere_hg19.txt │ │ ├── CNV.PennCNV_new.txt │ │ └── iPattern_all_calls.txt │ └── README.md ├── example_CNV_genotype │ ├── RDS │ │ ├── BAF │ │ │ └── matrix_chr_1_BAF.rds │ │ └── LRR │ │ │ └── matrix_chr_1_LRR.rds │ ├── results │ │ ├── pars │ │ │ └── CNVR_pars_chr_1_batch_2.rds │ │ ├── stats │ │ │ └── CNVR_79_r1_chr1_p_stat.rds │ │ ├── png │ │ │ ├── diag │ │ │ │ └── diag_CNVR_79_r1_chr1_p.png │ │ │ ├── steps │ │ │ │ ├── steps_1_CNVR_79_r1_chr1_p.png │ │ │ │ └── steps_2_CNVR_79_r1_chr1_p.png │ │ │ ├── heatmap │ │ │ │ └── heatmap_CNVR_79_r1_chr1_p.png │ │ │ └── summary │ │ │ │ └── summary_CNVR_79_r1_chr1_p.png │ │ └── pred │ │ │ └── chr_1_batch_2 │ │ │ └── CNVR_79_r1_chr1_p_pred.rds │ ├── data │ │ ├── cnvr_batch.txt │ │ └── duplicate_pairs.txt │ └── README.md └── example_boundary_refinement │ ├── RDS │ └── LRR │ │ └── matrix_chr_2_LRR.rds │ ├── results │ ├── res_refine │ │ └── chr2 │ │ │ ├── data │ │ │ └── CNVR_refine_chr_2_detail.rds │ │ │ └── png │ │ │ └── CNVR_163_r1_chr2_q_boundary_refinement.png │ └── cnvr_refine.txt │ ├── data │ ├── centromere_hg19.txt │ └── SNP_pos.txt │ └── README.md ├── 01_initial_call ├── run_iPattern │ ├── ref_files_hg19 │ │ └── pq.txt │ ├── prepare_input_files_for_iPattern.R │ └── README.md ├── run_QuantiSNP │ ├── step.3.combine.QuantiSNP.pl │ ├── README.md │ ├── step.1.prepare.QuantiSNP.R │ └── step.2.check.QuantiSNP.R ├── run_PennCNV │ ├── step.4.combine.PennCNV.res.pl │ ├── step.5.clean.PennCNV.res.R │ ├── step.3.check.PennCNV.jobs.R │ ├── step.2.run.PennCNV.jobs.R │ └── README.md └── finalreport_to_matrix_LRR_and_BAF │ └── transform_from_tab_to_rds.R ├── 04_CNV_genotype ├── scripts │ ├── fun_plot_heatmap.R │ ├── fun_gatk.R │ ├── fun_plot_diagnosis.R │ ├── fun_LRR.R │ ├── fun_BAF.R │ ├── fun_plot_steps.R │ └── fun_pipeline_main.R ├── step.1.split.cnvrs.into.batches.R ├── step.4.prediction.results.R ├── step.2.submit.jobs.R ├── step.3.check.and.resubmit.jobs.R └── CNV.genotype.one.chr.one.batch.R ├── 02_batch_effect ├── PCA_on_LRR │ ├── step.1.down.sampling.R │ ├── step.3.LRR.pca.R │ └── step.2.LRR.matrix.pl └── PCA_on_summary_stats │ ├── step.2.stats.PCA.R │ └── step.1.prepare.stats.R ├── 05_boundary_refinement ├── refine.cpp ├── step.4.update.genotype.matrix.R ├── step.1.common.CNVR.to.refine.R ├── step.2.submit.jobs.R └── step.3.clean.results.R ├── create_new_project.sh ├── 06_performance_assessment ├── step.2.set.GQ.generate.results.R └── step.1.performance.assessment.R └── 03_create_CNVR └── step.1.CNV.data.R /example/example_create_CNVR/results/cnvr_create.txt: -------------------------------------------------------------------------------- 1 | CNVR_ID outer.start outer.end nCNV chr arm 2 | CNVR_1_r1_chr1_p 11259 11268 233 1 p 3 | -------------------------------------------------------------------------------- /example/example_CNV_genotype/RDS/BAF/matrix_chr_1_BAF.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/RDS/BAF/matrix_chr_1_BAF.rds -------------------------------------------------------------------------------- /example/example_CNV_genotype/RDS/LRR/matrix_chr_1_LRR.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/RDS/LRR/matrix_chr_1_LRR.rds -------------------------------------------------------------------------------- /example/example_boundary_refinement/RDS/LRR/matrix_chr_2_LRR.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_boundary_refinement/RDS/LRR/matrix_chr_2_LRR.rds -------------------------------------------------------------------------------- /example/example_CNV_genotype/results/pars/CNVR_pars_chr_1_batch_2.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/pars/CNVR_pars_chr_1_batch_2.rds -------------------------------------------------------------------------------- /example/example_CNV_genotype/results/stats/CNVR_79_r1_chr1_p_stat.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/stats/CNVR_79_r1_chr1_p_stat.rds -------------------------------------------------------------------------------- /example/example_CNV_genotype/results/png/diag/diag_CNVR_79_r1_chr1_p.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/diag/diag_CNVR_79_r1_chr1_p.png -------------------------------------------------------------------------------- /example/example_CNV_genotype/results/png/steps/steps_1_CNVR_79_r1_chr1_p.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/steps/steps_1_CNVR_79_r1_chr1_p.png -------------------------------------------------------------------------------- /example/example_CNV_genotype/results/png/steps/steps_2_CNVR_79_r1_chr1_p.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/steps/steps_2_CNVR_79_r1_chr1_p.png -------------------------------------------------------------------------------- /example/example_CNV_genotype/results/png/heatmap/heatmap_CNVR_79_r1_chr1_p.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/heatmap/heatmap_CNVR_79_r1_chr1_p.png -------------------------------------------------------------------------------- /example/example_CNV_genotype/results/png/summary/summary_CNVR_79_r1_chr1_p.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/summary/summary_CNVR_79_r1_chr1_p.png -------------------------------------------------------------------------------- /example/example_CNV_genotype/results/pred/chr_1_batch_2/CNVR_79_r1_chr1_p_pred.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/pred/chr_1_batch_2/CNVR_79_r1_chr1_p_pred.rds -------------------------------------------------------------------------------- /example/example_create_CNVR/results/cnvr_boundary.txt: -------------------------------------------------------------------------------- 1 | CNVR_ID outer.start outer.end nCNV chr arm posStart posEnd start_snp end_snp nPeak 2 | CNVR_1_r1_chr1_p 11259 11268 233 1 p 25598276 25642596 rs2517979 rs28393458 1 3 | -------------------------------------------------------------------------------- /example/example_create_CNVR/results/cnvr_clean.txt: -------------------------------------------------------------------------------- 1 | CNVR_ID outer.start outer.end nCNV chr arm posStart posEnd start_snp end_snp nPeak Freq 2 | CNVR_1_r1_chr1_p 11259 11268 233 1 p 25598276 25642596 rs2517979 rs28393458 1 233 3 | -------------------------------------------------------------------------------- /example/example_CNV_genotype/data/cnvr_batch.txt: -------------------------------------------------------------------------------- 1 | CNVR_ID outer.start outer.end nCNV chr arm posStart posEnd start_snp end_snp nPeak Freq batch 2 | CNVR_79_r1_chr1_p 11259 11268 326 1 p 25598276 25642596 rs2517979 rs28393458 1 233 2 3 | -------------------------------------------------------------------------------- /example/example_boundary_refinement/results/res_refine/chr2/data/CNVR_refine_chr_2_detail.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_boundary_refinement/results/res_refine/chr2/data/CNVR_refine_chr_2_detail.rds -------------------------------------------------------------------------------- /example/example_boundary_refinement/results/res_refine/chr2/png/CNVR_163_r1_chr2_q_boundary_refinement.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_boundary_refinement/results/res_refine/chr2/png/CNVR_163_r1_chr2_q_boundary_refinement.png -------------------------------------------------------------------------------- /example/example_boundary_refinement/results/cnvr_refine.txt: -------------------------------------------------------------------------------- 1 | CNVR_ID outer.start outer.end nCNV chr arm posStart posEnd start_snp end_snp nPeak raw_Freq batch genotype Freq 2 | CNVR_163_r1_chr2_q 73662 73669 2 2 q 242929164 242971871 rs13432136 rs13431327 1 2 1 1 79 3 | -------------------------------------------------------------------------------- /01_initial_call/run_iPattern/ref_files_hg19/pq.txt: -------------------------------------------------------------------------------- 1 | 1 125000000 2 | 2 93300000 3 | 3 91000000 4 | 4 50400000 5 | 5 48400000 6 | 6 61000000 7 | 7 59900000 8 | 8 45600000 9 | 9 49000000 10 | 10 40200000 11 | 11 53700000 12 | 12 35800000 13 | 13 17900000 14 | 14 17600000 15 | 15 19000000 16 | 16 36600000 17 | 17 24000000 18 | 18 17200000 19 | 19 26500000 20 | 20 27500000 21 | 21 13200000 22 | 22 14700000 23 | X 60600000 24 | Y 12500000 25 | -------------------------------------------------------------------------------- /example/example_create_CNVR/data/centromere_hg19.txt: -------------------------------------------------------------------------------- 1 | chr position 2 | 1 125000000 3 | 2 93300000 4 | 3 91000000 5 | 4 50400000 6 | 5 48400000 7 | 6 61000000 8 | 7 59900000 9 | 8 45600000 10 | 9 49000000 11 | 10 40200000 12 | 11 53700000 13 | 12 35800000 14 | 13 17900000 15 | 14 17600000 16 | 15 19000000 17 | 16 36600000 18 | 17 24000000 19 | 18 17200000 20 | 19 26500000 21 | 20 27500000 22 | 21 13200000 23 | 22 14700000 24 | X 60600000 25 | Y 12500000 26 | -------------------------------------------------------------------------------- /example/example_boundary_refinement/data/centromere_hg19.txt: -------------------------------------------------------------------------------- 1 | chr position 2 | 1 125000000 3 | 2 93300000 4 | 3 91000000 5 | 4 50400000 6 | 5 48400000 7 | 6 61000000 8 | 7 59900000 9 | 8 45600000 10 | 9 49000000 11 | 10 40200000 12 | 11 53700000 13 | 12 35800000 14 | 13 17900000 15 | 14 17600000 16 | 15 19000000 17 | 16 36600000 18 | 17 24000000 19 | 18 17200000 20 | 19 26500000 21 | 20 27500000 22 | 21 13200000 23 | 22 14700000 24 | X 60600000 25 | Y 12500000 26 | -------------------------------------------------------------------------------- /example/example_create_CNVR/data/CNV.PennCNV_new.txt: -------------------------------------------------------------------------------- 1 | 1 25598276 25638253 1 Sample536.1_46R06C01.txt rs2517979 rs2427759 16.242 7 2 | 1 25598276 25642596 3 Sample353.1_80R07C01.txt rs2517979 rs28393458 11.247 8 3 | 1 25598276 25638253 1 Sample384.1_66R04C01.txt rs2517979 rs2427759 17.075 7 4 | 1 25598276 25642596 1 Samplec82.1_16R05C01.txt rs2517979 rs28393458 12.154 8 5 | 1 25598276 25638253 1 Samplec134.1_8R08C01.txt rs2517979 rs2427759 12.341 7 6 | 1 25598276 25629950 1 Sample292.1_70R08C01.txt rs2517979 exm32925 13.511 5 7 | 1 25598276 25638253 1 Sample541.1_37R04C01.txt rs2517979 rs2427759 12.084 7 8 | -------------------------------------------------------------------------------- /example/example_CNV_genotype/data/duplicate_pairs.txt: -------------------------------------------------------------------------------- 1 | sample1.name sample2.name 2 | Sample167.1_44R03C01 Sample167.1_5R08C01 3 | Sample282.1_27R08C01 Sample282.1_2R03C01 4 | Sample312.1_64R03C01 Sample312.1_80R01C01 5 | Sample402.1_69R08C01 Sample402.1_74R03C01 6 | Sample408.1_15R03C01 Sample408.1_36R08C01 7 | Sample563.1_25R03C01 Sample563.1_56R08C01 8 | Sample713.1_15R08C01 Sample713.1_25R03C01 9 | Samplec134.1_30R03C01 Samplec134.1_8R08C01 10 | Samplec55.1_53R03C01 Samplec55.1_58R08C01 11 | Sample170.1_15R08C01 Sample203.1_18R06C01 12 | Sample697.1_24R05C01 Samplec97.1_10R06C01 13 | SampleY-27.1_1R01C01 SampleY-6.1_6R01C01 14 | -------------------------------------------------------------------------------- /04_CNV_genotype/scripts/fun_plot_heatmap.R: -------------------------------------------------------------------------------- 1 | 2 | # plot heatmap 3 | 4 | plot_heatmap <- function(dt_lrr_heatmap, dt_snps_flag) { 5 | 6 | cor1 <- cor(dt_lrr_heatmap, use = "na.or.complete") 7 | 8 | groups1 <- ifelse(dt_snps_flag$snp_flag == 0, "snps_add", "snps_raw") 9 | groups2 <- ifelse(dt_snps_flag$snp_flag == 2, "inner_boundary", "outer_boundary") 10 | annotation_col1 <- data.frame( 11 | group1 = groups1, 12 | group2 = groups2 13 | ) 14 | rownames(annotation_col1) <- colnames(dt_lrr_heatmap) 15 | 16 | pheatmap(cor1, 17 | cluster_cols = FALSE, 18 | cluster_rows = FALSE, 19 | annotation_col = annotation_col1) 20 | 21 | } 22 | -------------------------------------------------------------------------------- /example/example_create_CNVR/results/cnv.penncnv.txt: -------------------------------------------------------------------------------- 1 | chr posStart posEnd CN Sample_ID conf numSNP avgConf length CNV_type method 2 | 1 25598276 25638253 1 Sample536.1_46R06C01 16.242 7 2.32028571428571 39978 Loss PennCNV 3 | 1 25598276 25642596 3 Sample353.1_80R07C01 11.247 8 1.405875 44321 Gain PennCNV 4 | 1 25598276 25638253 1 Sample384.1_66R04C01 17.075 7 2.43928571428571 39978 Loss PennCNV 5 | 1 25598276 25642596 1 Samplec82.1_16R05C01 12.154 8 1.51925 44321 Loss PennCNV 6 | 1 25598276 25638253 1 Samplec134.1_8R08C01 12.341 7 1.763 39978 Loss PennCNV 7 | 1 25598276 25629950 1 Sample292.1_70R08C01 13.511 5 2.7022 31675 Loss PennCNV 8 | 1 25598276 25638253 1 Sample541.1_37R04C01 12.084 7 1.72628571428571 39978 Loss PennCNV 9 | -------------------------------------------------------------------------------- /02_batch_effect/PCA_on_LRR/step.1.down.sampling.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # generate the list of 100000 randomly selected SNPs 4 | # Input is SNP_pos.txt 5 | # including 3 columns: Name, Chr, Position 6 | 7 | suppressMessages({ 8 | require( data.table, quietly = TRUE) 9 | }) 10 | 11 | args <- commandArgs( trailingOnly = TRUE ) 12 | file_snps <- args[1] ## SNP_pos.txt 13 | path_output <- args[2] ## path to save randomly selected SNPs 14 | 15 | ## sampleing from chr: 1-22 16 | dat_snps <- fread( input = file_snps ) 17 | dat_snps <- as.data.frame(dat_snps, stringsAsFactors = FALSE) 18 | dat_snps <- subset(dat_snps, Chr %in% 1:22) 19 | 20 | snps <- sample( dat_snps$Name ) 21 | snps.selected <- snps[ 1:100000 ] 22 | 23 | write.table(snps.selected, file = file.path(path_output, "snps.down.sample.txt"), 24 | sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) 25 | -------------------------------------------------------------------------------- /01_initial_call/run_QuantiSNP/step.3.combine.QuantiSNP.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ## The script was used to run QuantiSNP on Minerva high performance cluster. 4 | ## You need to modifiy it according to the system you are using if you would like to use it. 5 | ## Please refer to original QuantiSNP documents (https://sites.google.com/site/quantisnp/) for more information 6 | 7 | use strict; 8 | use Getopt::Long; 9 | 10 | my $in_dir=""; ## input directory 11 | my $out_dir=""; ## output directory 12 | 13 | GetOptions("in_dir=s" => \$in_dir, 14 | "out_dir=s" => \$out_dir); 15 | 16 | my $out_file=$out_dir."/quantisnp.cnv"; 17 | 18 | opendir(DIR, $in_dir) or die "cannot open $in_dir: $!"; ## 2018-12-11 19 | open(OUT1, ">", $out_file) or die $!; 20 | 21 | my $flag = 1; 22 | while (defined(my $folder = readdir(DIR))) { 23 | 24 | next if ($folder=~/^\./); 25 | ##next if ($folder=~/^INTERNAL/); 26 | ##next if ($folder=~/^CONTROL/); 27 | 28 | my $filename=$folder.".cnv"; ## $folder is Sample ID 29 | my $file=$in_dir."/".$folder."/".$filename; 30 | 31 | #print "$flag", "$file\n"; 32 | open(IN1, "<$file") or die $!; 33 | while (my $line=) { 34 | next if ($line=~/^Sample/); 35 | print OUT1 $line; 36 | } 37 | 38 | close IN1; 39 | $flag = $flag + 1; 40 | } 41 | 42 | close OUT1; 43 | 44 | print "Analysis completed!\n"; 45 | -------------------------------------------------------------------------------- /example/example_boundary_refinement/data/SNP_pos.txt: -------------------------------------------------------------------------------- 1 | Name Chr Position exm2014254 2 242756160 exm2269245 2 242918203 exm285145 2 242743532 exm285162 2 242755734 exm285169 2 242755776 exm285173 2 242755877 exm285174 2 242755880 exm285179 2 242756152 exm285195 2 242756338 exm285206 2 242757460 exm285208 2 242757466 exm285210 2 242757494 exm285218 2 242757680 exm285235 2 242758176 exm285237 2 242758187 exm285245 2 242758210 exm285249 2 242758301 exm285263 2 242793287 exm285264 2 242793290 exm285267 2 242793362 exm285272 2 242793433 exm285283 2 242794356 exm285293 2 242794796 exm285299 2 242794902 exm285320 2 242811915 exm285321 2 242811935 exm285323 2 242811961 exm285334 2 242813886 exm285359 2 242814360 exm285376 2 242814536 exm285384 2 242814639 exm285386 2 242814702 exm285387 2 242814705 exm285391 2 242814758 exm285404 2 242814983 exm285409 2 242815059 exm285418 2 242815157 exm285422 2 242815175 kgp14294579 2 242937311 kgp14399551 2 243017485 rs12468297 2 242996474 rs12469535 2 243044147 rs12472007 2 242932909 rs12620346 2 242962791 rs12987998 2 242917734 rs13390284 2 242919764 rs13431327 2 242971871 rs13432136 2 242929164 rs28528975 2 242750743 rs35399295 2 242795942 rs3892357 2 242763542 rs3934981 2 242926381 rs3934982 2 242926558 rs4072221 2 242809415 rs4973649 2 243034519 rs4973686 2 243020723 rs6605267 2 242824974 rs6712567 2 242929233 rs6737774 2 242918157 rs6737791 2 242918203 rs6740738 2 243007368 rs7421861 2 242795350 rs7423746 2 242937388 rs7573042 2 242996589 rs7587805 2 242942878 -------------------------------------------------------------------------------- /01_initial_call/run_PennCNV/step.4.combine.PennCNV.res.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | ## The script was used to run PennCNV on Minerva high performance cluster. 4 | ## You need to modifiy it according to the system you are using if you would like to use it. 5 | ## Please refer to original PennCNV documents (http://penncnv.openbioinformatics.org/en/latest/) for more information 6 | 7 | use strict; 8 | use Getopt::Long; 9 | 10 | my $in_dir=""; ## input directory 11 | my $out_dir=""; ## output directory 12 | 13 | GetOptions("in_dir=s" => \$in_dir, 14 | "out_dir=s" => \$out_dir); 15 | 16 | my $out_file=$out_dir."/"."CNV.PennCNV.rawcnv"; 17 | my $out_log=$out_dir."/"."CNV.PennCNV.log"; 18 | 19 | opendir(DIR, $in_dir) or die "cannot open $in_dir: $!"; ## 2018-12-10 20 | open(OUT1, ">", $out_file) or die $!; 21 | open(OUT2, ">", $out_log) or die $!; 22 | 23 | while (defined(my $folder = readdir(DIR))) { 24 | 25 | next if ($folder=~/^\./); 26 | 27 | my $filename=$folder.".rawcnv"; 28 | my $logname=$folder.".log"; 29 | my $file=$in_dir."/".$folder."/".$filename; 30 | my $logfile=$in_dir."/".$folder."/".$logname; 31 | ## print "$file\n"; 32 | open(IN1, "<$file") or die $!; 33 | open(IN2, "<$logfile") or die $!; 34 | while (my $line=) { 35 | print OUT1 $line; 36 | } 37 | 38 | while (my $line=) { 39 | print OUT2 $line; 40 | } 41 | 42 | close IN1; 43 | close IN2; 44 | } 45 | 46 | close OUT1; 47 | close OUT2; 48 | 49 | print "Analysis completed!\n"; 50 | -------------------------------------------------------------------------------- /05_boundary_refinement/refine.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | //[[Rcpp::depends(RcppArmadillo)]] 4 | 5 | using namespace Rcpp; 6 | 7 | //[[Rcpp::export]] 8 | List refine_step1(arma::mat Yt, int min_len = 5) { 9 | 10 | int n = Yt.n_cols; 11 | arma::mat stat1(n, n); stat1.zeros(); 12 | double max_value = - arma::datum::inf; 13 | int max_l = 0; int max_r = 0; 14 | 15 | arma::mat Mcorr = arma::cor(Yt); 16 | double sumS = arma::accu(arma::trimatl(Mcorr)) - n; 17 | int n2 = (1+n)*n/2 - n; 18 | double tmp = 0; 19 | for (int i=0; i<=(n-1); i++) { 20 | int k = i+min_len-1; 21 | for (int j=k; j<=(n-1); j++) { 22 | 23 | arma::mat M = Mcorr.submat(i, i, j, j); 24 | arma::mat x = arma::trimatl(M); 25 | int xcol = x.n_cols; 26 | int n1 = (1+xcol)*xcol/2 - xcol; 27 | double xsum = arma::accu(x) - xcol; 28 | double xmean = xsum/n1; 29 | 30 | if (n1 == n2){ 31 | tmp = xmean/sqrt(1/double(n1)); 32 | } else { 33 | 34 | tmp = (xmean - (sumS - xsum)/double(n2-n1))/sqrt(1/double(n1)+1/double(n2-n1)); 35 | } 36 | stat1(i,j) = tmp; 37 | 38 | if (tmp > max_value) { 39 | max_value = tmp; 40 | max_l = i+1; 41 | max_r = j+1; 42 | } 43 | } 44 | 45 | } 46 | 47 | return List::create( 48 | _["max.value"] = max_value, 49 | _["max.l"] = max_l, 50 | _["max.r"] = max_r 51 | ); 52 | } 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /example/example_CNV_genotype/README.md: -------------------------------------------------------------------------------- 1 | ## Example: CNV genotyping 2 | 3 | Here is a demo of the main script `CNV.genotype.one.chr.one.batch.R` for [CNV genotyping](https://github.com/HaoKeLab/ensembleCNV#4-cnv-genotyping-for-each-cnvr) using example data from one CNVR. 4 | 5 | Please specify where the git clone of ensembleCNV is located. 6 | ```sh 7 | ENSEMBLECNV= 8 | ``` 9 | 10 | Then run the following code for a demo of the main script `CNV.genotype.one.chr.one.batch.R` for CNV genotyping. 11 | ```sh 12 | Rscript ${ENSEMBLECNV}/04_CNV_genotype/CNV.genotype.one.chr.one.batch.R \ 13 | --chr 1 \ 14 | --batch 2 \ 15 | --type 0 \ 16 | --sourcefile ${ENSEMBLECNV}/04_CNV_genotype/scripts/ \ 17 | --datapath ${ENSEMBLECNV}/example/example_CNV_genotype/data \ 18 | --matrixpath ${ENSEMBLECNV}/example/example_CNV_genotype/RDS \ 19 | --resultpath ${ENSEMBLECNV}/example/example_CNV_genotype/results \ 20 | --duplicates \ 21 | --plot 22 | ``` 23 | 24 | Note: When the analysis is successfully completed, in the directory `${path_ensembleCNV}/example/example_CNV_genotype/results`, you will find similar directory structure and outputs as in a real project. In particular, 25 | 26 | - in the subfolders of the `pred` folder, you will find `*_pred.rds`, each corresponding to the CN genotype and GQ score for a CNVR. They are stored in `.rds` format in order to save space and improve I/O time. 27 | 28 | - in the subfolders of the `png` folder, you will find different diagnosis plots for each CNVR. 29 | -------------------------------------------------------------------------------- /example/example_boundary_refinement/README.md: -------------------------------------------------------------------------------- 1 | ## Example: Boundary refinement 2 | 3 | Here is a demo of the main script `CNVR.boundary.refinement.R` for [boundary refinement](https://github.com/HaoKeLab/ensembleCNV#5-boundary-refinement) using example data from one CNVR. 4 | 5 | Please specify where the git clone of ensembleCNV is located. 6 | ```sh 7 | ENSEMBLECNV= 8 | ``` 9 | 10 | Then run the following code for a demo of the main script `CNVR.boundary.refinement.R` for boundary refinement. 11 | ```sh 12 | Rscript ${ENSEMBLECNV}/05_boundary_refinement/CNVR.boundary.refinement.R \ 13 | --chr 2 \ 14 | --rcppfile ${ENSEMBLECNV}/05_boundary_refinement/refine.cpp \ 15 | --datapath ${ENSEMBLECNV}/example/example_boundary_refinement/data \ 16 | --matrixpath ${ENSEMBLECNV}/example/example_boundary_refinement/RDS \ 17 | --centromere ${ENSEMBLECNV}/example/example_boundary_refinement/data/centromere_hg19.txt \ 18 | --resultpath ${ENSEMBLECNV}/example/example_boundary_refinement/results \ 19 | --plot 20 | ``` 21 | 22 | Note: 23 | 24 | - When the analysis is successfully completed, the output will be stored at the directory `${ENSEMBLECNV}/example/example_boundary_refinement/results/res_refine`. 25 | 26 | - In practice, the list of common CNVRs in `cnvr_refine.txt`, whose boundaries are to be refined, is selected by the step `${ENSEMBLECNV}/05_boundary_refinement/step.1.common.CNVR.to.refine.R` based on frequency cut-off specified by the user, before boundary refinement is actually performed (see step (1) of [boundary refinement](https://github.com/HaoKeLab/ensembleCNV#5-boundary-refinement)). Therefore, `cnvr_refine.txt` is supposed to appear in the directory `${ENSEMBLECNV}/example/example_boundary_refinement/results` (instead of the `data` folder) as input for subsequent `CNVR.boundary.refinement.R`. 27 | -------------------------------------------------------------------------------- /create_new_project.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## create new project 4 | wkdir=$1 5 | 6 | ## create working directory 7 | mkdir -p $wkdir 8 | 9 | ## data: final report, sample table, centromere position, and duplicate pairs [optional] 10 | ## put in this directory 11 | mkdir -p ${wkdir}/data 12 | 13 | ## 01_initial_call 14 | cp -ru ./01_initial_call $wkdir 15 | mkdir -p ${wkdir}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/RDS 16 | 17 | mkdir -p ${wkdir}/01_initial_call/run_iPattern/data 18 | mkdir -p ${wkdir}/01_initial_call/run_iPattern/data_aux 19 | mkdir -p ${wkdir}/01_initial_call/run_iPattern/results 20 | 21 | mkdir -p ${wkdir}/01_initial_call/run_PennCNV/data 22 | mkdir -p ${wkdir}/01_initial_call/run_PennCNV/data_aux 23 | mkdir -p ${wkdir}/01_initial_call/run_PennCNV/results 24 | 25 | mkdir -p ${wkdir}/01_initial_call/run_QuantiSNP/data 26 | mkdir -p ${wkdir}/01_initial_call/run_QuantiSNP/results 27 | mkdir -p ${wkdir}/01_initial_call/run_QuantiSNP/results/res 28 | 29 | ## 02_batch_effect 30 | cp -ru ./02_batch_effect $wkdir 31 | 32 | ## 03_create_CNVR 33 | cp -ru ./03_create_CNVR $wkdir 34 | 35 | ## 04_CNV_genotype 36 | cp -ru ./04_CNV_genotype $wkdir 37 | mkdir -p ${wkdir}/04_CNV_genotype/data 38 | mkdir -p ${wkdir}/04_CNV_genotype/results 39 | 40 | ## 05_boundary_refinement 41 | cp -ru ./05_boundary_refinement $wkdir 42 | mkdir -p ${wkdir}/05_boundary_refinement/data 43 | mkdir -p ${wkdir}/05_boundary_refinement/results 44 | 45 | ## 05a_regenotype_after_refinement 46 | mkdir -p ${wkdir}/05a_regenotype_after_refinement 47 | mkdir -p ${wkdir}/05a_regenotype_after_refinement/data 48 | mkdir -p ${wkdir}/05a_regenotype_after_refinement/results 49 | 50 | ## 06_performance_assessment 51 | cp -ru ./06_performance_assessment $wkdir 52 | 53 | echo "New project directory has been created at: $wkdir" 54 | echo "Please put (or create symbolic link to) input data in the directory: $wkdir/data" 55 | 56 | -------------------------------------------------------------------------------- /04_CNV_genotype/scripts/fun_gatk.R: -------------------------------------------------------------------------------- 1 | 2 | output_gatk_result_LRR <- function(dt_LRRBAF) { 3 | 4 | dt_LRRBAF$LRRBAF0 <- -10*log(dt_LRRBAF$LRR0)/log(10) 5 | dt_LRRBAF$LRRBAF1 <- -10*log(dt_LRRBAF$LRR1)/log(10) ## save V1 6 | dt_LRRBAF$LRRBAF2 <- -10*log(dt_LRRBAF$LRR2)/log(10) 7 | dt_LRRBAF$LRRBAF3 <- -10*log(dt_LRRBAF$LRR3)/log(10) 8 | 9 | ## must be deal with 10 | 11 | dt_sub <- dt_LRRBAF[, c("LRRBAF0", "LRRBAF1", "LRRBAF2", "LRRBAF3")] 12 | 13 | value_GQs <- unlist(lapply(1:nrow(dt_sub), FUN = function(k) { 14 | v1 <- unlist(dt_sub[k, ]) 15 | v1 <- sort(v1) 16 | gq1 <- v1[2] - v1[1] 17 | gq1 18 | })) 19 | 20 | # mean(GQs) 21 | dt_LRRBAF$value_GQ <- value_GQs 22 | 23 | CN_gatk_preds <- unlist(lapply(1:nrow(dt_sub), FUN = function(k) { 24 | v1 <- unlist(dt_sub[k, ]) 25 | idx1 <- which.min(v1) 26 | return(idx1 - 1) 27 | })) 28 | 29 | dt_LRRBAF$CN_gatk_pred <- CN_gatk_preds 30 | 31 | ## add BAF0 1 2 3 column 32 | dt_LRRBAF$BAF0 = NA 33 | dt_LRRBAF$BAF1 = NA 34 | dt_LRRBAF$BAF2 = NA 35 | dt_LRRBAF$BAF3 = NA 36 | 37 | dt_LRRBAF # return result 38 | } 39 | 40 | # calculate gatk result 41 | output_gatk_result <- function(dt_LRRBAF) { 42 | 43 | dt_LRRBAF$LRRBAF0 <- -10*log(dt_LRRBAF$LRR0*dt_LRRBAF$BAF0)/log(10) 44 | dt_LRRBAF$LRRBAF1 <- -10*log(dt_LRRBAF$LRR1*dt_LRRBAF$BAF1)/log(10) ## save V1 45 | dt_LRRBAF$LRRBAF2 <- -10*log(dt_LRRBAF$LRR2*dt_LRRBAF$BAF2)/log(10) 46 | dt_LRRBAF$LRRBAF3 <- -10*log(dt_LRRBAF$LRR3*dt_LRRBAF$BAF3)/log(10) 47 | 48 | dt_sub <- dt_LRRBAF[, c("LRRBAF0", "LRRBAF1", "LRRBAF2", "LRRBAF3")] 49 | 50 | value_GQs <- unlist(lapply(1:nrow(dt_sub), FUN = function(k) { 51 | v1 <- unlist(dt_sub[k, ]) 52 | v1 <- sort(v1) 53 | gq1 <- v1[2] - v1[1] 54 | gq1 55 | })) 56 | 57 | # mean(GQs) 58 | dt_LRRBAF$value_GQ <- value_GQs 59 | 60 | CN_gatk_preds <- unlist(lapply(1:nrow(dt_sub), FUN = function(k) { 61 | v1 <- unlist(dt_sub[k, ]) 62 | idx1 <- which.min(v1) 63 | return(idx1 - 1) 64 | })) 65 | 66 | dt_LRRBAF$CN_gatk_pred <- CN_gatk_preds 67 | 68 | dt_LRRBAF # return result 69 | } 70 | -------------------------------------------------------------------------------- /02_batch_effect/PCA_on_summary_stats/step.2.stats.PCA.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscipt 2 | 3 | args <- commandArgs( trailingOnly = TRUE ) 4 | wk_dir <- args[1] ## path to IPQ.stats.txt generated in step (1) 5 | 6 | suppressMessages({ 7 | require(ggplot2) 8 | require(cowplot) 9 | }) 10 | 11 | # PCA -------------------------------------------------------------------- 12 | 13 | dat <- read.delim(file = file.path(wk_dir, "IPQ.stats.txt"), as.is = TRUE) 14 | 15 | idx1 <- which( names(dat) == "Sample_ID" ) 16 | dat_pca <- dat[, -idx1] 17 | mat <- as.matrix(dat_pca) 18 | rownames(mat) <- dat$Sample_ID 19 | 20 | PCA <- prcomp(mat, scale. = TRUE) 21 | PC <- predict(PCA) 22 | 23 | PC <- data.frame(Sample_ID = rownames(PC), 24 | PC, 25 | stringsAsFactors = FALSE) 26 | 27 | write.table(PC, file = file.path(wk_dir, "IPQ_stats_PCA_res.txt"), 28 | quote = F, row.names = F, sep = "\t") 29 | 30 | p12 <- ggplot() + 31 | geom_point(data = PC, aes(PC1, PC2), shape = 1, size = 3) + 32 | xlab("PC1") + 33 | ylab("PC2") + 34 | theme_bw(base_size = 9)+ 35 | theme(axis.text = element_text(size = 15), 36 | axis.title = element_text(size = 15), 37 | plot.title = element_text(size = 20, hjust = 0.5)) + 38 | ggtitle("PC2 ~ PC1") 39 | 40 | p13 <- ggplot() + 41 | geom_point(data = PC, aes(PC1, PC3), shape = 1, size = 3) + 42 | xlab("PC1") + 43 | ylab("PC3") + 44 | theme_bw(base_size = 9)+ 45 | theme(axis.text = element_text(size = 15), 46 | axis.title = element_text(size = 15), 47 | plot.title = element_text(size = 20, hjust = 0.5)) + 48 | ggtitle("PC3 ~ PC1") 49 | 50 | p23 <- ggplot() + 51 | geom_point(data = PC, aes(PC2, PC3), shape = 1, size = 3) + 52 | xlab("PC2") + 53 | ylab("PC3") + 54 | theme_bw(base_size = 9)+ 55 | theme(axis.text = element_text(size = 15), 56 | axis.title = element_text(size = 15), 57 | plot.title = element_text(size = 20, hjust = 0.5)) + 58 | ggtitle("PC3 ~ PC2") 59 | 60 | png(filename = file.path(wk_dir, "IPQ_stats_PCA_plots.png"), 61 | width = 12, height = 12, units = "in", res = 512) 62 | p <- plot_grid(p12, p13, p23, nrow = 2, labels = LETTERS[1:3]) 63 | print(p) 64 | dev.off() 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /04_CNV_genotype/scripts/fun_plot_diagnosis.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | library(gridExtra) 4 | 5 | plot_model <- function(paras, dt_cnvr, title) { 6 | 7 | mu1 <- paras$mu[1] 8 | sigma1 <- paras$sigma[1] 9 | lambda1 <- paras$lambda[1] 10 | 11 | mu2 <- paras$mu[2] 12 | sigma2 <- paras$sigma[2] 13 | lambda2 <- paras$lambda[2] 14 | 15 | mu3 <- paras$mu[3] 16 | sigma3 <- paras$sigma[3] 17 | lambda3 <- paras$lambda[3] 18 | 19 | 20 | x <- dt_cnvr$LRR_median 21 | range_x <- range(x) 22 | 23 | xs <- seq(range_x[1], range_x[2], length.out = 800) 24 | dt <- data.frame(x = xs, stringsAsFactors = F) 25 | 26 | dt1 <- data.frame(x = xs, d = lambda1*dnorm(xs, mean = mu1, sd = sigma1), CN = 1) 27 | 28 | dt3 <- data.frame(x = xs, d = lambda3*dnorm(xs, mean = mu3, sd = sigma3), CN = 3) 29 | 30 | dt2 <- data.frame(x = xs, d = lambda2*dnorm(xs, mean = mu2, sd = sigma2), CN = 2) 31 | dt123 <- rbind(dt1, dt2, dt3) 32 | dt123$CN <- as.factor(dt123$CN) 33 | 34 | 35 | p <- ggplot(data = dt_cnvr, aes(LRR_median, y = ..density..)) + 36 | geom_histogram(bins = 50, fill = NA, color = "black") + 37 | geom_line(data = dt123, aes(x, d, col = CN), lwd = 1.5) + 38 | theme_bw(base_size = 10) + 39 | labs(title = title, 40 | subtitle = paste("mu1:", round(mu1, 2), "mu2:", round(mu2, 2), "mu3:", round(mu3, 2), "\n", 41 | "sd1:", round(sigma1, 2), "sd2:", round(sigma2, 2), "sd3:", round(sigma3, 2))) 42 | p 43 | } 44 | 45 | plot_gmm_diagnosis <- function(dt_cnvr, paras_model) { 46 | 47 | paras_stage1 <- paras_model$stage1 48 | paras_stage1_init <- paras_stage1$init 49 | paras_stage1_model <- paras_stage1$model 50 | 51 | paras_stage2 <- paras_model$stage2 52 | paras_stage2_init <- paras_stage2$init 53 | paras_stage2_model <- paras_stage2$model 54 | 55 | # plot 56 | p1 <- plot_model(paras = paras_stage1_init, dt_cnvr = dt_cnvr, title = "stage1 init") 57 | p2 <- plot_model(paras = paras_stage1_model, dt_cnvr = dt_cnvr, title = "stage1 model") 58 | 59 | p3 <- plot_model(paras = paras_stage2_init, dt_cnvr = dt_cnvr, title = "stage2 init") 60 | p4 <- plot_model(paras = paras_stage2_model, dt_cnvr = dt_cnvr, title = "stage2 model") 61 | 62 | ps <- gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2) 63 | return(ps) 64 | } -------------------------------------------------------------------------------- /example/example_create_CNVR/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Example: Boundary refinement 3 | 4 | Here is a demo of [creating CNVR](https://github.com/HaoKeLab/ensembleCNV#3-create-cnvr) using example data of CNVs clumping around one CNVR. 5 | 6 | Please specify where the git clone of ensembleCNV is located. 7 | ```sh 8 | ENSEMBLECNV= 9 | ``` 10 | 11 | Step 1: reformat the CNV calls generated from individual CNV caller: iPattern, PennCNV and QuantiSNP 12 | ```sh 13 | Rscript ${ENSEMBLECNV}/03_create_CNVR/step.1.CNV.data.R \ 14 | ${ENSEMBLECNV}/example/example_create_CNVR/results/ \ 15 | ${ENSEMBLECNV}/example/example_create_CNVR/data/iPattern_all_calls.txt \ 16 | ${ENSEMBLECNV}/example/example_create_CNVR/data/CNV.PennCNV_new.txt \ 17 | ${ENSEMBLECNV}/example/example_create_CNVR/data/quantisnp.cnv \ 18 | ${ENSEMBLECNV}/example/example_create_CNVR/data/Samples_Table.txt 19 | ``` 20 | Note: 21 | 22 | - `iPattern_all_calls.txt`, `CNV.PennCNV_new.txt`, and `quantisnp.cnv` are examples of what raw CNV calls generated from iPattern, PennCNV and QuantiSNP look like. 23 | 24 | - We do not include `Gender` column in `Samples_Table.txt` as gender information is not relevant for creating CNVR in this example. 25 | 26 | - When this step is successfully completed, you will find in `${ENSEMBLECNV}/example/example_create_CNVR/results/` directory `cnv.ipattern.txt`, `cnv.penncnv.txt`, and `cnv.quantisnp.txt`, which are reformated CNV calls from the 3 CNV callers. 27 | 28 | Step 2: create CNVR 29 | ```sh 30 | Rscript ${ENSEMBLECNV}/03_create_CNVR/step.2.create.CNVR.R \ 31 | --icnv ${ENSEMBLECNV}/example/example_create_CNVR/results/cnv.ipattern.txt \ 32 | --pcnv ${ENSEMBLECNV}/example/example_create_CNVR/results/cnv.penncnv.txt \ 33 | --qcnv ${ENSEMBLECNV}/example/example_create_CNVR/results/cnv.quantisnp.txt \ 34 | --snp ${ENSEMBLECNV}/example/example_create_CNVR/data/SNP_pos.txt \ 35 | --centromere ${ENSEMBLECNV}/example/example_create_CNVR/data/centromere_hg19.txt \ 36 | --output ${ENSEMBLECNV}/example/example_create_CNVR/results/ 37 | ``` 38 | 39 | Note: When this step is successfully completed, you will find some intermediate outputs and the final results in the directory `${ENSEMBLECNV}/example/example_create_CNVR/results/`, including 40 | - `cnv_clean.txt`: the table of merged CNV events from iPattern, PennCNV and QuantiSNP; the `CNVR_ID` in the table indicates which CNVR each CNV belongs to. 41 | - `cnvr_clean.txt`: the table of constructed CNVRs with each assigned a `CNVR_ID`. 42 | -------------------------------------------------------------------------------- /04_CNV_genotype/step.1.split.cnvrs.into.batches.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript --vanilla 2 | 3 | ## Split the list of created CNVRs in each chromosome into batches, such that 4 | ## the CNVRs can be processed in parallel. 5 | 6 | suppressMessages(require(optparse)) 7 | 8 | option_list = list( 9 | make_option(c("-i", "--input"), action = "store", type = "character", default = NA, 10 | help = "CNVR dataset input"), 11 | make_option(c("-o", "--output"), action = "store", type = "character", default = NA, 12 | help = "CNVR dataset output"), 13 | make_option(c("-c", "--cnv"), action = "store", type = "character", default = NA, 14 | help = "CNV after cleaning dataset"), 15 | make_option(c("-n", "--num"), action = "store", type = "integer", default = 200, 16 | help = "number of CNVRs in each batch") 17 | ) 18 | 19 | opt = parse_args( OptionParser(option_list = option_list) ) 20 | 21 | pars = c(opt$input, opt$output, opt$num) ##opt$cnv 22 | if ( any(is.na(pars)) ) { 23 | stop("All parameter must be supplied.(--help for detail)") 24 | } 25 | 26 | 27 | # main ------------------------------------------------------------------- 28 | 29 | dt_cnvr = read.delim( file = opt$input, as.is = TRUE ) 30 | n_cnvr = nrow(dt_cnvr) 31 | 32 | dt_cnvr = dt_cnvr[order(dt_cnvr$chr, dt_cnvr$posStart, dt_cnvr$posEnd), ] 33 | 34 | cat('total cnvr number:', n_cnvr, "\n") 35 | 36 | number_each_batch = as.integer( opt$num ) ## 200 default 37 | 38 | # add raw Freq information (This has been done in create CNVR step?) 39 | # dt_cnv = readRDS(file = opt$cnv) 40 | # nrow(dt_cnv) 41 | # tbl <- table(dt_cnv$CNVR_ID) 42 | # freqs <- as.vector(tbl) 43 | # dt_freq <- data.frame(CNVR_ID = names(tbl), Freq = freqs, stringsAsFactors = FALSE) 44 | 45 | # dt_cnvr <- merge(dt_cnvr, dt_freq, by = "CNVR_ID") 46 | # stopifnot( nrow(dt_cnvr) == n_cnvr) 47 | 48 | # split batches in each chr 49 | chrs <- sort(unique(dt_cnvr$chr)) 50 | 51 | dt_cnvr_new <- data.frame() 52 | for (chr1 in chrs) { 53 | 54 | dt_cnvr1 <- subset(dt_cnvr, chr == chr1) 55 | idxs_batch <- 1:nrow(dt_cnvr1) 56 | 57 | n1 <- nrow(dt_cnvr1) 58 | n2 <- ceiling(n1/number_each_batch) 59 | 60 | cat("chr:", chr1, "number of cnvrs:", n1, "\n") 61 | if (n2 == 1) { 62 | 63 | dt_cnvr1$batch <- 1 64 | dt_cnvr_new <- rbind(dt_cnvr_new, dt_cnvr1) 65 | 66 | } else { 67 | 68 | cuts <- cut(idxs_batch, breaks = n2, include.lowest = TRUE) 69 | cuts_index <- as.integer(cuts) 70 | dt_cnvr1$batch <- cuts_index 71 | 72 | dt_cnvr_new <- rbind(dt_cnvr_new, dt_cnvr1) 73 | } 74 | 75 | } 76 | 77 | write.table(dt_cnvr_new, 78 | file = opt$output, 79 | quote = F, row.names = F, sep = "\t") 80 | 81 | 82 | -------------------------------------------------------------------------------- /02_batch_effect/PCA_on_LRR/step.3.LRR.pca.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## PCA 4 | args <- commandArgs( trailingOnly = TRUE ) 5 | wk_dir <- args[1] ## working directory where the LRR matrix is located for PCA 6 | filename_matrix <- args[2] ## the LRR matrix generated in step 2 7 | 8 | suppressMessages({ 9 | require(data.table) 10 | require(tibble) 11 | require(cowplot) 12 | require(ggplot2) 13 | }) 14 | 15 | 16 | dat_LRR <- fread(input = file.path( wk_dir, filename_matrix) ) 17 | dat_LRR <- as.data.frame(dat_LRR, stringsAsFactors = FALSE) 18 | dat_LRR <- column_to_rownames(dat_LRR, var = "V1") 19 | 20 | sampleID <- rownames( dat_LRR ) 21 | 22 | ## deal with NA values in matrix 23 | mat <- as.matrix(dat_LRR) 24 | rownames(mat) <- sampleID 25 | colnames(mat) <- NULL 26 | 27 | col_mean <- colMeans(mat, na.rm = TRUE) 28 | for (i in 1:nrow(mat)) { 29 | v1 <- as.vector(mat[i, ]) 30 | idx1 <- which(is.na(v1)) 31 | if (length(idx1) >= 1) { 32 | mat[i, idx1] <- col_mean[idx1] 33 | } 34 | } 35 | 36 | ## check which SNPs with all values being NA 37 | idxs.na.snps <- which( is.na(col_mean) ) 38 | if (length(idxs.na.snps)>0) mat <- mat[, -idxs.na.snps] ##*** 39 | 40 | dat.pca <- as.data.frame( mat ) 41 | rownames(dat.pca) <- sampleID 42 | 43 | PCA <- prcomp(dat.pca) 44 | PC <- predict(PCA) 45 | PC <- data.frame(Sample_ID = rownames(PC), 46 | PC[, c("PC1", "PC2", "PC3")], 47 | stringsAsFactors = FALSE) 48 | 49 | write.table(PC, file = file.path(wk_dir, "LRR_PCA_res.txt"), 50 | quote = F, row.names = F, sep = "\t") 51 | 52 | ## plot PCA results 53 | p12 <- ggplot(data = PC, aes(PC1, PC2)) + 54 | geom_point(size = 1) + 55 | theme_bw() + 56 | theme(plot.title = element_text(size = 20, hjust = 0.5), 57 | axis.title = element_text(size = 15, face = "bold"), 58 | axis.text = element_text(size = 15, face = "bold")) + 59 | ggtitle("PC2 ~ PC1") 60 | 61 | p13 <- ggplot(data = PC, aes(PC1, PC3)) + 62 | geom_point(size = 1) + 63 | theme_bw() + 64 | theme(plot.title = element_text(size = 20, hjust = 0.5), 65 | axis.title = element_text(size = 15, face = "bold"), 66 | axis.text = element_text(size = 15, face = "bold")) + 67 | ggtitle("PC3 ~ PC1") 68 | 69 | p23 <- ggplot(data = PC, aes(PC2, PC3)) + 70 | geom_point(size = 1) + 71 | theme_bw() + 72 | theme(plot.title = element_text(size = 20, hjust = 0.5), 73 | axis.title = element_text(size = 15, face = "bold"), 74 | axis.text = element_text(size = 15, face = "bold")) + 75 | ggtitle("PC3 ~ PC2") 76 | 77 | png(filename = file.path(wk_dir, "LRR_PCA_plots.png"), 78 | width = 12, height = 12, units = "in", res = 512) 79 | p <- plot_grid(p12, p13, p23, nrow = 2) 80 | print(p) 81 | dev.off() 82 | 83 | 84 | -------------------------------------------------------------------------------- /01_initial_call/run_iPattern/prepare_input_files_for_iPattern.R: -------------------------------------------------------------------------------- 1 | #!/urs/bin/env Rscript 2 | 3 | ## The script was used to prepare auxiliary input files for iPattern 4 | ## the auxiliary input files will be stored at ${WKDIR}/01_initial_call/run_iPattern/data_aux 5 | 6 | args <- commandArgs( trailingOnly = TRUE ) 7 | 8 | ## working directory 9 | path_wkdir <- args[1] 10 | 11 | ## project name for running iPattern 12 | project_name <- args[2] 13 | 14 | ## path to the directory for running iPattern 15 | path_run_ipattern <- file.path(path_wkdir, "01_initial_call", "run_iPattern") 16 | 17 | ##-------------------------------------------------------------------------------- 18 | ## 1) data_file: list of splitted final report files for each sample 19 | ## the directory contains the input files prepared by finalreport_to_iPattern.pl 20 | path_ipattern_prepare_data <- file.path(path_run_ipattern, "data") 21 | fls_all <- list.files(path = path_ipattern_prepare_data, pattern = ".txt$", full.names = TRUE) 22 | 23 | data_file <- data.frame(data_file = fls_all, stringsAsFactors = FALSE) 24 | write.table( data_file, file = file.path( path_run_ipattern, "data_aux", 25 | paste0(project_name, "_data_file.txt")), 26 | sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) 27 | 28 | 29 | ##-------------------------------------------------------------------------------- 30 | ## 2) gener_file: tab-delimied file which lists geneder information for each sample 31 | ## the file consists of two columns, Sample_ID and Gender, 32 | ## which may be retrieved from Samples_Table.txt (see Data section of ensembleCNV README.md) 33 | ## Samples_Table.txt is supposed to be at ${WKDIR}/data 34 | ## the gender_file does NOT have column names in the header, for example 35 | # Sample_1 M 36 | # Sample_2 F 37 | # Sample_3 F 38 | 39 | gender_file <- read.delim(file = file.path(path_wkdir, "data", "Samples_Table.txt"), as.is = TRUE) 40 | gender_file$Gender <- toupper( substr(gender_file$Gender, 1, 1) ) 41 | write.table( gender_file, file = file.path( path_run_ipattern, "data_aux", 42 | paste0(project_name, "_gender_file.txt")), 43 | sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) 44 | 45 | 46 | ##-------------------------------------------------------------------------------- 47 | ## 3) bad_samples: file lists sample IDs of poor quality to be excluded from iPattern analysis, for example 48 | # bad_sample_1 49 | # bad_sample_2 50 | # bad_sample_3 51 | 52 | ## We prepare an empty file. The user can type in bad samples. 53 | write.table(NULL, file = file.path( path_run_ipattern, "data_aux", 54 | paste0(project_name, "_bad_samples.txt")), 55 | sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) 56 | 57 | cat("Processing is completed.\n") 58 | cat("Three files are generated:\n") 59 | cat(file.path( path_run_ipattern, "data_aux", paste0(project_name, "_data_file.txt")), "\n") 60 | cat(file.path( path_run_ipattern, "data_aux", paste0(project_name, "_gender_file.txt")), "\n") 61 | cat(file.path( path_run_ipattern, "data_aux", paste0(project_name, "_bad_samples.txt")), "\n") 62 | 63 | -------------------------------------------------------------------------------- /05_boundary_refinement/step.4.update.genotype.matrix.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages( library(optparse) ) 4 | # before running this script 5 | # you need to regenotype CNVRs in cnvr_regenotype_after_refine.txt 6 | # which are generated from step3.clean.results.R 7 | 8 | option_list <- list( 9 | make_option(c("-b", "--matrixbeforerefine"), action = "store", type = "character", default = NA, 10 | help = "Path to CN and GQ matrices generated in first round of 11 | CNV genotyping step before boundary refinement."), 12 | make_option(c("-f", "--matrixrefine"), action = "store", type = "character", default = NA, 13 | help = "Path to CN and GQ matrices generated in CNV regenotyping for 14 | CNVRs with updated boundaries after refinement as well as CNVR information."), 15 | make_option(c("-p", "--refinepath"), action = "store", type = "character", default = NA, 16 | help = "Path to cnvr_kept_after_refine.txt."), 17 | make_option(c("-o", "--output"), action = "store", type = "character", default = NA, 18 | help = "Path to the directory for saving final CN and GQ matrices") 19 | ) 20 | 21 | opt <- parse_args(OptionParser(option_list = option_list)) 22 | pars <- c(opt$matrixbeforerefine, opt$matrixrefine, 23 | opt$refinepath, opt$output) 24 | if ( any(is.na(pars)) ) { 25 | stop("All parameters must be supplied. (--help for detail)") 26 | } 27 | 28 | path_matrix_before_refine <- opt$matrixbeforerefine 29 | path_matrix_refine <- opt$matrixrefine 30 | path_refine <- opt$refinepath 31 | path_output <- opt$output 32 | 33 | mat_CN_before_refine <- readRDS( file = file.path(path_matrix_before_refine, "matrix_CN.rds")) 34 | mat_GQ_before_refine <- readRDS( file = file.path(path_matrix_before_refine, "matrix_GQ.rds")) 35 | 36 | cnvrs <- rownames( mat_CN_before_refine ) 37 | samples <- colnames( mat_CN_before_refine ) 38 | 39 | # keep cnvrs after refinement 40 | dat_cnvr_keep <- read.delim( file = file.path(path_refine, "cnvr_kept_after_refine.txt"), as.is = TRUE) 41 | 42 | mat_CN_keep <- mat_CN_before_refine[dat_cnvr_keep$CNVR_ID, ] 43 | mat_GQ_keep <- mat_GQ_before_refine[dat_cnvr_keep$CNVR_ID, ] 44 | 45 | # regenotyped CNVRs with updated boundaries ---------------------------------------------- 46 | dat_cnvr_refine <- read.delim( file = file.path(path_matrix_refine, "cnvr_genotype.txt"), as.is = TRUE) 47 | 48 | mat_CN_refine <- readRDS( file = file.path(path_matrix_refine, "matrix_CN.rds")) 49 | mat_GQ_refine <- readRDS( file = file.path(path_matrix_refine, "matrix_GQ.rds")) 50 | 51 | samples_refine <- colnames( mat_CN_refine ) 52 | stopifnot( sum(samples_refine %in% samples) == length(samples)) 53 | 54 | mat_CN_refine <- mat_CN_refine[, samples] 55 | mat_GQ_refine <- mat_GQ_refine[, samples] 56 | 57 | ## final results 58 | mat_CN_final <- rbind( mat_CN_keep, mat_CN_refine ) 59 | mat_GQ_final <- rbind( mat_GQ_keep, mat_GQ_refine ) 60 | 61 | common.cols <- intersect(names(dat_cnvr_keep), names(dat_cnvr_refine)) 62 | common.cols <- setdiff(common.cols, c("batch", "genotype", "identicalID")) 63 | dat_cnvr <- rbind(dat_cnvr_keep[, common.cols], dat_cnvr_refine[, common.cols]) 64 | 65 | saveRDS( mat_CN_final, file = file.path(path_output, "matrix_CN_final.rds")) 66 | saveRDS( mat_GQ_final, file = file.path(path_output, "matrix_GQ_final.rds")) 67 | 68 | write.table(dat_cnvr, 69 | file = file.path(path_output, "cnvr_final.txt"), 70 | quote = F, row.names = F, sep = "\t") 71 | 72 | -------------------------------------------------------------------------------- /05_boundary_refinement/step.1.common.CNVR.to.refine.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(optparse)) 4 | 5 | option_list <- list( 6 | make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA, 7 | help = "Path to the directory containing necessary input data."), 8 | make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA, 9 | help = "Path to the directory for saving results."), 10 | make_option(c("-c", "--freq"), action = "store", type = "double", default = NA, 11 | help = "Frequency cut-off to select CNVRs with common CNVs for boundary refinement.") 12 | ) 13 | 14 | 15 | opt = parse_args(OptionParser(option_list = option_list)) 16 | pars = c(opt$datapath, opt$resultpath, opt$freq) 17 | 18 | if ( any(is.na(pars)) ) { 19 | stop("All three parameters must be supplied. (--help for detail)") 20 | } 21 | 22 | cutoff_freq <- as.numeric( opt$freq ) 23 | path_data <- opt$datapath 24 | path_result <- opt$resultpath 25 | 26 | path_output <- file.path( path_result ) ##"cnvr_refinement" 27 | if (!dir.exists(paths = path_output) ) dir.create(path = path_output, showWarnings = F, recursive = T) 28 | 29 | # the copy number matrix generated from CNV genotyping step 30 | mat_CN <- readRDS( file = file.path(path_data, "matrix_CN.rds")) 31 | n.sample <- ncol( mat_CN ) 32 | n.CNVR <- nrow( mat_CN ) 33 | 34 | cnvrs <- rownames( mat_CN ) 35 | 36 | freqs_CNVR <- unlist( lapply(1:n.CNVR, FUN = function(i) { 37 | v1 <- as.integer( mat_CN[i, ]) 38 | n1 <- sum( v1 %in% c(0, 1, 3)) 39 | n1 40 | })) 41 | 42 | idxs.refine <- which( freqs_CNVR >= n.sample*cutoff_freq) 43 | 44 | dat_freq <- data.frame(CNVR_ID = cnvrs, 45 | Freq = freqs_CNVR, 46 | stringsAsFactors = F) 47 | 48 | if (length(idxs.refine) > 0) { 49 | cnvrs_refine <- cnvrs[ idxs.refine ] 50 | cnvrs_keep <- cnvrs[ -idxs.refine ] 51 | } else { 52 | cnvrs_refine <- NULL 53 | cnvrs_keep <- cnvrs 54 | } 55 | 56 | # write.table( data.frame(CNVR_ID = cnvrs_refine, stringsAsFactors = FALSE), 57 | # file = file.path(path_output, "cnvrs_refine.txt"), 58 | # quote = F, row.names = F, col.names = F, sep = "\t") 59 | # write.table( data.frame(CNVR_ID = cnvrs_keep, stringsAsFactors = FALSE), 60 | # file = file.path(path_output, "cnvrs_keep.txt"), 61 | # quote = F, row.names = F, col.names = F, sep = "\t") 62 | 63 | file_cnvr <- "cnvr_genotype.txt" ## with CNV genotype information 64 | dat_cnvrs <- read.delim(file = file.path(path_data, file_cnvr), as.is = TRUE) 65 | nms <- names(dat_cnvrs) 66 | names(dat_cnvrs)[nms == "Freq"] <- "raw_Freq" 67 | dat_cnvrs <- subset(dat_cnvrs, genotype == 1) 68 | 69 | dat_cnvrs <- merge( dat_cnvrs, dat_freq, by = "CNVR_ID", all = FALSE) 70 | dat_cnvrs <- dat_cnvrs[order(dat_cnvrs$chr, dat_cnvrs$posStart, dat_cnvrs$posEnd), ] 71 | stopifnot( nrow(dat_cnvrs) == nrow(dat_freq) ) 72 | 73 | if (length(cnvrs_refine) > 0) { 74 | dat_cnvrs_refine <- subset( dat_cnvrs, CNVR_ID %in% cnvrs_refine ) 75 | dat_cnvrs_keep <- subset( dat_cnvrs, CNVR_ID %in% cnvrs_keep ) 76 | } else { 77 | dat_cnvrs_refine <- data.frame(NULL) 78 | dat_cnvrs_keep <- dat_cnvrs 79 | } 80 | 81 | write.table( dat_cnvrs_keep, 82 | file = file.path(path_output, "cnvr_keep.txt"), 83 | quote = F, row.names = F, sep = "\t") 84 | write.table( dat_cnvrs_refine, 85 | file = file.path(path_output, "cnvr_refine.txt"), 86 | quote = F, row.names = F, sep = "\t") 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /05_boundary_refinement/step.2.submit.jobs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system 4 | 5 | suppressMessages(library(optparse)) 6 | 7 | option_list <- list( 8 | make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA, 9 | help = "Path to the directory containing necessary input data."), 10 | make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA, 11 | help = "Path to the directory for saving results."), 12 | make_option(c("-m", "--matrixpath"), action = "store", type = "character", default = NA, 13 | help = "Path to chromosome-wise LRR and BAF matrices."), 14 | make_option(c("-i", "--refinescript"), action = "store", type = "character", default = NA, 15 | help = "Path to the main script CNVR.boundary.refinement.R."), 16 | make_option(c("-s", "--rcppfile"), action = "store", type = "character", default = NA, 17 | help = "Path to refine.rcpp to be used in this R script."), 18 | make_option(c("-r", "--centromere"), action = "store", type = "character", default = NA, 19 | help = "Path to file with centromere position of each chromosome."), 20 | make_option(c("-n", "--plot"), action = "store_true", default = FALSE, 21 | help = "[optional] Whether to generate diagnosis plots.") 22 | ) 23 | 24 | opt <- parse_args(OptionParser(option_list = option_list)) 25 | pars = c(opt$datapath, opt$resultpath, opt$matrixpath, 26 | opt$rcppfile, opt$centromere, opt$refinescript) 27 | 28 | if ( any(is.na(pars)) ) { 29 | stop("All parameters must be supplied. (--help for detail)") 30 | } 31 | 32 | script_refine <- opt$refinescript 33 | path_result <- opt$resultpath 34 | path_matrix <- opt$matrixpath 35 | path_data <- opt$datapath 36 | script_rcpp <- opt$rcppfile 37 | file_centromere <- opt$centromere 38 | flag_plot <- opt$plot 39 | 40 | # cnvrs refinement 41 | dat_cnvrs_refine <- read.delim( file = file.path(path_result, "cnvr_refine.txt"), as.is = TRUE ) 42 | stopifnot( nrow(dat_cnvrs_refine) > 0 ) 43 | 44 | chrs <- sort( unique(dat_cnvrs_refine$chr)) 45 | 46 | cmd <- paste("Rscript", script_refine, 47 | "--datapath", path_data, 48 | "--resultpath", path_result, 49 | "--matrixpath", path_matrix, 50 | "--rcppfile", script_rcpp, 51 | "--centromere", file_centromere) 52 | 53 | if ( flag_plot ) { 54 | cmd <- paste(cmd, "--plot") 55 | } 56 | 57 | for (chr1 in chrs) { 58 | 59 | cmd.chr1 <- paste(cmd, 60 | "--chr", chr1) 61 | 62 | path_log <- file.path(path_result, "res_refine/chr", chr1, "log") 63 | if (!dir.exists(path_log)) dir.create(path = path_log, recursive = TRUE) 64 | 65 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 66 | ## configure based on your system 67 | bsub.cmd.chr1 <- paste("bsub -n 2 -W 10:00", 68 | "-R 'rusage[mem=10000]'", 69 | "-P ", 70 | "-J", paste0("chr", chr1), 71 | "-q premium", 72 | "-e", file.path(path_log, paste0("boundary_refine_chr", chr1, ".err")), 73 | "-o", file.path(path_log, paste0("boundary_refine_chr", chr1, ".log")), 74 | shQuote( cmd.chr1 )) 75 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 76 | 77 | cat("chr:", chr1, bsub.cmd.chr1, "\n") 78 | system( bsub.cmd.chr1 ) 79 | Sys.sleep(0.1) 80 | } 81 | -------------------------------------------------------------------------------- /04_CNV_genotype/scripts/fun_LRR.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # process cnvrs LRR from snps to sample_based 4 | process_cnvr_LRR <- function(dt_cnvrs, samples_LRR) { 5 | 6 | samples <- unique(dt_cnvrs$Sample_ID) 7 | # subset samples_LRR 8 | samples_LRR1 <- subset(samples_LRR, Sample_ID %in% samples) 9 | median_samples_LRR_SD <- median(samples_LRR1$LRR_SD, na.rm = TRUE) 10 | # test 11 | stopifnot(nrow(samples_LRR1) == length(samples)) 12 | 13 | res <- data.frame(Sample_ID = samples, CNVR_ID = unique(dt_cnvrs$CNVR_ID), 14 | LRR_median = 0, Chr = unique(dt_cnvrs$Chr), alg = "other", 15 | CN = 2, numSNP = unique(dt_cnvrs$numSNP), stringsAsFactors = FALSE) 16 | 17 | for (i in 1:length(samples)) { 18 | 19 | sample1 <- samples[i] 20 | 21 | idx1 <- which(samples_LRR1$Sample_ID == sample1) 22 | sample1_LRR_SD <- samples_LRR1$LRR_SD[idx1] ## 23 | 24 | dt1 <- subset(dt_cnvrs, Sample_ID == sample1) 25 | 26 | LRR_median1 <- median(dt1$LRR, na.rm = TRUE) 27 | CN1 <- unique(dt1$CN) 28 | alg1 <- unique(dt1$alg) 29 | 30 | res$LRR_median[i] <- (LRR_median1/sample1_LRR_SD)*median_samples_LRR_SD ## transform 31 | # res$LRR_median[i] <- LRR_median1 32 | res$CN[i] <- CN1 33 | res$alg[i] <- alg1 34 | } 35 | 36 | res 37 | } 38 | 39 | 40 | 41 | # calculate LRR gatk whole with pi 42 | calculate_LRR_gatk_whole <- function(dt_cnvr, mu1, sigma1, lambda1, cn_type) { 43 | if(cn_type == 2) { # for all CN = 2 type 44 | 45 | dt_cnvr$LRR2 <- sapply(1:nrow(dt_cnvr), FUN = function(k) { 46 | LRR1 <- dt_cnvr$LRR_median[k] 47 | prop1 <- lambda1*dnorm(x = LRR1, mean = mu1, sd = sigma1) 48 | prop1 49 | }) 50 | 51 | } else if(cn_type == 1) { 52 | 53 | dt_cnvr$LRR1 <- sapply(1:nrow(dt_cnvr), FUN = function(k) { 54 | LRR1 <- dt_cnvr$LRR_median[k] 55 | prop1 <- lambda1*dnorm(x = LRR1, mean = mu1, sd = sigma1) 56 | prop1 57 | }) 58 | 59 | } else if(cn_type == 3) { 60 | 61 | dt_cnvr$LRR3 <- sapply(1:nrow(dt_cnvr), FUN = function(k) { 62 | LRR1 <- dt_cnvr$LRR_median[k] 63 | prop1 <- lambda1*dnorm(x = LRR1, mean = mu1, sd = sigma1) 64 | prop1 65 | }) 66 | 67 | } else if(cn_type == 0) { 68 | 69 | dt_cnvr$LRR0 <- sapply(1:nrow(dt_cnvr), FUN = function(k) { 70 | LRR1 <- dt_cnvr$LRR_median[k] 71 | prop1 <- lambda1*dnorm(x = LRR1, mean = mu1, sd = sigma1) 72 | prop1 73 | }) 74 | 75 | } 76 | 77 | return(dt_cnvr) 78 | } 79 | 80 | # output LRR calcualte gatk result 81 | output_LRR_gatk <- function(dt_cnvr, model) { 82 | 83 | dt_LRR0 <- calculate_LRR_gatk_whole(dt_cnvr = dt_cnvr, 84 | mu1 = model$mu[1], 85 | sigma1 = model$sigma[1], 86 | lambda1 = model$lambda[1], cn_type = 0) 87 | dt_LRR1 <- calculate_LRR_gatk_whole(dt_cnvr = dt_cnvr, 88 | mu1 = model$mu[2], 89 | sigma1 = model$sigma[2], 90 | lambda1 = model$lambda[2], cn_type = 1) 91 | dt_LRR2 <- calculate_LRR_gatk_whole(dt_cnvr = dt_cnvr, 92 | mu1 = model$mu[3], 93 | sigma1 = model$sigma[3], 94 | lambda1 = model$lambda[3], cn_type = 2) 95 | dt_LRR3 <- calculate_LRR_gatk_whole(dt_cnvr = dt_cnvr, 96 | mu1 = model$mu[4], 97 | sigma1 = model$sigma[4], 98 | lambda1 = model$lambda[4], cn_type = 3) 99 | dt_LRR01 <- merge(dt_LRR0, dt_LRR1) 100 | dt_LRR012 <- merge(dt_LRR01, dt_LRR2) 101 | dt_LRR0123 <- merge(dt_LRR012, dt_LRR3) ## all p(LRR_median | CN = cn_type) 102 | 103 | return(dt_LRR0123) 104 | } 105 | -------------------------------------------------------------------------------- /04_CNV_genotype/step.4.prediction.results.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(optparse)) 4 | 5 | option_list = list( 6 | make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA, 7 | help = "Path to the directory containing necessary input data."), 8 | make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA, 9 | help = "Path to the directory for saving results.") 10 | ) 11 | 12 | opt = parse_args(OptionParser(option_list = option_list)) 13 | pars = c(opt$datapath, opt$resultpath) 14 | 15 | if ( any(is.na(pars)) ) { 16 | stop("All parameters must be supplied. (--help for detail)") 17 | } 18 | 19 | path_data <- opt$datapath 20 | path_result <- opt$resultpath 21 | 22 | path_pred <- file.path(path_result, "pred") 23 | 24 | # number of samples 25 | # dat_samples <- read.delim(file = file.path(path_data, "samples_QC.txt"), as.is = TRUE) 26 | # samples <- sub("\\.txt$", "", dat_samples$File) 27 | # n_samples <- nrow(dat_samples) 28 | 29 | file_cnvr <- "cnvr_batch.txt" ## with batch information 30 | dt_cnvr_raw <- read.delim(file = file.path(path_data, file_cnvr), as.is = TRUE) 31 | 32 | tbl_raw <- table(dt_cnvr_raw$chr, dt_cnvr_raw$batch) 33 | dt_freq_raw <- as.data.frame(tbl_raw) 34 | names(dt_freq_raw) <- c("chr", "batch", "Freq") 35 | dt_freq_raw <- subset(dt_freq_raw, Freq != 0) 36 | 37 | ## initialize sample list using the information from the first CNVR 38 | chr1 <- dt_freq_raw$chr[1] 39 | batch1 <- dt_freq_raw$batch[1] 40 | preds1 <- list.files(path = file.path(path_pred, paste0("chr_", chr1, "_batch_", batch1)), 41 | pattern = ".rds") 42 | dat1 <- readRDS( file = file.path(path_pred, paste0("chr_", chr1, "_batch_", batch1), preds1[1]) ) 43 | 44 | samples <- dat1$Sample_ID 45 | n_samples <- length(samples) 46 | 47 | cnvrs <- c() 48 | 49 | # row: CNVRs; column: samples 50 | res_CN <- data.frame() 51 | res_GQ <- data.frame() 52 | 53 | for ( i in 1:nrow(dt_freq_raw) ) { 54 | 55 | chr1 <- dt_freq_raw$chr[i] 56 | batch1 <- dt_freq_raw$batch[i] 57 | 58 | preds1 <- list.files(path = file.path(path_pred, paste0("chr_", chr1, "_batch_", batch1)), 59 | pattern = ".rds") 60 | 61 | cnvrs1 <- gsub("_pred.rds$", "", preds1, perl = T) 62 | cnvrs <- c(cnvrs, cnvrs1) 63 | 64 | res1_GQ <- matrix(nrow = length(cnvrs1), ncol = n_samples) 65 | rownames(res1_GQ) <- cnvrs1 66 | colnames(res1_GQ) <- samples 67 | res1_CN <- res1_GQ 68 | 69 | for (k in 1:length(preds1)) { 70 | pred1 <- preds1[k] 71 | cnvr1 <- cnvrs1[k] 72 | dat1 <- readRDS(file = file.path(path_pred, paste0("chr_", chr1, "_batch_", batch1), pred1)) 73 | 74 | ## sort the results according to the order of samples 75 | dat1 <- dat1[match(samples, dat1$Sample_ID), ] 76 | stopifnot( all(dat1$Sample_ID == samples) ) 77 | 78 | res1_GQ[k, ] <- dat1$value_GQ 79 | res1_CN[k, ] <- dat1$CN_gatk_pred 80 | } 81 | 82 | res_GQ <- rbind(res_GQ, res1_GQ) 83 | res_CN <- rbind(res_CN, res1_CN) 84 | } 85 | 86 | stopifnot( all(rownames(res_GQ) == cnvrs) ) 87 | stopifnot( all(colnames(res_GQ) == samples) ) 88 | stopifnot( all(rownames(res_CN) == cnvrs) ) 89 | stopifnot( all(colnames(res_CN) == samples) ) 90 | 91 | mat_GQ <- as.matrix(res_GQ) 92 | mat_CN <- as.matrix(res_CN) 93 | rownames(mat_GQ) <- cnvrs 94 | rownames(mat_CN) <- cnvrs 95 | colnames(mat_GQ) <- samples 96 | colnames(mat_CN) <- samples 97 | 98 | ## mark on successfully CNV-genotyped CNVRs 99 | dt_cnvr_raw$genotype <- 0 100 | dt_cnvr_raw$genotype[ dt_cnvr_raw$CNVR_ID %in% cnvrs ] <- 1 101 | 102 | write.table(dt_cnvr_raw, 103 | file = file.path(path_result, "cnvr_genotype.txt"), 104 | quote = F, row.names = F, sep = "\t") 105 | 106 | write.table(data.frame(Sample_ID = samples), 107 | file = file.path(path_result, "sample_genotype.txt"), 108 | quote = F, row.names = F, col.names = F, sep = "\t") 109 | 110 | saveRDS(mat_GQ, file = file.path(path_result, "matrix_GQ.rds")) 111 | saveRDS(mat_CN, file = file.path(path_result, "matrix_CN.rds")) 112 | 113 | -------------------------------------------------------------------------------- /04_CNV_genotype/step.2.submit.jobs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system 4 | 5 | suppressMessages(library(optparse)) 6 | 7 | option_list = list( 8 | make_option(c("-t", "--type"), action = "store", type = "character", default = NA, 9 | help = "Job submission type (0 - initial submission, 1 - resubmission of failed jobs)"), 10 | make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA, 11 | help = "Path to the directory containing necessary input data."), 12 | make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA, 13 | help = "Path to the directory for saving results."), 14 | make_option(c("-m", "--matrixpath"), action = "store", type = "character", default = NA, 15 | help = "Path to chromosome-wise LRR and BAF matrices."), 16 | make_option(c("-s", "--sourcefile"), action = "store", type = "character", default = NA, 17 | help = "Path to the scripts directory containing R scripts to be loaded into R."), 18 | make_option(c("-d", "--duplicates"), action = "store_true", default = FALSE, 19 | help = "[optional] Whether duplicate pairs information will be annotated in diagnosis plots."), 20 | make_option(c("-n", "--plot"), action = "store_true", default = FALSE, 21 | help = "[optional] Whether to generate diagnosis plots."), 22 | make_option(c("-r", "--script"), action = "store", type = "character", default = NA, 23 | help = "Path to the main script CNV.genotype.one.chr.one.batch.R."), 24 | make_option(c("-l", "--joblog"), action = "store", type = "character", default = NA, 25 | help = "Path to the directory saving job logs.") 26 | ) 27 | 28 | opt = parse_args(OptionParser(option_list = option_list)) 29 | pars = c(opt$type, opt$datapath, opt$resultpath, opt$joblog, 30 | opt$matrixpath, opt$sourcefile, opt$script) 31 | 32 | if ( any(is.na(pars)) ) { 33 | stop("All required parameters must be supplied. (--help for detail)") 34 | } 35 | 36 | script <- file.path(opt$script, "CNV.genotype.one.chr.one.batch.R") 37 | cmd <- paste("Rscript", script, 38 | "--type", opt$type, 39 | "--datapath", opt$datapath, 40 | "--resultpath", opt$resultpath, 41 | "--matrixpath", opt$matrixpath, 42 | "--sourcefile", opt$sourcefile) 43 | 44 | if ( opt$duplicates ) cmd <- paste(cmd, "--duplicates") 45 | if ( opt$plot ) cmd <- paste(cmd, "--plot") 46 | 47 | path_joblog <- opt$joblog 48 | if (!dir.exists(paths = path_joblog)) dir.create(path = path_joblog, showWarnings = F, recursive = T) 49 | dir.create(path = file.path(path_joblog, "job", "ERROR"), showWarnings = F, recursive = T) 50 | dir.create(path = file.path(path_joblog, "job", "OUT"), showWarnings = F, recursive = T) 51 | 52 | path_job_error <- file.path(path_joblog, "job", "ERROR") 53 | path_job_out <- file.path(path_joblog, "job", "OUT") 54 | 55 | file_cnvr <- "cnvr_batch.txt" ## with batch information 56 | dat_cnvr <- read.delim(file = file.path(opt$datapath, file_cnvr), as.is = TRUE) 57 | chrs <- sort( unique(dat_cnvr$chr) ) 58 | 59 | for ( chr1 in chrs ) { 60 | 61 | dat_cnvr_chr1 = subset(dat_cnvr, chr == chr1) 62 | batch_chr1 = sort( unique(dat_cnvr_chr1$batch) ) 63 | 64 | if ( nrow(dat_cnvr_chr1) == 0) { 65 | next 66 | } 67 | 68 | for ( batch1 in batch_chr1 ){ 69 | 70 | cat("chr:", chr1, "batch1:", batch1, "\n") 71 | cmd1 = paste(cmd, "--chr", chr1, "--batch", batch1) 72 | 73 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 74 | ## configure based on your system 75 | bsub.cmd = paste("bsub -n 2 -W 10:00 -R 'rusage[mem=20000]' -P ", 76 | "-e", file.path(path_job_error, paste0("chr_", chr1, "_batch_", batch1, ".e")), 77 | "-o", file.path(path_job_out, paste0("chr_", chr1, "_batch_", batch1, ".o")), 78 | "-q premium", shQuote(cmd1)) 79 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 80 | 81 | cat(bsub.cmd, "\n") 82 | system(bsub.cmd) 83 | 84 | Sys.sleep(0.1) 85 | } 86 | 87 | } 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /01_initial_call/run_QuantiSNP/README.md: -------------------------------------------------------------------------------- 1 | ## QuantiSNP 2 | 3 | ### Installation 4 | 5 | To download and install QuantiSNP (version 2), please follow the detailed instructions at the [page](https://sites.google.com/site/quantisnp/downloads), which provides links to download MATLAB Run-Time Component Libraries, QuantiSNP package and GC content data. For more information about QuantiSNP, please refer to their original [QuantiSNP website](https://sites.google.com/site/quantisnp/home). 6 | 7 | After installation, set up environment variable QUANTISNP: `export QUANTISNP='/path/to/quantisnp'` 8 | 9 | Please organize the installation folder in the following way: 10 | 11 | - MATLAB Run-Time Component Libraries root directory: `${QUANTISNP}/v79/` 12 | - QuantiSNP root directory: `${QUANTISNP}/quantisnp/` 13 | - GC content data (take b37/hg19 data for example) directory: `${QUANTISNP}/data/b37/` 14 | 15 | Note: 16 | 17 | - Running QuantiSNP does not require MATLAB, but rather the developers provided a self-contained MATLAB Run-Time Component Libraries in accompany with QuantiSNP. 18 | 19 | - Package libxp6 (e.g. https://packages.ubuntu.com/trusty/libxp6) needs to be installed. 20 | 21 | - We have checked that the installation of MATLAB Run-Time Component Libraries and QuantiSNP worked properly on two versions of Linux: CentOS 6.9 with openjdk 6 (the system used on [Minverva](https://hpc.mssm.edu/) cluster) or Ubuntu 16.04 with openjdk 8. The installation of the two components will probably require some further tweaking for other different systems. 22 | 23 | ### Analysis workflow 24 | 25 | Note: 26 | 27 | - QuantiSNP was originally designed to analyze one sample at a time or a batch of samples sequentially. Please refer to the original QuantiSNP [usage](https://sites.google.com/site/quantisnp/howto) for more details. Here, we provide scripts to run the analysis on multiple samples in parallel via job submitting system (one sample per job) in a cluster environment. 28 | 29 | - In the following steps (1) and (2), the scripts regarding job submission embraced by "##<<<... ##>>>..." in the scripts need to be specified by the users based on the system the users are using. 30 | 31 | We run QuantiSNP analysis with the following 3 steps: 32 | 33 | (1) Run QuantiSNP for each sample in parallel (through job submitting system) 34 | ```sh 35 | Rscript ${WKDIR}/01_initial_call/run_QuantiSNP/step.1.prepare.QuantiSNP.R \ 36 | --quantisnp ${QUANTISNP} \ 37 | --data ${WKDIR}/01_initial_call/run_QuantiSNP/data \ ## generated with finalreport_to_QuantiSNP.pl 38 | --sample ${WKDIR}/data/Samples_Table.txt \ 39 | --result ${WKDIR}/01_initial_call/run_QuantiSNP/results/res 40 | ``` 41 | Note: For details about `Samples_Table.txt`, please check the section [data](https://github.com/HaoKeLab/ensembleCNV#data). 42 | 43 | When the analysis is completed, there will be subfolders named after sample IDs, each for one sample respectively, created in the directory `${WKDIR}/01_initial_call/run_QuantiSNP/results/res`. Within each sample subfolders, two files (among others) will be generated and used in downstream analysis: 44 | - `.qc`: chromosome-level summary statistics, which will be summarized later at sample level and used in checking [batch effect](https://github.com/HaoKeLab/ensembleCNV#pca-on-summary-statistics). 45 | - `.cnv`: raw CNV calls for each sample. 46 | 47 | (2) Check job status and resubmit unfinishing jobs 48 | ```sh 49 | Rscript ${WKDIR}/01_initial_call/run_QuantiSNP/step.2.check.QuantiSNP.R \ 50 | --quantisnp ${QUANTISNP} \ 51 | --data ${WKDIR}/01_initial_call/run_QuantiSNP/data \ ## generated with finalreport_to_QuantiSNP.pl 52 | --sample ${WKDIR}/data/Samples_Table.txt \ 53 | --result ${WKDIR}/01_initial_call/run_QuantiSNP/results/res 54 | ``` 55 | This step checks if the jobs submitted for each sample in step (1) are successfully completed and resubmits failed jobs if there is any. 56 | 57 | (3) Combine PennCNV results from each sample, including the content in ".cnv" files 58 | ```sh 59 | perl ${WKDIR}/01_initial_call/run_QuantiSNP/step.3.combine.QuantiSNP.pl \ 60 | --in_dir ${WKDIR}/01_initial_call/run_QuantiSNP/results/res \ 61 | --out_dir ${WKDIR}/01_initial_call/run_QuantiSNP/results 62 | ``` 63 | When the analysis is completed, you will find `quantisnp.cnv`, which will be used by ensembleCNV, in the directory `${WKDIR}/01_initial_call/run_QuantiSNP/results`. `quantisnp.cnv` combines the CNV calls from all samples generated in steps (1) and (2). 64 | -------------------------------------------------------------------------------- /02_batch_effect/PCA_on_LRR/step.2.LRR.matrix.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | ## build matrix file for all samples using 100000 selected SNPs 4 | use strict; 5 | use Carp; 6 | 7 | ## input 8 | my $file_snps_selected = $ARGV[0]; ## seleted SNPs file from the first step 9 | my $reportfile = $ARGV[1]; ## finalreport from Genome Studio 10 | my $file_matrix_LRR = $ARGV[2]; ## output LRR matrix file 11 | 12 | ## read in selected snps 13 | open(IN, "< $file_snps_selected") or die "Error: can't open snps file $file_snps_selected: $!"; 14 | my %snps=(); 15 | while (my $line=) { 16 | chomp $line; 17 | #print "$line\n"; 18 | $snps{$line}++; 19 | } 20 | close IN; 21 | 22 | my @snps=(keys %snps); 23 | print "total number of SNPs: ".scalar(@snps)."\n"; 24 | 25 | ## parse the header of final report 26 | open(REPORT, "< $reportfile") or die "Error: can't open finalreport $reportfile: $!"; 27 | 28 | my (@field); 29 | my ($count_line, $sample_index, $name_index, $LRR_index) = (0); ## HC 30 | 31 | while () { 32 | $count_line++; 33 | m/^\[Data\]/ and last; 34 | $count_line > 1000 and confess "Error: after reading 1000 lines in $reportfile, still cannot find [Data] section. The $reportfile file may not be in Illumina report format.\n"; 35 | } 36 | 37 | $_ = ; 38 | s/[\r\n]+$//; 39 | $count_line++; 40 | @field = split (/\t/, $_); 41 | @field >= 3 or confess confess "Error: invalid header line (at least 3 tab-delimited fields, including 'SNP Name', 'Sample ID', 'Log R Ratio' expected) in report file $reportfile: <$_>\n"; 42 | 43 | for my $i (0 .. @field-1) { 44 | $field[$i] eq 'SNP Name' and $name_index = $i; 45 | $field[$i] eq 'Sample ID' and $sample_index = $i; 46 | $field[$i] eq 'Log R Ratio' and $LRR_index = $i; 47 | } 48 | 49 | defined $name_index or confess "Error: the 'SNP Name' field is not found in header line in report file $reportfile: <$_>\n"; 50 | defined $sample_index or confess "Error: the 'Sample ID' field is not found in header line in report file $reportfile: <$_>\n"; 51 | defined $LRR_index or confess "Error: the 'Log R Ratio' field is not found in header line in report file $reportfile: <$_>\n"; 52 | 53 | ## parse data part of final report 54 | my %samples = (); ## hash for sample ID 55 | my %hash = (); ## hash of LRR values at selected SNPs for one sample 56 | 57 | my $flagsample = 0; ## indicator of the first sample =0; following samples =1 58 | my $lrrsample = (); ## tab-delimited LRR values for one sample 59 | my $SampleIDraw = (); ## temporary sample ID of one sample 60 | my $total = 0; ## counter of current number of LRR values recorded in $lrrsample 61 | my $flageof = 0; ## indicaotr of eof =0 not EOF; =1 EOF 62 | 63 | while (my $line = ) { 64 | 65 | $flageof = 1 if eof; ## add file eof flag 66 | $line =~ s/[\r\n]+$//; # HC 67 | 68 | my @line=split(/\t/, $line); 69 | 70 | ## tansform Log R Ratio 71 | if (exists($samples{$line[$sample_index]}) && exists($snps{$line[$name_index]})) { ##%snps has been converted to @snps in line 22?? 72 | 73 | my $lrrvalue = $line[$LRR_index]; 74 | $lrrvalue =~ tr/\015//d; 75 | $lrrsample = $lrrsample."\t".$lrrvalue; 76 | $flagsample = 1; 77 | $SampleIDraw = $line[$sample_index]; 78 | $total++; 79 | 80 | if ($flageof == 1) { 81 | $hash{$SampleIDraw} = $lrrsample; 82 | print "SampleID: $SampleIDraw\t".scalar(keys %samples)."\t$total\n"; 83 | last; 84 | } 85 | 86 | } elsif (exists($samples{$line[$sample_index]})) { 87 | 88 | if ($flageof == 1) { 89 | $hash{$SampleIDraw} = $lrrsample; 90 | print "SampleID: $SampleIDraw\t".scalar(keys %samples)."\t$total\n"; 91 | last; 92 | } else { 93 | next; 94 | } 95 | 96 | } else { 97 | 98 | if ($flagsample == 0) { 99 | 100 | ## initialize the first sample 101 | if (exists($snps{$line[$name_index]})) { 102 | $samples{$line[$sample_index]}++; 103 | my $lrrvalue = $line[$LRR_index]; 104 | $lrrsample = $lrrvalue; 105 | $total++; 106 | } 107 | } elsif ($flagsample == 1) { 108 | 109 | if (exists($snps{$line[$name_index]})) { 110 | 111 | ## complete the previous sample 112 | $hash{$SampleIDraw} = $lrrsample; 113 | print "SampleID: $SampleIDraw\t".scalar(keys %samples)."\t$total\n"; 114 | 115 | ## initialize another new sample 116 | $samples{$line[$sample_index]}++; 117 | $lrrsample = (); 118 | my $lrrvalue = $line[$LRR_index]; 119 | $lrrsample = $lrrvalue; 120 | $total = 1; 121 | } 122 | } 123 | } 124 | } 125 | 126 | close IN; 127 | 128 | ## save LRR matrix 129 | open(OUT, ">", $file_matrix_LRR) or die "Error: can't open file $file_matrix_LRR: $!"; 130 | foreach my $item (keys %hash) { 131 | print OUT "$item\t$hash{$item}\n"; 132 | } 133 | close OUT; 134 | -------------------------------------------------------------------------------- /01_initial_call/finalreport_to_matrix_LRR_and_BAF/transform_from_tab_to_rds.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages({ 4 | require(data.table, quietly = TRUE) 5 | require(tibble, quietly = TRUE) 6 | require(optparse, quietly = TRUE) 7 | }) 8 | 9 | option_list = list( 10 | make_option(c("-i", "--input"), action = "store", default = NA, type = "character", 11 | help = "path of perl code output"), 12 | make_option(c("-o", "--output"), action = "store", default = NA, type = "character", 13 | help = "path to save .rds file"), 14 | make_option(c("-s", "--startChr"), action = "store", default = 1, type = "integer", 15 | help = "start Chr name [default %default]"), 16 | make_option(c("-d", "--endChr"), action = "store", default = 22, type = "integer", 17 | help = "end Chr name [default %default]") 18 | ) 19 | 20 | opt <- parse_args(OptionParser(option_list = option_list)) 21 | pars <- c(opt$input, opt$output, opt$startChr, opt$endChr) 22 | 23 | if ( any(is.na(pars)) ) { 24 | stop("All parameters must be supplied. (--help for detail)") 25 | } 26 | 27 | path_input <- opt$input 28 | path_output <- opt$output 29 | startChr <- opt$startChr 30 | endChr <- opt$endChr 31 | 32 | if ( !(is.integer(startChr) & is.integer(endChr)) ) { 33 | stop("parameters startChr and endChr must be integer.") 34 | } 35 | 36 | if ( startChr > endChr | startChr < 1 | endChr > 22 ) { 37 | stop("parameters startChr and endChr should satisfy 1 <= startChr <= endChr <= 22.") 38 | } 39 | 40 | chrs <- seq(startChr, endChr) 41 | # create LRR/BAF folder ---------------------------------------------------/ 42 | if ( !dir.exists(file.path(path_output, "LRR"))) dir.create(path = file.path(path_output, "LRR"), showWarnings = FALSE, recursive = TRUE) 43 | if ( !dir.exists(file.path(path_output, "BAF"))) dir.create(path = file.path(path_output, "BAF"), showWarnings = FALSE, recursive = TRUE) 44 | 45 | # read in annotate files --------------------------------------------------/ 46 | dat_snpName = fread( input = file.path(path_input, "snps_name.txt"), header = FALSE) 47 | dat_snpName = as.data.frame(dat_snpName, stringsAsFactors = FALSE) 48 | names( dat_snpName) <- c("Chr", "SNPs") 49 | 50 | dat_snpNum = fread( input = file.path(path_input, "snps_number.txt"), header = FALSE) 51 | dat_snpNum = as.data.frame( dat_snpNum, stringsAsFactors = FALSE) 52 | names( dat_snpNum) <- c("Chr", "number") 53 | 54 | dat_snpPos = fread( input = file.path(path_input, "SNP_pos.txt"), header = TRUE) 55 | dat_snpPos = as.data.frame(dat_snpPos, stringsAsFactors = FALSE) 56 | names( dat_snpPos) <- c("name", "chr","position") 57 | 58 | dat_samples_order <- read.table(file = file.path(path_input, "samples_order.txt"), 59 | sep = "\t", header = F, stringsAsFactors = F) 60 | names(dat_samples_order) <- c("sampleID" ,"order") 61 | dat_samples_order <- dat_samples_order[order(dat_samples_order$order), ] 62 | 63 | for (chr1 in chrs) { 64 | 65 | cat("chr:", chr1, "\n") 66 | 67 | snp1 <- unlist(strsplit( dat_snpName$SNPs[dat_snpName$Chr == chr1], 68 | split = "___", fixed = TRUE)) 69 | 70 | n1 <- dat_snpNum$number[ dat_snpNum$Chr == chr1] 71 | stopifnot( length(snp1) == n1) 72 | 73 | snp_position_chr1 <- subset( dat_snpPos, name %in% snp1) 74 | snp_position_chr1 <- snp_position_chr1[ order(snp_position_chr1$position), ] 75 | 76 | snp1_order <- snp_position_chr1$name 77 | stopifnot( nrow(snp_position_chr1) == n1 ) 78 | 79 | # read in LRR/BAF 80 | dat_chr1_LRR <- fread( input = file.path( path_input, "LRR", paste0(chr1, ".tab")), header = FALSE) 81 | dat_chr1_BAF <- fread( input = file.path( path_input, "BAF", paste0(chr1, ".tab")), header = FALSE) 82 | 83 | dat_chr1_LRR <- as.data.frame(dat_chr1_LRR, stringsAsFactors = FALSE) 84 | dat_chr1_BAF <- as.data.frame(dat_chr1_BAF, stringsAsFactors = FALSE) 85 | 86 | rownames( dat_chr1_LRR ) <- NULL 87 | rownames( dat_chr1_BAF ) <- NULL 88 | 89 | dat_chr1_LRR <- column_to_rownames( dat_chr1_LRR, var = "V1") 90 | dat_chr1_BAF <- column_to_rownames( dat_chr1_BAF, var = "V1") 91 | 92 | stopifnot( ncol(dat_chr1_LRR) == n1 ) 93 | stopifnot( ncol(dat_chr1_BAF) == n1 ) 94 | 95 | names(dat_chr1_LRR) <- snp1 96 | names(dat_chr1_BAF) <- snp1 97 | 98 | dat_chr1_LRR <- dat_chr1_LRR[, snp1_order, drop = FALSE] 99 | dat_chr1_BAF <- dat_chr1_BAF[, snp1_order, drop = FALSE] 100 | 101 | ## check samples_order 102 | stopifnot( all(rownames(dat_chr1_LRR) == dat_samples_order$sampleID) ) 103 | stopifnot( all(rownames(dat_chr1_BAF) == dat_samples_order$sampleID) ) 104 | 105 | saveRDS( dat_chr1_LRR, file = file.path(path_output, "LRR", paste0("matrix_chr_", chr1, "_LRR.rds"))) 106 | saveRDS( dat_chr1_BAF, file = file.path(path_output, "BAF", paste0("matrix_chr_", chr1, "_BAF.rds"))) 107 | 108 | } 109 | 110 | cat("Analysis completed! The output files are at:", path_output, "\n") 111 | 112 | -------------------------------------------------------------------------------- /04_CNV_genotype/scripts/fun_BAF.R: -------------------------------------------------------------------------------- 1 | 2 | # BAF emission probability (defined in PennCNV paper) 3 | eBAF <- function (b, z, pB) { 4 | pib <- 0.01 5 | 6 | mu0 <- 0.00 7 | mu14 <- 0.25 8 | mu13 <- 1.0/3.0 9 | mu12 <- 0.5 10 | mu23 <- 2.0/3.0 11 | mu34 <- 0.75 12 | mu1 <- 1.00 13 | 14 | sd0 <- 0.016372 15 | sd14 <- 0.042099 16 | sd13 <- 0.045126 17 | sd12 <- 0.034982 18 | sd23 <- 0.045126 19 | sd34 <- 0.042099 20 | sd1 <- 0.016372 21 | 22 | M0 <- 0.5 23 | M1 <- 0.5 24 | 25 | sd5 <- 0.304243 ## for calculate CN = 0 26 | ## z=1, CN = 0, two copy deletion state 27 | if (z == 1) { 28 | e <- dnorm(b, mean = mu12, sd = sd5) 29 | } 30 | 31 | ## z=2, CN=1, one copy deletion state 32 | if (z==2) { 33 | e <- pib + 34 | (1 - pib) * (1-pB) * ( I(b==0)*M0 + I(b>0 & b<1)*(1-M0)*dnorm(b, mu0, sd0)/(1-pnorm(0,mu0,sd0)) ) + 35 | (1 - pib) * pB * ( I(b==1)*M1 + I(b>0 & b<1)*(1-M1)*dnorm(b, mu1, sd1)/pnorm(1,mu1,sd1) ) 36 | } 37 | 38 | ## z=3, CN=2, normal copy number state 39 | if (z==3) { 40 | e <- pib + 41 | (1 - pib) * 2*pB*(1-pB) * dnorm(b, mu12, sd12) + 42 | (1 - pib) * (1-pB)^2 * ( I(b==0)*M0 + I(b>0 & b<1)*(1-M0)*dnorm(b, mu0, sd0)/(1-pnorm(0,mu0,sd0)) ) + 43 | (1 - pib) * pB^2 * ( I(b==1)*M1 + I(b>0 & b<1)*(1-M1)*dnorm(b, mu1, sd1)/pnorm(1,mu1,sd1) ) 44 | } 45 | 46 | ## z=4, CN=2, CN-LOH state 47 | if (z==4) { 48 | e <- pib + 49 | (1 - pib) * (1-pB) * ( I(b==0)*M0 + I(b>0 & b<1)*(1-M0)*dnorm(b, mu0, sd0)/(1-pnorm(0,mu0,sd0)) ) + 50 | (1 - pib) * pB * ( I(b==1)*M1 + I(b>0 & b<1)*(1-M1)*dnorm(b, mu1, sd1)/pnorm(1,mu1,sd1) ) 51 | } 52 | 53 | ## z=5, CN=3, one copy duplication state 54 | if (z==5) { 55 | e <- pib + 56 | (1 - pib) * 3*pB*(1-pB)^2 * dnorm(b, mu13, sd13) + 57 | (1 - pib) * 3*pB^2*(1-pB) * dnorm(b, mu23, sd23) + 58 | (1 - pib) * (1-pB)^3 * ( I(b==0)*M0 + I(b>0 & b<1)*(1-M0)*dnorm(b, mu0, sd0)/(1-pnorm(0,mu0,sd0)) ) + 59 | (1 - pib) * pB^3 * ( I(b==1)*M1 + I(b>0 & b<1)*(1-M1)*dnorm(b, mu1, sd1)/pnorm(1,mu1,sd1) ) 60 | } 61 | 62 | return(e) 63 | } 64 | 65 | # BAF for gatk 66 | baf_gatk_whole <- function(b, pB1, CN) { 67 | 68 | if (CN == 2) { 69 | return(eBAF(b = b, z = 3, pB = pB1)) 70 | } else if (CN == 1) { 71 | return(eBAF(b = b, z = 2, pB = pB1)) 72 | } else if (CN ==3) { 73 | return(eBAF(b = b, z = 5, pB = pB1)) 74 | } else if (CN == 0) { 75 | return(eBAF(b = b, z = 1, pB = pB1)) 76 | } 77 | 78 | } 79 | 80 | # calculate_BAF_gatk_whole CN = 0, 1, 2, 3 81 | calculate_BAF_gatk_whole <- function(dt_cnvrs) { 82 | dt_cnvrs <- arrange(dt_cnvrs, Sample_ID, Name) 83 | samples <- unique(dt_cnvrs$Sample_ID) 84 | snps <- unique(dt_cnvrs$Name) 85 | snps <- dt_cnvrs$Name[1:length(snps)] ## snps 86 | pfbs <- dt_cnvrs$PFB[1:length(snps)] ## PFB 87 | 88 | m0 <- matrix(data = NA, nrow = length(samples), ncol = length(snps)) 89 | m1 <- matrix(data = NA, nrow = length(samples), ncol = length(snps)) 90 | m2 <- matrix(data = NA, nrow = length(samples), ncol = length(snps)) 91 | m3 <- matrix(data = NA, nrow = length(samples), ncol = length(snps)) 92 | 93 | for (i in 1:length(snps)) { 94 | 95 | snp1 <- snps[i] 96 | pfb1 <- pfbs[i] 97 | 98 | samples_snp <- subset(dt_cnvrs, Name == snp1) 99 | samples_new <- samples_snp$Sample_ID 100 | cns_new <- samples_snp$CN ## CN 101 | 102 | bafs_ep_0 <- sapply(samples_snp$BAF, FUN = function(x) baf_gatk_whole(b = x, pB1 = pfb1, CN = 0)) 103 | bafs_ep_1 <- sapply(samples_snp$BAF, FUN = function(x) baf_gatk_whole(b = x, pB1 = pfb1, CN = 1)) 104 | bafs_ep_2 <- sapply(samples_snp$BAF, FUN = function(x) baf_gatk_whole(b = x, pB1 = pfb1, CN = 2)) 105 | bafs_ep_3 <- sapply(samples_snp$BAF, FUN = function(x) baf_gatk_whole(b = x, pB1 = pfb1, CN = 3)) 106 | 107 | ## detect NaN values in bafs_ep_1/2/3 108 | idxs_na_0 <- which(is.na(bafs_ep_0)) 109 | bafs_ep_0[idxs_na_0] <- median(bafs_ep_0, na.rm = TRUE) 110 | idxs_na_1 <- which(is.na(bafs_ep_1)) 111 | bafs_ep_1[idxs_na_1] <- median(bafs_ep_1, na.rm = TRUE) ## add median values 112 | idxs_na_2 <- which(is.na(bafs_ep_2)) 113 | bafs_ep_2[idxs_na_2] <- median(bafs_ep_2, na.rm = TRUE) ## add median values 114 | idxs_na_3 <- which(is.na(bafs_ep_3)) 115 | bafs_ep_3[idxs_na_3] <- median(bafs_ep_3, na.rm = TRUE) ## add median values 116 | 117 | m0[, i] <- bafs_ep_0 118 | m1[, i] <- bafs_ep_1 119 | m2[, i] <- bafs_ep_2 120 | m3[, i] <- bafs_ep_3 121 | } 122 | 123 | baf_eps_0 <- apply(m0, MARGIN = 1, prod) 124 | baf_eps_1 <- apply(m1, MARGIN = 1, prod) ### add na.rm 125 | baf_eps_2 <- apply(m2, MARGIN = 1, prod) 126 | baf_eps_3 <- apply(m3, MARGIN = 1, prod) 127 | 128 | dt_BAF <- data.frame(Sample_ID = samples_new, 129 | CN = cns_new, stringsAsFactors = FALSE) 130 | dt_BAF$BAF0 <- baf_eps_0 131 | dt_BAF$BAF1 <- baf_eps_1 132 | dt_BAF$BAF2 <- baf_eps_2 133 | dt_BAF$BAF3 <- baf_eps_3 134 | 135 | dt_BAF 136 | } 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /01_initial_call/run_PennCNV/step.5.clean.PennCNV.res.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## The script was used to run PennCNV on Minerva high performance cluster. 4 | ## You need to modifiy it according to the system you are using if you would like to use it. 5 | ## Please refer to original PennCNV documents (http://penncnv.openbioinformatics.org/en/latest/) for more information 6 | 7 | suppressMessages({ 8 | require( optparse, quietly = TRUE) 9 | }) 10 | 11 | option_list <- list( 12 | make_option(c("-p", "--penncnv"), action = "store", default = NA, type = "character", 13 | help = "path to PennCNV installation folder."), 14 | make_option(c("-i", "--input"), action = "store", default = NA, type = "character", 15 | help = "input path for combined PennCNV result."), 16 | make_option(c("-f", "--pfb"), action = "store", default = NA, type = "character", 17 | help = "pfb file."), 18 | make_option(c("-n", "--name"), action = "store", default = "CNV.PennCNV", type = "character", 19 | help = "rawcnv filename generated in step (4).") 20 | ) 21 | 22 | opt <- parse_args(OptionParser(option_list = option_list)) 23 | 24 | path_penncnv <- opt$penncnv 25 | path_input <- opt$input 26 | file_pfb <- opt$pfb 27 | name_rawcnv <- opt$name 28 | 29 | if (any(is.na(c(path_input, file_pfb, name_rawcnv)))) { 30 | stop("All parameters must be supplied.( --help for details )") 31 | } 32 | 33 | # clean CNV --------------------------------------------------------------- 34 | 35 | setwd(dir = path_input) 36 | path_clean <- path_input 37 | name_project <- name_rawcnv 38 | 39 | file_rawcnv <- paste(name_project, "rawcnv", sep = ".") 40 | file_pfb <- file_pfb 41 | 42 | n_rawcnv <- system(paste("cat", file_rawcnv, "| wc -l"), intern = TRUE) 43 | n_rawcnv <- as.integer(n_rawcnv) 44 | 45 | cat("CNV number before clean:", n_rawcnv, "\n") 46 | 47 | flag = 0 48 | idx = 1 49 | cnv1_in <- file_rawcnv 50 | while(flag == 0) { 51 | 52 | n_rawcnv <- as.integer(system(paste("cat", cnv1_in, "|", "wc -l"), intern = TRUE)) 53 | cnv1_out <- paste(name_project, idx, "rawcnv", sep = ".") 54 | 55 | cmd1 <- paste(file.path(path_penncnv, "bin/clean_cnv.pl"), 56 | "combineseg", cnv1_in, "--signalfile", file_pfb, 57 | "--fraction 0.2", "--bp >", cnv1_out) 58 | 59 | cat("Start run Time:", idx, cmd1, "...\n") 60 | system(cmd1) 61 | cat("End run ......\n") 62 | 63 | cmd2 <- paste("cat", cnv1_out, "|", "wc -l") 64 | n_newcnv <- system(cmd2, intern = TRUE) 65 | n_newcnv <- as.integer(n_newcnv) 66 | 67 | cat("raw number:", n_rawcnv, "\n") 68 | cat("new number:", n_newcnv, "\n") 69 | 70 | if (n_rawcnv == n_newcnv) { 71 | flag = 1 72 | } else { 73 | cnv1_in <- cnv1_out 74 | idx <- idx + 1 75 | } 76 | 77 | } 78 | 79 | ## convert final PennCNV results to tab-delimit text file 80 | cnv_penncnv <- paste(name_project, idx, "rawcnv", sep = ".") 81 | cnv_tab <- paste(name_project, "txt", sep = ".") 82 | cat("Convert final PennCNV results to tab-delimit text file.\n") 83 | cmd.transform <- paste(file.path(path_penncnv, "bin/convert_cnv.pl"), 84 | "--intype", "penncnv", "--outtype", "tab", cnv_penncnv, ">", cnv_tab) 85 | system(cmd.transform) 86 | 87 | ## extract individual level statistics for QC 88 | cat("Extract individual level statistics for QC.\n") 89 | cnv_log <- paste(name_project, "log", sep = ".") 90 | cnv_qc <- paste0(name_project, "_qc.txt") 91 | cmd.extract <- paste(file.path(path_penncnv, "bin/filter_cnv.pl"), cnv_penncnv, 92 | "-qclogfile", cnv_log, "-qcsumout", cnv_qc, ">", "step5.log") 93 | system(cmd.extract) 94 | 95 | # Change SampleID column information -------------------------------------- 96 | # remove the path before Sample_ID to get a "clean" Sample_ID 97 | 98 | ## CNV results 99 | dat_CNV <- read.table(file = cnv_tab, sep = "\t", 100 | header = FALSE, comment.char = "", check.names = FALSE, stringsAsFactors = FALSE) 101 | samples_path <- dat_CNV$V5 102 | sampleIDs <- unlist(lapply(1:length(samples_path), FUN = function(k) { 103 | sample1 <- samples_path[k] 104 | str1 <- unlist(strsplit(sample1, split = "/", fixed = TRUE)) 105 | str1[length(str1)] 106 | })) 107 | dat_CNV$V5 <- sampleIDs ## change 108 | 109 | write.table(dat_CNV, file = paste0(name_project, "_new.txt"), 110 | sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE) 111 | 112 | 113 | ## Sample-wise summary statistics 114 | dat_Sample_Stat <- read.table(file = cnv_qc, sep = "\t", 115 | header = TRUE, check.names = FALSE, stringsAsFactors = FALSE) 116 | files <- dat_Sample_Stat$File 117 | files_new <- unlist(lapply(1:length(files), FUN = function(k) { 118 | file1 <- files[k] 119 | str1 <- unlist(strsplit(file1, split = "/", fixed = TRUE)) 120 | str1[length(str1)] 121 | })) 122 | 123 | dat_Sample_Stat$File <- files_new 124 | 125 | write.table(dat_Sample_Stat, file = paste0(name_project, "_qc_new.txt"), 126 | sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /06_performance_assessment/step.2.set.GQ.generate.results.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(optparse)) 4 | suppressMessages(library(plyr)) 5 | 6 | option_list <- list( 7 | make_option(c("-n", "--matrixCN"), action = "store", default = NA,type = "character", 8 | help = "Path to matrix of copy number (CN)"), 9 | make_option(c("-g", "--matrixGQ"), action = "store", default = NA,type = "character", 10 | help = "Path to matrix of genotyping quality (GQ) score."), 11 | make_option(c("-c", "--cnvrfile"), action = "store", default = NA, type = "character", 12 | help = "Path to CNVR information after boundary refinement."), 13 | make_option(c("-o", "--resultpath"), action = "store", default = NA,type = "character", 14 | help = "Path to directory for saving assessment results."), 15 | make_option(c("-s", "--gqscore"), action = "store", default = NA, type = "integer", 16 | help = "Set GQ score threshold.") 17 | ) 18 | 19 | opt <- parse_args(OptionParser(option_list = option_list)) 20 | pars <- c(opt$matrixCN, opt$matrixGQ, opt$cnvrfile, 21 | opt$resultpath, opt$gqscore) 22 | 23 | if (any(is.na(pars))) { 24 | stop("All required parameters must be supplied. (--help for detail)") 25 | } 26 | 27 | file_matrixcn <- opt$matrixCN 28 | file_matrixgq <- opt$matrixGQ 29 | file_cnvr <- opt$cnvrfile 30 | path_result <- opt$resultpath 31 | gqscore <- as.numeric(opt$gqscore) 32 | 33 | matrix_CN <- readRDS(file = file_matrixcn) 34 | matrix_gq <- readRDS(file = file_matrixgq) 35 | dat_cnvr <- read.delim(file = file_cnvr, check.names = FALSE, as.is = TRUE) 36 | 37 | # main -------------------------------------------------------------------- 38 | 39 | idxs.nocall = which(matrix_gq < gqscore) 40 | 41 | if (length(idxs.nocall) >= 1) matrix_CN[idxs.nocall] = -9 42 | 43 | cnvrs <- rownames( matrix_CN ) 44 | samples <- colnames( matrix_CN ) 45 | 46 | n_cnvr <- nrow(matrix_CN) 47 | n_sample <- ncol(matrix_CN) 48 | 49 | ## cnvr freq 50 | list_freqs_cnvr <- lapply(1:n_cnvr, FUN = function(k) { 51 | v1 <- as.vector(matrix_CN[k, ]) 52 | data.frame(n = length(v1), 53 | n0 = sum(v1 == 0), 54 | n1 = sum(v1 == 1), 55 | n2 = sum(v1 == 2), 56 | n3 = sum(v1 == 3), 57 | n_nocall = sum(v1 == -9)) 58 | }) 59 | 60 | freqs_cnvr <- do.call(rbind, list_freqs_cnvr) 61 | 62 | dat_freqs_cnvr <- data.frame(freqs_cnvr, stringsAsFactors = F, check.names = F) 63 | dat_freqs_cnvr$CNVR_ID <- cnvrs 64 | 65 | dat_freqs_cnvr$callRate <- (dat_freqs_cnvr$n0 + dat_freqs_cnvr$n1 + dat_freqs_cnvr$n2 + dat_freqs_cnvr$n3)/dat_freqs_cnvr$n 66 | dat_freqs_cnvr$freq <- (dat_freqs_cnvr$n0 + dat_freqs_cnvr$n1 + dat_freqs_cnvr$n3)/dat_freqs_cnvr$n 67 | 68 | idxs_cnvr_filter <- which(dat_freqs_cnvr$freq == 0) 69 | 70 | if (length(idxs_cnvr_filter) >= 1) { 71 | dat_freqs_cnvr <- dat_freqs_cnvr[-idxs_cnvr_filter, ] 72 | } 73 | dat_cnvr_final <- merge(dat_cnvr, dat_freqs_cnvr) 74 | dat_cnvr_final <- dat_cnvr_final[order(dat_cnvr_final$chr, 75 | dat_cnvr_final$arm, 76 | dat_cnvr_final$posStart, 77 | dat_cnvr_final$posEnd), ] 78 | 79 | dat_cnvr_final <- dat_cnvr_final[, c("CNVR_ID", "chr", "arm", "posStart", "posEnd", "start_snp", "end_snp", 80 | "n", "n0", "n1", "n2", "n3", "n_nocall", "callRate", "freq")] 81 | dat_cnvr_final <- rename(dat_cnvr_final, c("start_snp"="snpStart", "end_snp"="snpEnd")) 82 | 83 | cat(nrow(dat_cnvr_final), "CNVRs remains from", nrow(n_cnvr), "CNVRs after GQ cut-off.\n") 84 | 85 | write.table(dat_cnvr_final, file = file.path(path_result, "cnvr_after_GQ.txt"), 86 | sep = "\t", row.names = F, col.names = T, quote = F) 87 | 88 | matrix_CN_final <- matrix_CN[dat_cnvr_final$CNVR_ID, ] 89 | saveRDS(matrix_CN_final, file = file.path(path_result, "matrix_CN_after_GQ.rds")) 90 | 91 | # sample information ------------------------------------------------------ 92 | 93 | list_samples_info <- lapply(1:ncol(matrix_CN_final), FUN = function(k) { 94 | v1 <- as.vector(matrix_CN_final[, k]) 95 | data.frame(n = length(v1), 96 | n0 = sum(v1 == 0), 97 | n1 = sum(v1 == 1), 98 | n2 = sum(v1 == 2), 99 | n3 = sum(v1 == 3), 100 | n_nocall = sum(v1 == -9)) 101 | }) 102 | 103 | samples_info <- do.call(rbind, list_samples_info) 104 | samples_info <- data.frame(samples_info, stringsAsFactors = F, check.names = F) 105 | samples_info$Sample_ID <- samples 106 | 107 | samples_info$callRate <- (samples_info$n0 + samples_info$n1 + samples_info$n2 + samples_info$n3)/samples_info$n 108 | samples_info$freq <- (samples_info$n0 + samples_info$n1 + samples_info$n3)/samples_info$n 109 | samples_info <- samples_info[, c("Sample_ID", "callRate", "freq", "n", "n0", "n1", "n2", "n3", "n_nocall")] 110 | 111 | write.table(samples_info, file = file.path(path_result, "sample_after_GQ.txt"), 112 | sep = "\t", row.names = F, col.names = T, quote = F) 113 | 114 | -------------------------------------------------------------------------------- /05_boundary_refinement/step.3.clean.results.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(optparse)) 4 | suppressMessages(library(plyr)) 5 | 6 | option_list <- list( 7 | make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA, 8 | help = "Path to the directory for saving results.") 9 | ) 10 | 11 | opt <- parse_args(OptionParser(option_list = option_list)) 12 | 13 | pars <- c(opt$resultpath) 14 | if (any(is.na(pars))) { 15 | stop("All parameters must be supplied. (--help for detail)") 16 | } 17 | 18 | path_result <- opt$resultpath 19 | # combine refinement results ---------------------------------------------- 20 | path_refine <- file.path(path_result, "res_refine") 21 | folders_chr1 <- list.files(path = path_refine, pattern = "^chr") 22 | 23 | res_refinement <- data.frame() 24 | for ( folder.chr1 in folders_chr1 ) { 25 | 26 | chr1 <- gsub("^chr", "", folder.chr1, perl = T) 27 | file.chr1 <- paste("CNVR_refine_chr_", chr1, "_detail.rds", sep = "") 28 | path.chr1.data <- file.path(path_refine, folder.chr1, "data") 29 | res.chr1 <- readRDS( file = file.path( path.chr1.data, file.chr1) ) 30 | 31 | res_refinement <- rbind(res_refinement, res.chr1) 32 | } 33 | 34 | ## merge CNVRs with identifcal boundaries after refinement 35 | res_refinement$identicalID <- paste(res_refinement$Chr, 36 | res_refinement$snp.start.refine, 37 | res_refinement$snp.end.refine, sep = "___") 38 | 39 | res_refinement_same <- subset(res_refinement, type.overlap.based.on.raw == "same") 40 | res_refinement_refine <- subset(res_refinement, type.overlap.based.on.raw != "same") 41 | 42 | res_refinement_refine <- subset(res_refinement_refine, 43 | !identicalID %in% res_refinement_same$identicalID) 44 | 45 | # de-dulplicate CNVR 46 | res_refinement_refine <- res_refinement_refine[!duplicated(res_refinement_refine$identicalID), ] 47 | cat("number of CNVRs with refined boundaries:", nrow(res_refinement_refine), "\n") 48 | 49 | cnvrID_refine_same <- res_refinement_same$CNVR_ID 50 | 51 | # clean ------------------------------------------------------------------- 52 | dat_cnvr_keep <- read.delim(file = file.path(path_result, "cnvr_keep.txt"), as.is = TRUE) 53 | dat_cnvr_keep$identicalID <- paste(dat_cnvr_keep$chr, 54 | dat_cnvr_keep$start_snp, 55 | dat_cnvr_keep$end_snp, sep = "___") 56 | 57 | dat_cnvr_refine <- read.delim(file = file.path(path_result, "cnvr_refine.txt"), as.is = TRUE) 58 | dat_cnvr_refine$identicalID <- paste(dat_cnvr_refine$chr, 59 | dat_cnvr_refine$start_snp, 60 | dat_cnvr_refine$end_snp, sep = "___") 61 | 62 | cnvrID_keep <- dat_cnvr_keep$CNVR_ID 63 | cnvrID_keep_final <- union(cnvrID_keep, cnvrID_refine_same) 64 | 65 | dat_cnvr_keep_after_refine <- rbind(dat_cnvr_keep, dat_cnvr_refine) 66 | dat_cnvr_keep_after_refine <- subset(dat_cnvr_keep_after_refine, CNVR_ID %in% cnvrID_keep_final) 67 | 68 | ## CNVRs with refined boundaries 69 | res_refinement_refine_clean <- subset( res_refinement_refine, !identicalID %in% dat_cnvr_keep$identicalID ) 70 | 71 | ## CNVRs to be regnotyped after updating boundary information 72 | dat_cnvr_regt <- subset(dat_cnvr_refine, CNVR_ID %in% res_refinement_refine_clean$CNVR_ID) 73 | dat_cnvr_regt <- rename(dat_cnvr_regt, 74 | c("posStart"="posStart.round1", 75 | "posEnd"="posEnd.round1", 76 | "start_snp"="start_snp.round1", 77 | "end_snp"="end_snp.round1", 78 | "batch"="batch.round1", 79 | "genotype"="genotype.round1", 80 | "Freq"="Freq.round1", 81 | "identicalID"="identicalID.round1")) 82 | dat_cnvr_regt <- merge(dat_cnvr_regt, 83 | res_refinement_refine_clean[, 84 | c("CNVR_ID", "identicalID", "snp.posStart.refine", "snp.posEnd.refine", 85 | "snp.start.refine", "snp.end.refine")], 86 | by = "CNVR_ID") 87 | stopifnot( nrow(dat_cnvr_regt) == nrow(res_refinement_refine_clean) ) 88 | 89 | dat_cnvr_regt <- rename(dat_cnvr_regt, 90 | c("snp.posStart.refine"="posStart", 91 | "snp.posEnd.refine"="posEnd", 92 | "snp.start.refine"="start_snp", 93 | "snp.end.refine"="end_snp")) 94 | 95 | 96 | write.table(dat_cnvr_keep_after_refine, 97 | file = file.path(path_result, "cnvr_kept_after_refine.txt"), 98 | quote = F, row.names = F, sep = "\t") 99 | 100 | write.table(res_refinement_refine_clean, 101 | file = file.path(path_result, "cnvr_refined_after_refine.txt"), 102 | quote = F, row.names = F, sep = "\t") 103 | 104 | write.table(dat_cnvr_regt, 105 | file = file.path(path_result, "cnvr_regenotype_after_refine.txt"), 106 | quote = F, row.names = F, sep = "\t") 107 | 108 | -------------------------------------------------------------------------------- /01_initial_call/run_PennCNV/step.3.check.PennCNV.jobs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system 4 | 5 | ## The script was used to run PennCNV on Minerva high performance cluster. 6 | ## You need to modifiy it according to the system you are using if you would like to use it. 7 | ## Please refer to original PennCNV documents (http://penncnv.openbioinformatics.org/en/latest/) for more information 8 | 9 | suppressMessages({ 10 | require( optparse, quietly = TRUE) 11 | }) 12 | 13 | options(warn = 2) 14 | 15 | option_list <- list( 16 | make_option(c("-p", "--penncnv"), action = "store", default = NA, type = "character", 17 | help = "path to PennCNV installation folder."), 18 | make_option(c("-a", "--data"), action = "store", default = NA, type = "character", 19 | help = "path to tab-delimit text data files for each sample."), 20 | make_option(c("-d", "--wkdir"), action = "store", default = NA, type = "character", 21 | help = "working directory."), 22 | make_option(c("-f", "--pfb"), action = "store", default = NA, type = "character", 23 | help = "pfb file."), 24 | make_option(c("-g", "--gcmodel"), action = "store", default = NA, type = "character", 25 | help = "gcmodel file."), 26 | make_option(c("-m", "--hmm"), action = "store", default = NA, type = "character", 27 | help = "HMM model file.") 28 | ) 29 | 30 | opt = parse_args(OptionParser(option_list = option_list)) 31 | 32 | path_penncnv <- opt$penncnv 33 | path_data <- opt$data 34 | path_wkdir <- opt$wkdir 35 | file_pfb <- opt$pfb 36 | file_gcmodel <- opt$gcmodel 37 | file_hmm <- opt$hmm 38 | 39 | if (any(is.na(c(path_data, path_wkdir, file_pfb, file_gcmodel, file_hmm)))) { 40 | stop("All parameters must be supplied. (--help for details)") 41 | } 42 | 43 | path_list <- file.path(path_wkdir, "list") 44 | path_res <- file.path(path_wkdir, "res") ## PennCNV results folder 45 | 46 | # submit jobs functions --------------------------------------------------- 47 | 48 | cmd_PennCNV <- function(file_hmm, file_pfb, file_gcmodel, 49 | filename_sample, path_list, path_res_sample) { 50 | 51 | file_list <- file.path(path_list, sub("\\.txt$", ".list", filename_sample)) 52 | 53 | samplename <- gsub(pattern = "\\.txt$", replacement = "", filename_sample) 54 | file_log <- file.path(path_res_sample, paste0(samplename, ".log")) 55 | file_rawcnv <- file.path(path_res_sample, paste0(samplename, ".rawcnv")) 56 | 57 | cmd <- paste(file.path(path_penncnv, "bin/detect_cnv.pl"), 58 | "-test --confidence", 59 | "-hmm", file_hmm, 60 | "-pfb", file_pfb, 61 | "-gcmodel", file_gcmodel, 62 | "-list", file_list, 63 | "-log", file_log, 64 | "-out", file_rawcnv) 65 | 66 | cmd 67 | } 68 | 69 | cmd_submitjob <- function(cmd.sample, samplename) { 70 | 71 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 72 | ## configure based on your system 73 | bsub.cmd <- paste("bsub -n 2 -W 00:30 -R 'rusage[mem=5000]' -P ", 74 | "-J", samplename, 75 | "-q premium", 76 | shQuote(cmd.sample)) 77 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 78 | 79 | bsub.cmd 80 | } 81 | 82 | # main loop --------------------------------------------------------------- 83 | 84 | sample_files <- list.files(path = path_data) 85 | cat("number of samples:", length(sample_files), "\n") 86 | 87 | n.success <- 0 88 | n.fail <- 0 89 | for ( i in 1:length(sample_files) ) { 90 | 91 | sample_file <- sample_files[i] 92 | samplename <- gsub(pattern = "\\.txt$", replacement = "", sample_file) 93 | 94 | path_res_sample <- file.path(path_res, samplename) 95 | file_rawcnv <- file.path(path_res_sample, paste0(samplename, ".rawcnv")) 96 | 97 | flag.folder <- dir.exists(paths = path_res_sample) 98 | flag.rawcnv <- file.exists(file_rawcnv) 99 | 100 | if ( flag.folder & flag.rawcnv ) { 101 | cat("Sample_ID:", samplename, "SUCCESS\n") 102 | n.success <- n.success + 1 103 | } else { 104 | 105 | cat("Sample_ID:", samplename, "FAILED\n") 106 | dir.create(path = path_res_sample, showWarnings = FALSE, recursive = TRUE) 107 | 108 | cmd.sample <- cmd_PennCNV(file_hmm = file_hmm, 109 | file_pfb = file_pfb, 110 | file_gcmodel = file_gcmodel, 111 | filename_sample = sample_file, 112 | path_list = path_list, 113 | path_res_sample = path_res_sample) 114 | 115 | cmd.job <- cmd_submitjob(cmd.sample = cmd.sample, samplename = samplename) 116 | 117 | system(cmd.job) 118 | Sys.sleep(0.1) 119 | 120 | n.fail <- n.fail + 1 121 | 122 | } 123 | } 124 | 125 | cat("total number of samples:", length(sample_files), 126 | "number of success:", n.success, 127 | "number of fail:", n.fail, "\n") 128 | 129 | 130 | -------------------------------------------------------------------------------- /01_initial_call/run_PennCNV/step.2.run.PennCNV.jobs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscirpt 2 | 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system 4 | 5 | ## The script was used to run PennCNV on Minerva high performance cluster. 6 | ## You need to modifiy it according to the system you are using if you would like to use it. 7 | ## Please refer to original PennCNV documents (http://penncnv.openbioinformatics.org/en/latest/) for more information 8 | 9 | suppressMessages({ 10 | require( optparse, quietly = TRUE) 11 | }) 12 | 13 | option_list <- list( 14 | make_option(c("-p", "--penncnv"), action = "store", default = NA, type = "character", 15 | help = "path to PennCNV installation folder."), 16 | make_option(c("-a", "--data"), action = "store", default = NA, type = "character", 17 | help = "path to tab-delimit text data files for each sample."), 18 | make_option(c("-d", "--wkdir"), action = "store", default = NA, type = "character", 19 | help = "working directory."), 20 | make_option(c("-f", "--pfb"), action = "store", default = NA, type = "character", 21 | help = "pfb file."), 22 | make_option(c("-g", "--gcmodel"), action = "store", default = NA, type = "character", 23 | help = "gcmodel file."), 24 | make_option(c("-m", "--hmm"), action = "store", default = NA, type = "character", 25 | help = "HMM model file.") 26 | ) 27 | 28 | opt = parse_args(OptionParser(option_list = option_list)) 29 | 30 | path_penncnv <- opt$penncnv 31 | path_data <- opt$data 32 | path_wkdir <- opt$wkdir 33 | file_pfb <- opt$pfb 34 | file_gcmodel <- opt$gcmodel 35 | file_hmm <- opt$hmm 36 | 37 | if (any(is.na(c(path_data, path_wkdir, file_pfb, file_gcmodel, file_hmm)))) { 38 | stop("All parameters must be supplied. (--help for details)") 39 | } 40 | 41 | # create path ------------------------------------------------------------- 42 | 43 | path_list <- file.path(path_wkdir, "list") 44 | path_res <- file.path(path_wkdir, "res") ## PennCNV raw results folder 45 | 46 | if ( !dir.exists(path_list) ) { 47 | dir.create(path = path_list, showWarnings = FALSE, recursive = TRUE) 48 | } 49 | if ( !dir.exists(path_res) ) { 50 | dir.create(path = path_res, showWarnings = FALSE, recursive = TRUE) 51 | } 52 | 53 | 54 | # generate list.txt for each sample --------------------------------------- 55 | 56 | sample_files <- list.files(path = path_data) 57 | 58 | cat("number of samples:", length(sample_files), "\n") 59 | 60 | for ( i in 1:length(sample_files) ) { 61 | 62 | sample_file <- sample_files[i] 63 | sample_list <- sub("\\.txt$", ".list", sample_file) 64 | 65 | dat1 <- data.frame(file_name = file.path(path_data, sample_file), ## add whole path information 66 | stringsAsFactors = FALSE) 67 | write.table(dat1, file = file.path(path_list, sample_list), sep = "\t", 68 | row.names = FALSE, col.names = FALSE, quote = FALSE) 69 | } 70 | 71 | 72 | # cmd_PennCNV ------------------------------------------------------------- 73 | 74 | cmd_PennCNV <- function(file_hmm, file_pfb, file_gcmodel, 75 | filename_sample, path_list, path_res_sample) { 76 | 77 | file_list <- file.path(path_list, sub("\\.txt$", ".list", filename_sample)) 78 | 79 | samplename <- gsub(pattern = "\\.txt$", replacement = "", filename_sample) 80 | 81 | file_log <- file.path(path_res_sample, paste0(samplename, ".log")) 82 | file_rawcnv <- file.path(path_res_sample, paste0(samplename, ".rawcnv")) 83 | 84 | cmd <- paste(file.path(path_penncnv, "bin/detect_cnv.pl"), 85 | "-test --confidence", 86 | "-hmm", file_hmm, 87 | "-pfb", file_pfb, 88 | "-gcmodel", file_gcmodel, 89 | "-list", file_list, 90 | "-log", file_log, 91 | "-out", file_rawcnv) 92 | cmd 93 | } 94 | 95 | cmd_submitjob <- function(cmd.sample, samplename) { 96 | 97 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 98 | ## configure based on your system 99 | bsub.cmd <- paste("bsub -n 2 -W 00:30 -R 'rusage[mem=5000]' -P ", 100 | "-J", samplename, 101 | "-q premium", 102 | shQuote(cmd.sample)) 103 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 104 | 105 | bsub.cmd 106 | } 107 | 108 | # main loop --------------------------------------------------------------- 109 | 110 | for ( i in 1:length(sample_files) ) { 111 | 112 | sample_file <- sample_files[i] 113 | samplename <- gsub(pattern = "\\.txt$", replacement = "", sample_file) 114 | 115 | path_res_sample <- file.path(path_res, samplename) 116 | dir.create(path = path_res_sample, showWarnings = FALSE, recursive = TRUE) 117 | 118 | cat("Sample_ID:", samplename, "\n") 119 | 120 | cmd.sample <- cmd_PennCNV(file_hmm = file_hmm, 121 | file_pfb = file_pfb, 122 | file_gcmodel = file_gcmodel, 123 | filename_sample = sample_file, 124 | path_list = path_list, 125 | path_res_sample = path_res_sample) 126 | 127 | 128 | cmd.job <- cmd_submitjob(cmd.sample = cmd.sample, samplename = samplename) 129 | 130 | system(cmd.job) 131 | Sys.sleep(0.1) 132 | 133 | } 134 | 135 | 136 | -------------------------------------------------------------------------------- /01_initial_call/run_QuantiSNP/step.1.prepare.QuantiSNP.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system 4 | 5 | ## The script was used to run QuantiSNP on Minerva high performance cluster. 6 | ## You need to modifiy it according to the system you are using if you would like to use it. 7 | ## Please refer to original QuantiSNP documents (https://sites.google.com/site/quantisnp/) for more information 8 | 9 | ## sample file: in tab-delimited format and has two columns: Sample_ID and Gender 10 | ## for example 11 | # Sample_ID Gender 12 | # sample_1 Female 13 | # sample_2 Male 14 | 15 | suppressPackageStartupMessages(require(optparse)) 16 | 17 | option_list <- list( 18 | make_option(c("-q", "--quantisnp"), action = "store", default = NA, type = "character", 19 | help = "path to QuantiSNP installation folder."), 20 | make_option(c("-d", "--data"), action = "store", default = NA, type = "character", 21 | help = "data folder for runing QuantiSNP"), 22 | make_option(c("-s", "--sample"), action = "store", default = NA, type = "character", 23 | help = "sample file with Sample_ID and Gender information for runing QuantiSNP"), 24 | make_option(c("-r", "--result"), action = "store", default = NA, type = "character", 25 | help = "output folder for QuantiSNP results") 26 | ) 27 | 28 | opt <- parse_args(OptionParser(option_list = option_list)) 29 | if (is.na(opt$data) | is.na(opt$result)) { 30 | stop("All input and output arguments must be supplied.") 31 | } 32 | 33 | path_quantisnp <- opt$quantisnp 34 | path_dat <- opt$data 35 | sample_file <- opt$sample 36 | path_output <- opt$result 37 | 38 | dat_sample <- read.delim(file = sample_file, as.is = TRUE) 39 | 40 | cat("number of rows of sample table:", nrow(dat_sample), "\n") ## number of samples 41 | 42 | for (i in 1:nrow(dat_sample)) { 43 | 44 | sample_name <- as.character(dat_sample$Sample_ID[i]) 45 | gender <- tolower(as.character(dat_sample$Gender[i])) 46 | ## must change Female => female and Male => male 47 | 48 | ## check if folder exists 49 | res_files <- list.files(path = file.path(path_output, sample_name)) 50 | idx <- grep(pattern = "cnv", res_files) 51 | if (length(idx) > 0) { 52 | cat("i:", sample_name, "\n") 53 | next 54 | } 55 | 56 | ## define program variables 57 | EMITERS <- "10" ## number of EM iterations to use during training 58 | LSETTING <- "2000000" ## characteristic CNV length parameter 59 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 60 | GCDIR <- file.path(path_quantisnp, "data/b37/") ## path to GC data files (contents of gc_data.zip) 61 | PARAMSFILE <- file.path(path_quantisnp, "quantisnp/config/params.dat") ## path to parameters file 62 | LEVELSFILE <- file.path(path_quantisnp, "quantisnp/config/levels-hd.dat") ## path to levels file 63 | MCRROOT <- file.path(path_quantisnp, "v79/") ## path to MCR Run-Time Libraries 64 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 65 | CHRRANGE <- "1:23" ## chromosomes 66 | CHRX <- "23" ## which chromosome is X? 67 | OUTDIR <- file.path(path_output, sample_name) ## output directory 68 | SAMPLEID <- sample_name ## sample name 69 | GENDER <- gender ## sample gender 70 | INFILE <- file.path(path_dat, paste0(sample_name, ".txt")) ## input data file generated with finalreport_to_QuantiSNP.pl 71 | 72 | 73 | if (!file.exists(OUTDIR)) dir.create(OUTDIR) 74 | 75 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 76 | cmd <- paste(file.path(path_quantisnp, "quantisnp/linux64/run_quantisnp2.sh"), 77 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 78 | MCRROOT, 79 | paste("--chr", CHRRANGE), 80 | paste("--outdir", OUTDIR), 81 | paste("--sampleid", SAMPLEID), 82 | paste("--gender", GENDER), 83 | paste("--emiters", EMITERS), 84 | paste("--lsetting", LSETTING), 85 | paste("--gcdir", GCDIR), 86 | "--plot", 87 | "--genotype", 88 | paste("--config", PARAMSFILE), 89 | paste("--levels", LEVELSFILE), 90 | paste("--input-files", INFILE), 91 | paste("--chrX", CHRX), 92 | "--doXcorrect") 93 | 94 | job.name <- sample_name 95 | log.file <- file.path(OUTDIR, paste0(sample_name, ".quantisnp.log")) 96 | err.file <- file.path(OUTDIR, paste0(sample_name, ".quantisnp.err")) 97 | 98 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 99 | ## configure based on your system 100 | bsub.cmd <- paste("bsub -n 2 -W 02:00 -R 'rusage[mem=5000]' -P ", 101 | "-J", job.name, 102 | "-q premium", 103 | "-oo", log.file, 104 | "-eo", err.file , 105 | shQuote(cmd)) 106 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 107 | 108 | cat("i =", i, bsub.cmd, "\n") 109 | system(bsub.cmd) 110 | Sys.sleep(0.1) 111 | 112 | cat("i = ", i , sample_name, "\n") 113 | } 114 | 115 | -------------------------------------------------------------------------------- /03_create_CNVR/step.1.CNV.data.R: -------------------------------------------------------------------------------- 1 | #!/urs/bin/env Rscript 2 | 3 | args <- commandArgs( trailingOnly = TRUE ) 4 | 5 | path_output <- args[1] 6 | file_ipattern <- args[2] 7 | file_penncnv <- args[3] 8 | file_quantisnp <- args[4] 9 | sample_map <- args[5] 10 | 11 | ## Sample_Map.txt can be generated along with final report from Genome Studio 12 | ## used to check Sample_ID in the CNV results generated by individual methods 13 | 14 | suppressMessages({ 15 | require(data.table) 16 | }) 17 | 18 | ## selected columns from CNV results 19 | col_sel <- c("chr", "posStart", "posEnd", "CN", "Sample_ID", "conf", 20 | "numSNP", "avgConf", "length", "CNV_type", "method") 21 | 22 | sample <- read.delim(file = sample_map, as.is = TRUE) 23 | 24 | # generate data from iPattern, PennCNV, QuantiSNP CNV calls ------------------------ 25 | 26 | ## ipattern --------------------------------------------------------------------- 27 | 28 | read_icnv <- function(file_icnv, col_sel, sample) { 29 | 30 | cat("Read in CNV calls from iPattern ...\n") 31 | dat <- read.table(file = file_icnv, sep = "\t", check.names = FALSE, ##skip = 17, 32 | header = FALSE, comment.char = "#", 33 | stringsAsFactors = FALSE) 34 | 35 | names(dat) <- c("CNV_type", "chr", "posStart", "posEnd", 36 | "numSNP", "on_probe.num", "clusterIdx", 37 | "gain_loss_score", "cluster_score", "gain_loss_sample.num", 38 | "conf", "Sample_ID", "CNV_event_ID", "CNVR_ID") 39 | dat$length <- dat$posEnd - dat$posStart + 1 40 | dat$avgConf <- dat$conf/dat$numSNP 41 | 42 | # filter chr, CNV_type 43 | dat <- subset(dat, chr %in% c(1:22) & CNV_type %in% c("Gain", "Loss")) 44 | dat$CN <- ifelse(dat$CNV_type == "Gain", 3, 1) 45 | dat$chr <- as.integer(dat$chr) 46 | dat$method <- "iPattern" 47 | 48 | dat$Sample_ID <- gsub("^X", "", dat$Sample_ID, perl = TRUE) 49 | #dat$Sample_ID <- gsub(".", "-", dat$Sample_ID, fixed = TRUE) 50 | 51 | ## iPattern converts "-" in Sample_ID to "." 52 | ## recover the original Sample_ID 53 | idx <- grep("-", sample$Sample_ID) 54 | samples.raw <- sample$Sample_ID[ idx ] 55 | samples.alt <- sub("-", ".", samples.raw) 56 | 57 | for (i in 1:length(samples.alt)) { 58 | idxs1 <- which( dat$Sample_ID == samples.alt[i] ) 59 | if (length(idxs1) > 0 ) dat$Sample_ID[ idxs1 ] <- samples.raw[i] 60 | } 61 | 62 | stopifnot( all(dat$Sample_ID %in% sample$Sample_ID) ) 63 | dat[, col_sel] ## selected columns 64 | } 65 | 66 | # merge all groups results 67 | dat_ipattern <- read_icnv( file_icnv = file_ipattern, col_sel = col_sel, sample = sample ) 68 | 69 | write.table( dat_ipattern, 70 | file = file.path(path_output, "cnv.ipattern.txt"), 71 | quote = F, row.names = F, sep = "\t") 72 | 73 | 74 | # penncnv ----------------------------------------------------------------- 75 | 76 | read_pcnv <- function(file_pcnv, col_sel, sample) { 77 | 78 | cat("Read in CNV calls from PennCNV ...\n") 79 | dat <- read.table(file = file_pcnv, sep = "\t", check.names = FALSE, 80 | header = FALSE, stringsAsFactors = FALSE, 81 | comment.char = "") 82 | names(dat) <- c("chr", "posStart", "posEnd", "CN", "Sample_ID", "snpStart", "snpEnd", "conf", "numSNP") 83 | dat$Sample_ID <- gsub("\\.txt", "", dat$Sample_ID) 84 | dat$length <- dat$posEnd - dat$posStart + 1 85 | dat$avgConf <- dat$conf/dat$numSNP 86 | 87 | dat <- subset(dat, chr %in% c(1:22) & CN != 2) 88 | dat$CNV_type <- ifelse(dat$CN > 2, "Gain", "Loss") 89 | dat$method <- "PennCNV" 90 | dat$CN[which(dat$CN >= 3)] <- 3 ## set CN >= 3 to CN = 3 91 | 92 | stopifnot( all(dat$Sample_ID %in% sample$Sample_ID) ) 93 | dat[, col_sel] 94 | } 95 | 96 | dat_penncnv <- read_pcnv(file_pcnv = file_penncnv, col_sel = col_sel, sample = sample ) 97 | 98 | write.table( dat_penncnv, 99 | file = file.path(path_output, "cnv.penncnv.txt"), 100 | quote = F, row.names = F, sep = "\t") 101 | 102 | 103 | # quantisnp --------------------------------------------------------------- 104 | # read from combined CNV results from all individuals 105 | 106 | read_qcnv <- function(file_qcnv, col_sel, sample) { 107 | 108 | cat("Read in CNV calls from QuantiSNP ...\n") 109 | dat <- read.table(file = file_qcnv, 110 | sep = "\t", 111 | header = TRUE, 112 | check.names = FALSE, 113 | stringsAsFactors = FALSE, 114 | comment.char = "") 115 | 116 | ## change column name Max.log BF => conf 117 | names(dat) <- c("Sample_ID", "chr", "posStart", "posEnd", "snpStart", "snpEnd", "length", "numSNP", 118 | "CN", "conf", "Log_BF.State.0", "Log_BF.State.1", "Log_BF.State.2", "Log_BF.State.3", 119 | "Log_BF.State.4", "Log_BF.State.5", "Log_BF.State.6") 120 | 121 | dat <- subset(dat, chr %in% c(1:22) & CN != 2) 122 | dat$CNV_type <- ifelse(dat$CN > 2, "Gain", "Loss") 123 | dat$avgConf <- dat$conf/dat$numSNP 124 | dat$method <- "QuantiSNP" 125 | dat$CN[which(dat$CN >= 3)] <- 3 # set CN >= 3 to CN = 3 126 | 127 | stopifnot( all(dat$Sample_ID %in% sample$Sample_ID) ) 128 | dat[, col_sel] 129 | } 130 | 131 | dat_quantisnp <- read_qcnv(file_qcnv = file_quantisnp, col_sel = col_sel, sample = sample ) 132 | 133 | write.table( dat_quantisnp, 134 | file = file.path(path_output, "cnv.quantisnp.txt"), 135 | quote = F, row.names = F, sep = "\t") 136 | 137 | -------------------------------------------------------------------------------- /01_initial_call/run_iPattern/README.md: -------------------------------------------------------------------------------- 1 | ## iPattern 2 | 3 | ### Installation 4 | 5 | To request the iPattern package, please contact the corresponding author Dr. Stephen W. Scherer (stephen.scherer@sickkids.ca) of the [paper](https://www.ncbi.nlm.nih.gov/pubmed/20531469). For more information about iPattern, please refer to the [paper](https://www.ncbi.nlm.nih.gov/pubmed/?term=21552272). 6 | 7 | After obtaining the package (e.g., ipn.0.581.tar.gz is the version we received), please follow the instructions in the iPattern tutorial enclosed in the package for installation and usage. Here we echo the installation instructions in their tutorial. 8 | 9 | #### Requirements 10 | - R (2.7.1+) 11 | - The R "ppc" package – it can be downloaded from http://www-stat.stanford.edu/~tibs/PPC/Rdist/index.html 12 | - The R "cluster" package (1.15.2+) 13 | - Python (2.5.5+) 14 | 15 | #### Setup 16 | 17 | - untar the package file with `tar -zvxf ipn.0.581.tar.gz` 18 | - setup environment: 19 | - set up environment variable IPNBASE: `export IPNBASE='/path/to/ipn_0.581'` 20 | - set up environment variable PYTHONPATH: `PYTHONPATH=$PYTHONPATH:'/path/to/ipn_0.581/ipnlib'` 21 | 22 | Note: the directory structure/name must be kept as it is. Changing the directory structure will break the iPattern pipeline, the pipeline finds all the necessary scripts based on IPNBASE and the directory structure. When PBS job submitting system is not available, you can use `–-noqsub` option to run iPattern sequentially. 23 | 24 | Remark: 25 | 26 | - In version 0.581, `${IPNBASE}/ipnlib/IpnFormat.py` will process the columns `Allele1 - Forward` and `Allele2 - Forward` in the final report (see the detailed decription of [data](https://github.com/HaoKeLab/ensembleCNV#data)). If only the `Allele1 - Top` and `Allele1 - Top` columns exist instead of the `Allele1 - Forward` and `Allele2 - Forward` columns in the final report, the users need to substitute `'Allele1 - Forward'` and `'Allele2 - Forward'` to the corresponding code `'Allele1 - Forward'` and `'Allele2 - Forward'` appearing in the `class IPNFormat` block of `${IPNBASE}/ipnlib/IpnFormat.py`. 27 | 28 | - In version 0.581, two reference files `${IPNBASE}/ipn/known.cnvr.txt` and `${IPNBASE}/preprocess/ref_files/pq.txt` in the iPattern package are in hg18. We perpared a hg19 version [here](https://github.com/HaoKeLab/ensembleCNV/tree/master/01_initial_call/run_iPattern/ref_files_hg19) by [LiftOver](https://genome.ucsc.edu/cgi-bin/hgLiftOver). The users can substitute the two files when processing hg19 data. 29 | 30 | ### Analysis workflow 31 | 32 | #### Prepare auxiliary input files 33 | 34 | In addition to the sample-wise final report files in `${WKDIR}/01_initial_call/run_iPattern/data`, which are supposed to have been generated by `${WKDIR}/01_initial_call/prepare_IPQ_input_file/finalreport_to_iPattern.pl`, three other auxiliary input files for iPattern can be generated as follows. 35 | 36 | ```sh 37 | PROJECT_NAME= 38 | Rscript ${WKDIR}/01_initial_call/run_iPattern/prepare_input_files_for_iPattern.R ${WKDIR} ${PROJECT_NAME} 39 | ``` 40 | 41 | When the processing is completed, three files are supposed to be generated at `${WKDIR}/01_initial_call/run_iPattern/data_aux`: 42 | 43 | - `${PROJECT_NAME}_data_file.txt`: lists the absolute path to all the sample-wise final report files in `${WKDIR}/01_initial_call/run_iPattern/data`, so that iPattern knows where to find these data files. 44 | 45 | - `${PROJECT_NAME}_gender_file.txt`: tab-delimited table including two columns (without column names in table header): Sample ID and Gender ("M" for male and "F" for female), which is generated based on `${WKDIR}/data/Samples_Table.txt` (see the detailed decription of [data](https://github.com/HaoKeLab/ensembleCNV#data)). 46 | 47 | - `${PROJECT_NAME}_bad_samples.txt`: is used to list sample IDs to be excluded from iPattern analysis. We prepared an empty file where the users can type in the sample IDs to be excluded from the analysis if there is any. 48 | 49 | #### Run iPattern 50 | 51 | ```sh 52 | ${IPNBASE}/ipn_0.581/preprocess/ilmn/ilmn_run.py \ 53 | --data-file-list ${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_data_file.txt \ 54 | --gender-file ${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_gender_file.txt \ 55 | --bad-sample-file ${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_bad_samples.txt \ 56 | --experiment $PROJECT_NAME \ 57 | --output-directory ${WKDIR}/01_initial_call/run_iPattern/results/ \ 58 | --do-log \ 59 | --do-cleanup \ 60 | --noqsub 61 | ``` 62 | When the analysis is completed, you will find two files, which will be used by ensembleCNV, in the directory `${WKDIR}/01_initial_call/run_iPattern/results`: 63 | - `${PROJECT_NAME}_all_calls.txt`: raw CNV calls of all samples. 64 | - `${PROJECT_NAME}_sample.stats.txt`: sample-level summary statistics. 65 | 66 | Note: 67 | - All other parameters for `ilmn_run.py` are set by their default values. 68 | - When the sample size of the project is large, the authors of iPattern recommend the whole dataset be split into batches with balanced sample size in order to control for the number of CNV calls per sample. The batches are analyzed by iPattern independently. In each batch (or iPattern run), a minimum of 90-96 samples (e.g. one 96-well plate of samples) and a maximum of 400 samples are recommended based on iPattern tutorial. Creating batches can be easily implemented by splitting `${PROJECT_NAME}_data_file.txt` into batch-level data files (e.g. `${PROJECT_NAME}_batch1_data_file.txt`, `${PROJECT_NAME}_batch2_data_file.txt`, etc.) with each batch having a batch-specific project name (e.g., `${PROJECT_NAME}_batch1`, `${PROJECT_NAME}_batch2`, etc.), while `${PROJECT_NAME}_gender_file.txt` and `${PROJECT_NAME}_bad_samples.txt` remains unchanged. When the analysis for all batches are completed, the batch-wise results (e.g., `${PROJECT_NAME}_batch*_all_calls.txt` and `${PROJECT_NAME}_batch*_sample.stats.txt`) will need to be combined into the final results for the whole project (e.g., `${PROJECT_NAME}_all_calls.txt` and `${PROJECT_NAME}_sample.stats.txt`). 69 | -------------------------------------------------------------------------------- /01_initial_call/run_QuantiSNP/step.2.check.QuantiSNP.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system 4 | 5 | ## The script was used to run QuantiSNP on Minerva high performance cluster. 6 | ## You need to modifiy it according to the system you are using if you would like to use it. 7 | ## Please refer to original QuantiSNP documents (https://sites.google.com/site/quantisnp/) for more information 8 | 9 | suppressPackageStartupMessages(require(optparse)) 10 | 11 | ## function ------------------------------------------------------------------ 12 | run.quantisnp <- function(path_output, path_dat, sample_name, gender) { 13 | 14 | ## define program variables 15 | EMITERS <- "10" ## number of EM iterations to use during training 16 | LSETTING <- "2000000" ## characteristic CNV length parameter 17 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 18 | GCDIR <- file.path(path_quantisnp, "data/b37/") ## path to GC data files (contents of gc_data.zip) 19 | PARAMSFILE <- file.path(path_quantisnp, "quantisnp/config/params.dat") ## path to parameters file 20 | LEVELSFILE <- file.path(path_quantisnp, "quantisnp/config/levels-hd.dat") ## path to levels file 21 | MCRROOT <- file.path(path_quantisnp, "v79/") ## path to MCR Run-Time Libraries 22 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 23 | CHRRANGE <- "1:23" ## chromosome 24 | CHRX <- "23" ## which chromosome is X? 25 | OUTDIR <- file.path(path_output, sample_name) ## output directory 26 | SAMPLEID <- sample_name ## sample name 27 | GENDER <- gender ## sample gender 28 | INFILE <- file.path(path_dat, paste0(sample_name, ".txt")) ## input data file 29 | 30 | if (!file.exists(OUTDIR)) dir.create(OUTDIR) 31 | 32 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 33 | cmd <- paste(file.path(path_quantisnp, "quantisnp/linux64/run_quantisnp2.sh"), 34 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 35 | MCRROOT, 36 | paste("--chr", CHRRANGE), 37 | paste("--outdir", OUTDIR), 38 | paste("--sampleid", SAMPLEID), 39 | paste("--gender", GENDER), 40 | paste("--emiters", EMITERS), 41 | paste("--lsetting", LSETTING), 42 | paste("--gcdir", GCDIR), 43 | "--plot", 44 | "--genotype", 45 | paste("--config", PARAMSFILE), 46 | paste("--levels", LEVELSFILE), 47 | paste("--input-files", INFILE), 48 | paste("--chrX", CHRX), 49 | "--doXcorrect") 50 | 51 | job.name <- sample_name 52 | log.file <- file.path(OUTDIR, paste0(sample_name, ".quantisnp.log")) 53 | err.file <- file.path(OUTDIR, paste0(sample_name, ".quantisnp.err")) 54 | 55 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 56 | bsub.cmd <- paste("bsub -n 2 -W 10:00 -R 'rusage[mem=5000]' -P ", 57 | "-J", job.name, 58 | "-q premium", 59 | "-oo", log.file, 60 | "-eo", err.file , 61 | shQuote(cmd)) 62 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 63 | 64 | cat(bsub.cmd, "\n") 65 | system(bsub.cmd) 66 | 67 | } 68 | 69 | ## =============================================================================================== 70 | 71 | option_list <- list( 72 | make_option(c("-q", "--quantisnp"), action = "store", default = NA, type = "character", 73 | help = "path to QuantiSNP installation folder."), 74 | make_option(c("-d", "--data"), default = NA, type = "character", action = "store", 75 | help = "data folder for runing QuantiSNP."), 76 | make_option(c("-s", "--sample"), action = "store", default = NA, type = "character", 77 | help = "sample file with Sample_ID and Gender information for runing QuantiSNP"), 78 | make_option(c("-r", "--result"), default = NA, type = "character", action = "store", 79 | help = "path to CNV results generated in the first step.") 80 | ) 81 | 82 | opt <- parse_args(OptionParser(option_list = option_list)) 83 | 84 | if (is.na(opt$data) | is.na(opt$result)) { 85 | stop("Three input argument must be supplied.") 86 | } 87 | 88 | # get paras 89 | path_quantisnp <- opt$quantisnp 90 | path_data <- opt$data 91 | sample_file <- opt$sample 92 | path_res <- opt$result 93 | 94 | dat_sample <- read.delim(file = sample_file, as.is = TRUE) 95 | 96 | cat("number of rows of sample table:", nrow(dat_sample), "\n") ## number of samples 97 | 98 | samples <- dat_sample$Sample_ID 99 | genders <- tolower(dat_sample$Gender) 100 | ## must change Female => female and Male => male 101 | 102 | n.success <- 0 103 | n.fail <- 0 104 | for (i in 1:length(samples)) { 105 | 106 | sample_name <- samples[i] 107 | gender <- genders[i] 108 | path_sample1 <- file.path(path_res, sample_name) 109 | 110 | if (dir.exists(paths = path_sample1)) { 111 | 112 | # check if .cnv file have been generated 113 | files <- list.files(path = path_sample1) 114 | idx1 <- grep(pattern = ".cnv", files) 115 | if (length(idx1) == 1) { 116 | n.success <- n.success + 1 117 | cat("Sample_ID:", sample_name, "SUCCESS.\n") 118 | } else { 119 | n.fail <- n.fail + 1 120 | cat("Sample_ID:", sample_name, "FAILED.\n") 121 | run.quantisnp(path_output = path_res, path_dat = path_data, sample_name = sample_name, gender = gender) 122 | } 123 | 124 | } else { 125 | n.fail <- n.fail + 1 126 | cat("Sample_ID:", sample_name, "FAILED.\n") 127 | run.quantisnp(path_output = path_res, path_dat = path_data, sample_name = sample_name, gender = gender) 128 | } 129 | 130 | } 131 | 132 | cat("total number of samples:", length(samples), 133 | "number of success:", n.success, 134 | "number of fail:", n.fail, "\n") 135 | -------------------------------------------------------------------------------- /04_CNV_genotype/scripts/fun_plot_steps.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # add dup pairs flag 4 | add_dup_pairs_flag <- function(dt, dup_pairs) { 5 | 6 | # dup pair with dup_flag not equal to 0 7 | dt$dup_flag <- 0 8 | for (i in 1:nrow(dup_pairs)) { 9 | samples <- c(dup_pairs$sample1.name[i], dup_pairs$sample2.name[i]) 10 | idxs <- which(dt$Sample_ID %in% samples) 11 | if (length(idxs) >= 1) { 12 | dt$dup_flag[idxs] <- i 13 | } 14 | } 15 | 16 | dt 17 | } 18 | 19 | # tranfrom dt_LRRBAF from gatk to raw method we used LRR12/BAF12 20 | add_LRRBAF_ratio <- function(dt) { 21 | 22 | # add log_ratio LRR12 and BAF12 and LRR32 and BAF32 23 | dt$LRR12 <- log(dt$LRR1/dt$LRR2) 24 | dt$BAF12 <- log(dt$BAF1/dt$BAF2) 25 | 26 | dt$LRR32 <- log(dt$LRR3/dt$LRR2) 27 | dt$BAF32 <- log(dt$BAF3/dt$BAF2) 28 | 29 | return(dt) 30 | } 31 | 32 | # plot model 33 | plot_model <- function(paras, dt_cnvr, title) { 34 | 35 | mu1 <- paras$mu[1] 36 | sigma1 <- paras$sigma[1] 37 | lambda1 <- paras$lambda[1] 38 | 39 | mu2 <- paras$mu[2] 40 | sigma2 <- paras$sigma[2] 41 | lambda2 <- paras$lambda[2] 42 | 43 | mu3 <- paras$mu[3] 44 | sigma3 <- paras$sigma[3] 45 | lambda3 <- paras$lambda[3] 46 | 47 | x <- dt_cnvr$LRR_median 48 | range_x <- range(x) 49 | 50 | xs <- seq(range_x[1], range_x[2], length.out = 800) 51 | dt <- data.frame(x = xs, stringsAsFactors = F) 52 | 53 | dt1 <- data.frame(x = xs, d = lambda1*dnorm(xs, mean = mu1, sd = sigma1), CN = 1) 54 | dt3 <- data.frame(x = xs, d = lambda3*dnorm(xs, mean = mu3, sd = sigma3), CN = 3) 55 | dt2 <- data.frame(x = xs, d = lambda2*dnorm(xs, mean = mu2, sd = sigma2), CN = 2) 56 | dt123 <- rbind(dt1, dt2, dt3) 57 | dt123$CN <- as.factor(dt123$CN) 58 | 59 | p <- ggplot(data = dt_cnvr, aes(LRR_median, y = ..density..)) + 60 | geom_histogram(bins = 100, fill = NA, color = "black") + 61 | geom_line(data = dt123, aes(x, d, col = CN), lwd = 1.5) + 62 | theme_bw(base_size = 10) + 63 | labs(title = title, 64 | subtitle = paste("mu1:", round(mu1, 2), "mu2:", round(mu2, 2), "mu3:", round(mu3, 2), "\n", 65 | "sd1:", round(sigma1, 2), "sd2:", round(sigma2, 2), "sd3:", round(sigma3, 2))) 66 | p 67 | } 68 | 69 | 70 | # plot steps 71 | plot_steps <- function(dt_cnvr_train, dup_pairs, paras, dt_cnvr_raw, dt_LRRBAF) { 72 | 73 | dt_cnvr_train <- dt_cnvr_train[order(dt_cnvr_train$CN), ] 74 | dt_cnvr_train$idx <- 1:nrow(dt_cnvr_train) 75 | 76 | dt_cnvr_raw <- dt_cnvr_raw[order(dt_cnvr_raw$CN), ] 77 | dt_cnvr_raw$idx <- 1:nrow(dt_cnvr_raw) 78 | 79 | dt_dup <- data.frame() 80 | dt_dup_raw <- data.frame() 81 | if (! is.null(dup_pairs) ) { 82 | # add flag for dup 83 | dt_cnvr_train <- add_dup_pairs_flag(dt = dt_cnvr_train, dup_pairs = dup_pairs) 84 | dt_dup <- subset(dt_cnvr_train, dup_flag != 0) 85 | 86 | dt_cnvr_raw <- add_dup_pairs_flag(dt = dt_cnvr_raw, dup_pairs = dup_pairs) 87 | dt_dup_raw <- subset(dt_cnvr_raw, dup_flag != 0) 88 | } 89 | 90 | numsnp <- unique(dt_cnvr_raw$numSNP) 91 | 92 | plot1 <- ggplot(data = dt_cnvr_raw, aes(idx, LRR_median, col = factor(CN))) + 93 | geom_point() + 94 | theme_bw(base_size = 10) + 95 | annotate("text", x = dt_dup_raw$idx, y = dt_dup_raw$LRR_median, label = dt_dup_raw$dup_flag) + 96 | labs(title = paste("scatter plot of LRR_median with numsnp:", numsnp)) + 97 | theme(legend.position = "top") 98 | 99 | # plot1 100 | 101 | # add gmm model paras 102 | plot2 <- plot_model(dt_cnvr = dt_cnvr_train, paras = paras, 103 | title = "fit model for LRR_median, only contain CN = 1/2/3") 104 | 105 | # plot2 106 | 107 | # -------------------------------------------------- 108 | # add steps infromation 109 | # filter CN != 0 110 | dt_LRRBAF <- subset(dt_LRRBAF, CN_gatk_pred != 0) 111 | 112 | dt_LRRBAF <- add_dup_pairs_flag(dt = dt_LRRBAF, dup_pairs = dup_pairs) 113 | dt_LRRBAF_new <- add_LRRBAF_ratio(dt = dt_LRRBAF) 114 | 115 | # step1 116 | dt1_gatk <- subset(dt_LRRBAF_new, CN_gatk_pred == 1) 117 | dt1_annotate <- subset(dt_LRRBAF_new, dup_flag != 0) 118 | plot_step1 <- ggplot() + 119 | geom_point(data = dt_LRRBAF_new, aes(BAF12, LRR12), col = "gray") + 120 | geom_vline(xintercept = 0, lty = 2, lwd = 1) + 121 | geom_hline(yintercept = 0, lty = 2, lwd = 1) + 122 | theme_bw(base_size = 10) + 123 | geom_point(data = dt1_gatk, aes(BAF12, LRR12), col = "red") + 124 | annotate(geom = "text", x = dt1_annotate$BAF12, y = dt1_annotate$LRR12, label = dt1_annotate$dup_flag) + 125 | ggtitle(label = "step 1 for CN = 1") 126 | 127 | # step2 128 | dt3 <- subset(dt_LRRBAF_new, LRR12 <= 0 | BAF12 <= 0) 129 | dt3_gatk <- subset(dt_LRRBAF_new, CN_gatk_pred == 3) 130 | dt3_annotate <- subset(dt3, dup_flag != 0) 131 | plot_step2 <- ggplot() + 132 | geom_point(data = dt3, aes(BAF32, LRR32), col = "gray") + 133 | geom_vline(xintercept = 0, lty = 2, lwd = 1) + 134 | geom_hline(yintercept = 0, lty = 2, lwd = 1) + 135 | theme_bw(base_size = 10) + 136 | geom_point(data = dt3_gatk, aes(BAF32, LRR32), col = "red") + 137 | annotate(geom = "text", x = dt3_annotate$BAF32, y = dt3_annotate$LRR32, label = dt3_annotate$dup_flag) + 138 | ggtitle(label = "step 2 for CN = 3") 139 | 140 | 141 | ps <- gridExtra::grid.arrange(plot1, plot_step1, plot2, plot_step2, nrow = 2) 142 | 143 | return(ps) 144 | } 145 | 146 | 147 | # plot model_final 148 | plot_model_final <- function(paras, dt_cnvr, title) { 149 | 150 | mu1 <- paras$mu[2] 151 | sigma1 <- paras$sigma[2] 152 | lambda1 <- paras$lambda[2] 153 | 154 | mu2 <- paras$mu[3] 155 | sigma2 <- paras$sigma[3] 156 | lambda2 <- paras$lambda[3] 157 | 158 | mu3 <- paras$mu[4] 159 | sigma3 <- paras$sigma[4] 160 | lambda3 <- paras$lambda[4] 161 | 162 | 163 | # transfrom lambdas -------------- 164 | lambdas <- lambda1 + lambda2 + lambda3 165 | lambda1 <- lambda1/lambdas 166 | lambda2 <- lambda2/lambdas 167 | lambda3 <- lambda3/lambdas 168 | 169 | x <- dt_cnvr$LRR_median 170 | range_x <- range(x) 171 | 172 | xs <- seq(range_x[1], range_x[2], length.out = 800) 173 | dt <- data.frame(x = xs, stringsAsFactors = F) 174 | 175 | dt1 <- data.frame(x = xs, d = lambda1*dnorm(xs, mean = mu1, sd = sigma1), CN = 1) 176 | dt2 <- data.frame(x = xs, d = lambda2*dnorm(xs, mean = mu2, sd = sigma2), CN = 2) 177 | dt3 <- data.frame(x = xs, d = lambda3*dnorm(xs, mean = mu3, sd = sigma3), CN = 3) 178 | 179 | dt123 <- rbind(dt1, dt2, dt3) 180 | dt123$CN <- as.factor(dt123$CN) 181 | 182 | p <- ggplot(data = dt_cnvr, aes(LRR_median, y = ..density..)) + 183 | geom_histogram(bins = 100, fill = NA, color = "black") + 184 | geom_line(data = dt123, aes(x, d, col = CN), lwd = 1.5) + 185 | theme_bw(base_size = 10) + 186 | labs(title = title, 187 | subtitle = paste("mu1:", round(mu1, 2), "mu2:", round(mu2, 2), "mu3:", round(mu3, 2), "\n", 188 | "sd1:", round(sigma1, 2), "sd2:", round(sigma2, 2), "sd3:", round(sigma3, 2))) 189 | p 190 | } 191 | -------------------------------------------------------------------------------- /02_batch_effect/PCA_on_summary_stats/step.1.prepare.stats.R: -------------------------------------------------------------------------------- 1 | #!/urs/bin/env Rscript 2 | 3 | args <- commandArgs( trailingOnly = TRUE ) 4 | 5 | path_ipattern <- args[1] 6 | path_penncnv <- args[2] 7 | path_quantisnp <- args[3] 8 | path_output <- args[4] 9 | 10 | suppressMessages({ 11 | require(data.table) 12 | }) 13 | 14 | # ipattern ---------------------------------------------------------------- 15 | ## for number of samples larger than 500, samples may need to be splited into batches to run ipattern 16 | read_ipattern_batch <- function(path_ipattern) { 17 | 18 | ## NumCNV 19 | cnv_file <- list.files(path = path_ipattern, pattern = "_all_calls.txt$") 20 | 21 | dat <- read.table(file = file.path(path_ipattern, cnv_file), 22 | header = FALSE, sep = "\t", comment.char = "#", as.is = TRUE) 23 | names(dat) <- c("CNV_type", "chr", "posStart", "posEnd", 24 | "numSNP", "on_probe.num", "clusterIdx", 25 | "gain_loss_score", "cluster_score", "gain_loss_sample.num", 26 | "conf", "Sample_ID", "CNV_event_ID", "CNVR_ID") 27 | dat <- subset(dat, chr %in% c(1:22)) 28 | tbl <- table(dat$Sample_ID) 29 | dat_tbl <- as.data.frame(tbl) 30 | names(dat_tbl) <- c("Sample_ID", "iPattern.NumCNV") 31 | NumSample <- nrow(dat_tbl) 32 | 33 | ## sample.stats.txt 34 | stat_file <- list.files(path = path_ipattern, pattern = "_sample.stats.txt$") 35 | 36 | dat_stat <- read.table(file = file.path(path_ipattern, stat_file), 37 | header = FALSE, sep = "\t", nrows = NumSample, as.is = TRUE) 38 | names(dat_stat) <- c("Sample_ID", "iPattern.LRR_SD", "iPattern.base_CN") 39 | dat_stat <- dat_stat[, c("Sample_ID", "iPattern.LRR_SD")] 40 | 41 | ## clean sample ID: remove path information, remove subfix ".rescale" 42 | samples <- dat_stat$Sample_ID 43 | samples <- unlist( lapply(1:length(samples), FUN = function(k) { 44 | sample1 <- samples[k] 45 | strs <- unlist(strsplit(sample1, split = "/", fixed = TRUE)) 46 | str1 <- strs[length(strs)] 47 | }) ) 48 | samples <- gsub("\\.rescale$", "", samples) 49 | dat_stat$Sample_ID <- samples 50 | 51 | res <- merge(dat_stat, dat_tbl) 52 | ## if Sample_ID starts with number 53 | res$Sample_ID <- gsub(pattern = "^X", replacement = "", res$Sample_ID, perl = TRUE) ## check 54 | 55 | res 56 | } 57 | 58 | cat("Processing iPattern results ...\n") 59 | dat_stats_ipattern <- read_ipattern_batch(path_ipattern = path_ipattern) 60 | 61 | write.table(dat_stats_ipattern, 62 | file = file.path(path_output, "ipattern.stats.txt"), 63 | quote = F, row.names = F, sep = "\t") 64 | cat("Done.\n") 65 | 66 | # penncnv sample-level ----------------------------------------------------- 67 | cat("Processing PennCNV results ...\n") 68 | dat_penncnv <- read.table(file = file.path(path_penncnv, "CNV.PennCNV_qc_new.txt"), 69 | sep = "\t", 70 | header = TRUE, 71 | check.names = FALSE, 72 | stringsAsFactors = FALSE) 73 | dat_penncnv$File <- gsub("\\.txt$", "", dat_penncnv$File, perl = TRUE) 74 | dat_penncnv$WF <- abs(dat_penncnv$WF) 75 | 76 | fp <- c( "LRR_SD", "BAF_SD", "BAF_drift", "WF", "NumCNV" ) 77 | dat_penncnv <- dat_penncnv[, c("File", fp)] 78 | names(dat_penncnv) <- c("Sample_ID", paste("PennCNV", fp, sep = ".")) 79 | 80 | dat_stats_penncnv <- dat_penncnv 81 | 82 | write.table(dat_stats_penncnv, 83 | file = file.path(path_output, "penncnv.stats.txt"), 84 | quote = F, row.names = F, sep = "\t") 85 | cat("Done.\n") 86 | 87 | # quantisnp --------------------------------------------------------------- 88 | read_quantisnp_per_sample <- function(path_res, sample_id) { 89 | 90 | ## get numCNV 91 | file_cnv <- file.path(file.path(path_res, sample_id), paste0(sample_id, ".cnv")) 92 | dat_cnv <- fread(input = file_cnv) 93 | numCNV <- sum(dat_cnv$Chromosome %in% c(1:22)) 94 | 95 | ## get LRR.SD and BAF.SD 96 | ## Note: in the .qc file, QuantiSNP has formatting issue 97 | ## the column name "Gender" is written at the start of the second line 98 | file_qc <- file.path(file.path(path_res, sample_id), paste0(sample_id, ".qc")) 99 | dat_line2 <- read.table(file = file_qc, skip = 1, nrows = 1, header = FALSE, stringsAsFactors = FALSE) 100 | dat_line2 <- dat_line2[, -1] 101 | names(dat_line2) <- c("Sample_ID", "Chr", "OutlierRate", "LRR_SD", "BAF_SD", "Gender") 102 | 103 | dat_other <- read.table(file = file_qc, skip = 2, header = FALSE, stringsAsFactors = FALSE) 104 | names(dat_other) <- c("Sample_ID", "Chr", "OutlierRate", "LRR_SD", "BAF_SD", "Gender") 105 | dat <- rbind(dat_line2, dat_other) 106 | names(dat) <- c("Sample_ID", "Chr", "OutlierRate", "LRR_SD", "BAF_SD", "Gender") 107 | dat <- subset(dat, Chr %in% c(1:22)) 108 | 109 | Sample_ID <- unique(dat$Sample_ID) 110 | LRR_SD <- mean(dat$LRR_SD, na.rm = TRUE) 111 | BAF_SD <- mean(dat$BAF_SD, na.rm = TRUE) 112 | 113 | res1 <- data.frame(Sample_ID = Sample_ID, 114 | QuantiSNP.NumCNV = numCNV, 115 | QuantiSNP.LRR_SD = LRR_SD, 116 | QuantiSNP.BAF_SD = BAF_SD, 117 | stringsAsFactors = FALSE) 118 | return(res1) ## for one sample 119 | } 120 | 121 | read_quantisnp <- function(path_res) { 122 | 123 | samples <- list.files(path = path_res) 124 | res <- data.frame() ## all QuantiSNP statistics 125 | for (i in 1:length(samples)) { 126 | 127 | sample1 <- samples[i] 128 | #cat("i:", i, length(samples), "SampleID:", sample1, "\n") 129 | 130 | res1 <- read_quantisnp_per_sample(path_res = path_res, sample_id = sample1) 131 | res <- rbind(res, res1) 132 | } 133 | res 134 | } 135 | 136 | cat("Processing QuantiSNP results ...\n") 137 | dat_stats_quantisnp <- read_quantisnp(path_res = path_quantisnp) 138 | 139 | write.table(dat_stats_quantisnp, 140 | file = file.path(path_output, "quantisnp.stats.txt"), 141 | quote = F, row.names = F, sep = "\t") 142 | cat("Done.") 143 | 144 | 145 | # IPQ --------------------------------------------------------------------- 146 | 147 | cat("Combine summary statistics from different methods ...\n") 148 | ## iPattern converts "-" in Sample_ID to "." 149 | ## recover the original Sample_ID 150 | idx <- grep("-", dat_stats_penncnv$Sample_ID) 151 | samples.raw <- dat_stats_penncnv$Sample_ID[ idx ] 152 | samples.alt <- sub("-", ".", samples.raw) 153 | 154 | for (i in 1:length(samples.alt)) { 155 | idx1 <- which(dat_stats_ipattern$Sample_ID == samples.alt[i]) 156 | dat_stats_ipattern$Sample_ID[ idx1 ] <- samples.raw[i] 157 | } 158 | 159 | res_IP <- merge(dat_stats_ipattern, dat_stats_penncnv) 160 | stopifnot( nrow(res_IP) == nrow(dat_stats_ipattern)) 161 | 162 | res_IPQ <- merge(res_IP, dat_stats_quantisnp) 163 | stopifnot( nrow(res_IPQ) == nrow(res_IP) ) 164 | 165 | write.table(res_IPQ, 166 | file = file.path(path_output, "IPQ.stats.txt"), 167 | quote = F, row.names = F, sep = "\t") 168 | cat("Done.\n") 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /example/example_create_CNVR/results/cnv.ipattern.txt: -------------------------------------------------------------------------------- 1 | chr posStart posEnd CN Sample_ID conf numSNP avgConf length CNV_type method 2 | 1 25598276 25642596 1 Sample86.1_3R05C01 19.07 8 2.38375 44321 Loss iPattern 3 | 1 25598276 25642596 1 Sample6.1_4R07C01 16.2 8 2.025 44321 Loss iPattern 4 | 1 25598276 25642596 1 Sample143.1_5R06C01 19.25 8 2.40625 44321 Loss iPattern 5 | 1 25598276 25642596 1 Sample62.1_7R02C01 27 8 3.375 44321 Loss iPattern 6 | 1 25598276 25642596 1 Sample35.1_9R01C01 16.59 8 2.07375 44321 Loss iPattern 7 | 1 25598276 25642596 1 Sample60.1_9R03C01 17.84 8 2.23 44321 Loss iPattern 8 | 1 25598276 25642596 1 Samplec9.1_38R05C01 26.89 8 3.36125 44321 Loss iPattern 9 | 1 25598276 25642596 1 Sample289.1_16R04C01 27.79 8 3.47375 44321 Loss iPattern 10 | 1 25598276 25642596 1 Sample651.1_3R05C01 14.83 8 1.85375 44321 Loss iPattern 11 | 1 25598276 25642596 1 Sample496.1_40R07C01 29.96 8 3.745 44321 Loss iPattern 12 | 1 25598276 25642596 1 Sample130.1_18R01C01 17.4 8 2.175 44321 Loss iPattern 13 | 1 25598276 25642596 1 Sample217.1_18R08C01 20.28 8 2.535 44321 Loss iPattern 14 | 1 25598276 25642596 1 Sample242.1_20R01C01 22.59 8 2.82375 44321 Loss iPattern 15 | 1 25598276 25642596 1 Sample520.1_41R08C01 33.39 8 4.17375 44321 Loss iPattern 16 | 1 25598276 25642596 1 Sample66.1_12R06C01 31.81 8 3.97625 44321 Loss iPattern 17 | 1 25598276 25642596 1 Sample177.1_21R05C01 25.54 8 3.1925 44321 Loss iPattern 18 | 1 25598276 25642596 1 Sample160.1_22R02C01 20.9 8 2.6125 44321 Loss iPattern 19 | 1 25598276 25642596 1 Samplec119.1_18R02C01 19.55 8 2.44375 44321 Loss iPattern 20 | 1 25598276 25642596 1 Sample138.1_23R06C01 23.52 8 2.94 44321 Loss iPattern 21 | 1 25598276 25642596 1 Sample390.1_24R07C01 17.26 8 2.1575 44321 Loss iPattern 22 | 1 25598276 25642596 1 SampleY-37.1_11R02C01 20.91 8 2.61375 44321 Loss iPattern 23 | 1 25598276 25642596 1 Sample212.1_25R06C01 26.63 8 3.32875 44321 Loss iPattern 24 | 1 25598276 25642596 1 Sample686.1_9R03C01 18.67 8 2.33375 44321 Loss iPattern 25 | 1 25598276 25642596 1 Sample668.1_14R05C01 19.34 8 2.4175 44321 Loss iPattern 26 | 1 25598276 25642596 1 Sample256.1_28R07C01 25.32 8 3.165 44321 Loss iPattern 27 | 1 25598276 25642596 1 Sample200.1_29R07C01 32.4 8 4.05 44321 Loss iPattern 28 | 1 25598276 25642596 1 SampleY-70.1_4R01C01 20.15 8 2.51875 44321 Loss iPattern 29 | 1 25598276 25642596 1 Sample235.1_30R03C01 17.06 8 2.1325 44321 Loss iPattern 30 | 1 25598276 25642596 1 Sample283.1_30R06C01 21.45 8 2.68125 44321 Loss iPattern 31 | 1 25598276 25642596 1 SampleY-41.1_1R05C01 25.78 8 3.2225 44321 Loss iPattern 32 | 1 25598276 25642596 1 Sample556.1_43R03C01 26.36 8 3.295 44321 Loss iPattern 33 | 1 25598276 25642596 1 Sample592.1_43R06C01 28.9 8 3.6125 44321 Loss iPattern 34 | 1 25598276 25642596 1 Sample527.1_44R01C01 17.92 8 2.24 44321 Loss iPattern 35 | 1 25598276 25642596 1 Sample550.1_46R07C01 19.96 8 2.495 44321 Loss iPattern 36 | 1 25598276 25642596 1 Sample433.1_48R01C01 19.04 8 2.38 44321 Loss iPattern 37 | 1 25598276 25642596 1 Sample445.1_79R03C01 23.29 8 2.91125 44321 Loss iPattern 38 | 1 25598276 25642596 1 Sample522.1_78R01C01 19.71 8 2.46375 44321 Loss iPattern 39 | 1 25598276 25642596 1 Sample331.1_35R05C01 25.97 8 3.24625 44321 Loss iPattern 40 | 1 25598276 25642596 1 Sample560.1_78R02C01 22.22 8 2.7775 44321 Loss iPattern 41 | 1 25598276 25642596 1 Sample392.1_35R02C01 12.07 8 1.50875 44321 Loss iPattern 42 | 1 25598276 25642596 1 Sample501.1_75R01C01 11.73 8 1.46625 44321 Loss iPattern 43 | 1 25598276 25642596 1 Sample247.1_67R01C01 22.09 8 2.76125 44321 Loss iPattern 44 | 1 25598276 25642596 1 Sample439.1_55R05C01 16.85 8 2.10625 44321 Loss iPattern 45 | 1 25598276 25642596 1 Sample131.1_5R05C01 17.91 8 2.23875 44321 Loss iPattern 46 | 1 25598276 25642596 1 Sample72.1_8R04C01 18.18 8 2.2725 44321 Loss iPattern 47 | 1 25598276 25642596 1 Sample36.1_9R08C01 18.32 8 2.29 44321 Loss iPattern 48 | 1 25598276 25642596 1 Sample724.1_12R04C01 24.24 8 3.03 44321 Loss iPattern 49 | 1 25598276 25642596 1 Sample748.1_26R05C01 18.17 8 2.27125 44321 Loss iPattern 50 | 1 25598276 25642596 1 Sample516.1_37R02C01 23.24 8 2.905 44321 Loss iPattern 51 | 1 25598276 25642596 1 Sample459.1_28R06C01 21.96 8 2.745 44321 Loss iPattern 52 | 1 25598276 25642596 1 Sample533.1_39R08C01 32.24 8 4.03 44321 Loss iPattern 53 | 1 25598276 25642596 1 Sample265.1_16R03C01 18.96 8 2.37 44321 Loss iPattern 54 | 1 25598276 25642596 1 Sample654.1_23R06C01 21.66 8 2.7075 44321 Loss iPattern 55 | 1 25598276 25642596 1 Sample737.1_9R06C01 23.52 8 2.94 44321 Loss iPattern 56 | 1 25598276 25642596 1 Sample611.1_4R05C01 27.92 8 3.49 44321 Loss iPattern 57 | 1 25598276 25642596 1 SampleY-16.1_17R04C01 27.15 8 3.39375 44321 Loss iPattern 58 | 1 25598276 25642596 1 Sample569.1_41R04C01 22.89 8 2.86125 44321 Loss iPattern 59 | 1 25598276 25642596 1 Sample751.1_10R04C01 22.17 8 2.77125 44321 Loss iPattern 60 | 1 25598276 25642596 1 SampleY-53.1_21R08C01 13.27 8 1.65875 44321 Loss iPattern 61 | 1 25598276 25642596 1 Sample29.1_11R03C01 25.16 8 3.145 44321 Loss iPattern 62 | 1 25598276 25642596 1 Sample694.1_19R07C01 25.76 8 3.22 44321 Loss iPattern 63 | 1 25598276 25642596 1 Sample13.1_12R02C01 30.73 8 3.84125 44321 Loss iPattern 64 | 1 25598276 25642596 1 SampleY-59.1_28R08C01 22.65 8 2.83125 44321 Loss iPattern 65 | 1 25598276 25642596 1 Sample141.1_21R02C01 21.73 8 2.71625 44321 Loss iPattern 66 | 1 25598276 25642596 1 SampleY-47.1_4R04C01 27.89 8 3.48625 44321 Loss iPattern 67 | 1 25598276 25642596 1 Sample697.1_24R05C01 25.43 8 3.17875 44321 Loss iPattern 68 | 1 25598276 25642596 1 Sample400.1_9R01C01 25.01 8 3.12625 44321 Loss iPattern 69 | 1 25598276 25642596 1 Sample255.1_28R01C01 21.14 8 2.6425 44321 Loss iPattern 70 | 1 25598276 25642596 1 Samplec97.1_10R06C01 16.92 8 2.115 44321 Loss iPattern 71 | 1 25598276 25642596 1 Sample296.1_30R07C01 16.7 8 2.0875 44321 Loss iPattern 72 | 1 25598276 25642596 1 SampleY-58.1_3R08C01 14.27 8 1.78375 44321 Loss iPattern 73 | 1 25598276 25642596 1 Sample371.1_31R02C01 19.71 8 2.46375 44321 Loss iPattern 74 | 1 25598276 25642596 1 SampleY-68.1_27R01C01 12.81 8 1.60125 44321 Loss iPattern 75 | 1 25598276 25642596 1 Sample580.1_43R05C01 37.41 8 4.67625 44321 Loss iPattern 76 | 1 25598276 25642596 1 Sample15.1_71R04C01 17.87 8 2.23375 44321 Loss iPattern 77 | 1 25598276 25642596 1 Sample110.1_62R05C01 25.08 8 3.135 44321 Loss iPattern 78 | 1 25598276 25642596 1 Sample192.1_63R04C01 17.28 8 2.16 44321 Loss iPattern 79 | 1 25598276 25642596 1 Sample379.1_34R03C01 22.39 8 2.79875 44321 Loss iPattern 80 | 1 25598276 25642596 1 Sample157.1_63R08C01 20.82 8 2.6025 44321 Loss iPattern 81 | 1 25598276 25642596 1 Samplec39.1_59R05C01 27.17 8 3.39625 44321 Loss iPattern 82 | 1 25598276 25642596 1 Samplec14.1_59R07C01 21.63 8 2.70375 44321 Loss iPattern 83 | 1 25598276 25642596 1 Sample252.1_33R05C01 19.32 8 2.415 44321 Loss iPattern 84 | 1 25598276 25642596 1 Samplec44.1_65R04C01 15.49 8 1.93625 44321 Loss iPattern 85 | 1 25598276 25642596 1 Sample285.1_67R04C01 20.95 8 2.61875 44321 Loss iPattern 86 | 1 25598276 25669467 1 Sample21.1_6R08C01 14.7 10 1.47 71192 Loss iPattern 87 | 1 25598276 25669467 1 Sample334.1_16R07C01 13.82 10 1.382 71192 Loss iPattern 88 | 1 25598276 25669467 1 Sample38.1_71R08C01 13.88 10 1.388 71192 Loss iPattern 89 | 1 25598276 25669467 1 Sample303.1_33R07C01 16.36 10 1.636 71192 Loss iPattern 90 | 1 25598276 25669467 1 Sample422.1_52R08C01 16.44 10 1.644 71192 Loss iPattern 91 | 1 25598276 25669467 1 Sample404.1_24R08C01 12.77 10 1.277 71192 Loss iPattern 92 | 1 25598276 25669467 1 Sample345.1_36R03C01 17.46 10 1.746 71192 Loss iPattern 93 | 1 25598276 25669467 1 Sample171.1_78R07C01 15.15 10 1.515 71192 Loss iPattern 94 | 1 25598276 25669467 1 Sample424.1_53R08C01 11.77 10 1.177 71192 Loss iPattern 95 | 1 25598276 25669467 1 Sample526.1_57R04C01 23.21 10 2.321 71192 Loss iPattern 96 | -------------------------------------------------------------------------------- /04_CNV_genotype/step.3.check.and.resubmit.jobs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system 4 | 5 | suppressMessages(require(optparse)) 6 | 7 | option_list = list( 8 | make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA, 9 | help = "Path to the directory containing necessary input data."), 10 | make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA, 11 | help = "Path to the directory for saving results."), 12 | make_option(c("-m", "--matrixpath"), action = "store", type = "character", default = NA, 13 | help = "Path to chromosome-wise LRR and BAF matrices."), 14 | make_option(c("-s", "--sourcefile"), action = "store", type = "character", default = NA, 15 | help = "Path to the scripts directory containing R scripts to be loaded into R."), 16 | make_option(c("-d", "--duplicates"), action = "store_true", default = FALSE, 17 | help = "[optional] Whether duplicate pairs information will be annotated in diagnosis plots."), 18 | make_option(c("-n", "--plot"), action = "store_true", default = FALSE, 19 | help = "[optional] Whether to generate diagnosis plots."), 20 | make_option(c("-r", "--script"), action = "store", type = "character", default = NA, 21 | help = "Path to the main script CNV.genotype.one.chr.one.batch.R."), 22 | make_option(c("-l", "--joblog"), action = "store", type = "character", default = NA, 23 | help = "Path to the directory saving job logs."), 24 | make_option(c("-f", "--flag"), action = "store", type = "integer", default = NA, 25 | help = "0: only print the running status of CNV genotyping; 1: resubmit jobs for unfinished CNV genotyping") 26 | ) 27 | 28 | opt = parse_args(OptionParser(option_list = option_list)) 29 | pars = c(opt$type, opt$datapath, opt$resultpath, opt$joblog, 30 | opt$matrixpath, opt$sourcefile, opt$script, opt$flag) 31 | 32 | if ( any(is.na(pars)) ) { 33 | stop("All parameters must be supplied. (--help for detail)") 34 | } 35 | 36 | flag <- as.integer( opt$flag ) ## 0 or 1 37 | 38 | # resubmit unfinished jobs 39 | file_cnvr <- "cnvr_batch.txt" ## with batch information 40 | dt_cnvr_raw <- read.delim(file = file.path(opt$datapath, file_cnvr), as.is = TRUE) 41 | dt_cnvr_raw <- dt_cnvr_raw[order(dt_cnvr_raw$chr, dt_cnvr_raw$batch), ] 42 | # add fname column 43 | dt_cnvr_raw$fname <- paste0(dt_cnvr_raw$CNVR_ID, "_pred.rds") 44 | 45 | tbl_raw <- table(dt_cnvr_raw$chr, dt_cnvr_raw$batch) 46 | dt_freq_raw <- as.data.frame(tbl_raw) 47 | names(dt_freq_raw) <- c("chr", "batch", "Freq") 48 | 49 | dt_freq_raw <- subset(dt_freq_raw, Freq != 0) ## subset non-null batch 50 | dt_freq_raw <- dt_freq_raw[order(dt_freq_raw$chr, dt_freq_raw$batch), ] 51 | 52 | path_main_pred <- file.path(opt$resultpath, "pred") 53 | path_main_failed <- file.path(opt$resultpath, "cnvrs_error") 54 | 55 | # create script 56 | script <- file.path(opt$script, "CNV.genotype.one.chr.one.batch.R") 57 | cmd <- paste("Rscript", script, 58 | "--datapath", opt$datapath, 59 | "--resultpath", opt$resultpath, 60 | "--matrixpath", opt$matrixpath, 61 | "--sourcefile", opt$sourcefile) 62 | 63 | if ( opt$duplicates ) cmd <- paste(cmd, "--duplicates") 64 | if ( opt$plot ) cmd <- paste(cmd, "--plot") 65 | path_joblog <- opt$joblog 66 | 67 | # check if CNV genotyping for all CNVRs is finished ---------------------------------- 68 | check_jobs <- function(path_main, dt_cnvr_raw, flag, path_main_failed, path_joblog) { 69 | 70 | path_job_error <- file.path(path_joblog, "job", "ERROR") 71 | path_job_out <- file.path(path_joblog, "job", "OUT") 72 | 73 | # remove all previous results 74 | system( paste("rm -rf", path_main_failed) ) 75 | 76 | tbl_raw <- table(dt_cnvr_raw$chr, dt_cnvr_raw$batch) 77 | dt_freq_raw <- as.data.frame(tbl_raw) 78 | names(dt_freq_raw) <- c("chr", "batch", "Freq") 79 | 80 | dt_freq_raw <- subset(dt_freq_raw, Freq != 0) 81 | dt_freq_raw <- dt_freq_raw[order(dt_freq_raw$chr, dt_freq_raw$batch), ] 82 | 83 | for (i in 1:nrow(dt_freq_raw)) { 84 | 85 | chr1 <- dt_freq_raw$chr[i] 86 | batch1 <- dt_freq_raw$batch[i] 87 | freq1 <- dt_freq_raw$Freq[i] 88 | 89 | foldername1 <- paste0("chr_", chr1, "_batch_", batch1) 90 | path1 <- file.path(path_main, foldername1) 91 | 92 | if ( !dir.exists(paths = path1) ) { 93 | cat("CHR:", chr1, "BATCH:", batch1, "The whole batch failed and jobs will be resubmitted.\n") 94 | 95 | # submit jobs 96 | if (flag == 1) { 97 | 98 | cmd1 = paste(cmd, "--chr", chr1, "--batch", batch1, "--type", 0) 99 | 100 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 101 | ## configure based on your system 102 | bsub.cmd = paste("bsub -n 2 -W 10:00 -R 'rusage[mem=20000]' -P ", 103 | "-e", file.path(path_job_error, paste0("chr_", chr1, "_batch_", batch1, ".e")), 104 | "-o", file.path(path_job_out, paste0("chr_", chr1, "_batch_", batch1, ".o")), 105 | "-q premium", shQuote(cmd1)) 106 | cat(bsub.cmd, "\n") 107 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 108 | 109 | system(bsub.cmd) 110 | } 111 | 112 | } else { 113 | ## the results folder for the current batch exists 114 | files <- list.files(path = path1) 115 | dt1 <- subset(dt_cnvr_raw, chr == chr1 & batch == batch1) 116 | dt1.failed <- subset(dt1, !fname %in% files) 117 | 118 | if (nrow(dt1.failed) == 0) { 119 | cat("CHR:", chr1, "BATCH:", batch1, "TOTAL:", freq1, "SUCCEED!\n") 120 | 121 | } else { 122 | cat("CHR:", chr1, "BATCH:", batch1, "TOTAL:", freq1, "FAILED:", nrow(dt1.failed), "\n") 123 | 124 | if ( !dir.exists(paths = path_main_failed) ) { 125 | dir.create(path = path_main_failed, showWarnings = F, recursive = T) 126 | } 127 | 128 | fname.failed <- paste0("cnvrs_error_chr_", chr1, "_batch_", batch1, ".txt") 129 | write.table(data.frame(CNVR_ID = dt1.failed$CNVR_ID, stringsAsFactors = F), 130 | file = file.path(path_main_failed, paste0("cnvrs_error_chr_", chr1, "_batch_", batch1, ".txt")), 131 | col.names = T, row.names = F, quote = F) 132 | 133 | if (flag == 1) { 134 | cmd1 = paste(cmd, "--chr", chr1, "--batch", batch1, "--type", 1) 135 | 136 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 137 | ## configure based on your system 138 | bsub.cmd = paste("bsub -n 2 -W 10:00 -R 'rusage[mem=20000]' -P ", 139 | "-e", file.path(path_job_error, paste0("chr_", chr1, "_batch_", batch1, ".e")), 140 | "-o", file.path(path_job_out, paste0("chr_", chr1, "_batch_", batch1, ".o")), 141 | "-q premium", shQuote(cmd1)) 142 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 143 | 144 | cat(bsub.cmd, "\n") 145 | system(bsub.cmd) 146 | } 147 | } 148 | } 149 | } 150 | } 151 | 152 | # main runing function -------------------------------------------- 153 | check_jobs(path_main = path_main_pred, 154 | dt_cnvr_raw = dt_cnvr_raw, 155 | flag = flag, 156 | path_main_failed = path_main_failed, 157 | path_joblog = path_joblog) 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /01_initial_call/run_PennCNV/README.md: -------------------------------------------------------------------------------- 1 | ## PennCNV 2 | 3 | ### Installation 4 | 5 | To download and install PennCNV, please follow the detailed instructions (including trouble shooting) at the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/install/). For more information about PennCNV, please refer to their original [PennCNV website](http://penncnv.openbioinformatics.org/en/latest/). 6 | 7 | After installation, set up environment variable PENNCNV: `export PENNCNV='/path/to/penncnv'` 8 | 9 | ### Analysis workflow 10 | 11 | Note: 12 | 13 | - PennCNV was originally designed to sequentially analyze one sample at a time. Please refer to [PennCNV website](http://penncnv.openbioinformatics.org/en/latest/) for how to run PennCNV in a sequential way. Here, we provide scripts to run the analysis on multiple samples in parallel via job submitting system (one sample per job) in a cluster environment. 14 | 15 | - In the following steps (2) and (3), the scripts regarding job submission embraced by "##<<<... ##>>>..." in the scripts need to be specified by the users based on the system the users are using. 16 | 17 | We run PennCNV analysis with the following 5 steps: 18 | 19 | #### (1) Prepare SNP.pfb and SNP.gcmodel files 20 | 21 | #### (1.1) compile pfb (population frequency of B allele) file 22 | ```sh 23 | perl ${PENNCNV}/bin/compile_pfb.pl \ 24 | -snpposfile ${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/SNP_pos.txt \ 25 | -listfile ${WKDIR}/01_initial_call/run_PennCNV/data_aux/list_pfb.txt \ 26 | -output ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb 27 | ``` 28 | 29 | Note: 30 | 31 | - For more information about pfb file, please refer to the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/input/#pfb-population-frequency-of-b-allele-file). 32 | 33 | - `SNP_pos.txt`: generated by `finalreport_to_matrix_LRR_and_BAF.pl` in the [initial step](https://github.com/HaoKeLab/ensembleCNV#prepare-chromosome-wise-lrr-and-baf-matrices-for-cnv-genotyping). 34 | 35 | - `list_pfb.txt`: the users need to prepare a text file that contains a list of full path to signal files in `{WKDIR}/01_initial_call/run_PennCNV/data` generated by `finalreport_to_PennCNV.pl` in the [initial step](https://github.com/HaoKeLab/ensembleCNV#prepare-data-for-individual-cnv-callers), one per line. The pfb file compiled from only a few samples is not valid -- at least about 100 samples (e.g. one 96-well plate of samples) are needed. Based on our experience, if the sample size of the projects is very large, the users do not need to use signal files from all samples. Instead, a subset of 300 to 500 samples from unrelated subjects are good enough to estimate PFB (population frequency of B allele) for the project. Please put the prepared `list_pfb.txt` in the directory `${WKDIR}/01_initial_call/run_PennCNV/data_aux`. 36 | 37 | - The `SNP.pfb` will not only be used by PennCNV but also employed by ensembleCNV for [CNV genotyping](https://github.com/HaoKeLab/ensembleCNV#4-cnv-genotyping-for-each-cnvr). 38 | 39 | 40 | #### (1.2) compile gcmodel file for GC content ajdustment 41 | 42 | ```sh 43 | perl ${PENNCNV}/bin/cal_gc_snp.pl \ 44 | ${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base_hg19.txt.sorted \ 45 | ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb \ 46 | -output ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel 47 | ``` 48 | 49 | Note: 50 | 51 | - For more information about gcmodel file, please refer to the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/input/#gcmodel-file). 52 | 53 | - The `gc5Base_hg19.txt.sorted` (take hg19 for example) is generated based on UCSC Genome Browser annotation file (http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gc5Base.txt.gz). Despite the file name, it actually contains GC content per 5120bp. If you need GC annotation file for other genome assembly, please download it from the corresponding directory names. After downloading `gc5Base.txt.gz` (put in `${WKDIR}/01_initial_call/run_PennCNV/data_aux`) and unzipping the file, then sort this file such that chromosome and positions are sorted. 54 | ```sh 55 | sort -k 2,2 -k 3,3n \ 56 | <${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base.txt \ 57 | >${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base_hg19.txt.sorted 58 | ``` 59 | 60 | #### (2) Run PennCNV for each sample in parallel (through job submitting system on cluster) 61 | 62 | Note: 63 | 64 | - In `step.2.run.PennCNV.jobs.R`, the scripts regarding job submission embraced by "##<<<... ##>>>..." need to be specified based on your system. 65 | 66 | - For more information about CNV calling by PennCNV, please refer to the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/test/). 67 | 68 | ```sh 69 | Rscript ${WKDIR}/01_initial_call/run_PennCNV/step.2.run.PennCNV.jobs.R \ 70 | --penncnv ${PENNCNV} \ ## direct to ${PENNCNV}/bin/detect_cnv.pl 71 | --data ${WKDIR}/01_initial_call/run_PennCNV/data \ ## generated with finalreport_to_PennCNV.pl 72 | --wkdir ${WKDIR}/01_initial_call/run_PennCNV/results \ ## output directory 73 | --pfb ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb \ 74 | --gcmodel ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel \ 75 | --hmm ${PENNCNV}/lib/hhall.hmm 76 | ``` 77 | 78 | When the analysis is completed, there will be subfolders named after sample IDs, each for one sample respectively, created in the directory `${WKDIR}/01_initial_call/run_PennCNV/results/res`. Within each sample subfolders, two files will be generated: 79 | - `.log`: log file generated by `detect_cnv.pl`, information from which will be retrieved to generated sample-level summary statistics (see step (5) below). 80 | - `.rawcnv`: raw CNV calls made by `detect_cnv.pl`. 81 | 82 | #### (3) Check job status and resubmit failed jobs 83 | 84 | Note: In `step.3.check.PennCNV.jobs.R`, the scripts regarding job submission embraced by "##<<<... ##>>>..." need to be specified based on your system. 85 | 86 | ```sh 87 | Rscrip ${WKDIR}/01_initial_call/run_PennCNV/step.3.check.PennCNV.jobs.R \ 88 | --penncnv ${PENNCNV} \ ## direct to ${PENNCNV}/bin/detect_cnv.pl 89 | --data ${WKDIR}/01_initial_call/run_PennCNV/data/ \ ## generated with finalreport_to_PennCNV.pl 90 | --wkdir ${WKDIR}/01_initial_call/run_PennCNV/results \ ## output directory 91 | --pfb ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb \ 92 | --gcmodel ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel \ 93 | --hmm ${PENNCNV}/lib/hhall.hmm 94 | ``` 95 | This step checks if the jobs submitted for each sample in step (2) are successfully completed and resubmits failed jobs if there is any. 96 | 97 | 98 | #### (4) Combine PennCNV results (.rawcnv and .log files) from each sample 99 | ```sh 100 | perl ${WKDIR}/01_initial_call/run_PennCNV/step.4.combine.PennCNV.res.pl \ 101 | --in_dir ${WKDIR}/01_initial_call/run_PennCNV/results/res \ 102 | --out_dir ${WKDIR}/01_initial_call/run_PennCNV/results 103 | ``` 104 | This script screens `.log` and `.rawcnv` files for all samples generated in steps (2) and (3), and combines them. When this step is completed, there will be two files generated in the directory `${WKDIR}/01_initial_call/run_PennCNV/results`: 105 | 106 | - `CNV.PennCNV.log`: combined log file of the `.log` files from all samples. 107 | 108 | - `CNV.PennCNV.rawcnv`: combined raw CNV calls of the `.rawcnv` files from all samples. 109 | 110 | #### (5) Merge closely adjacent CNVs and generate final results 111 | ```sh 112 | Rscript ${WKDIR}/01_initial_call/run_PennCNV/step.5.clean.PennCNV.res.R \ 113 | --penncnv ${PENNCNV} \ ## direct to installation directory ${PENNCNV} 114 | --input ${WKDIR}/01_initial_call/run_PennCNV/results \ 115 | --pfb ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb 116 | ``` 117 | 118 | This script is a wrapper to run three perl scripts in PennCNV package: 119 | 120 | - `${PENNCNV}/bin/clean_cnv.pl`: merge adjacent CNVs which are close to each other. Please refer to the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/annotation/#merging-adjacent-cnv-calls) for details. 121 | 122 | - `${PENNCNV}/bin/convert_cnv.pl`: convert the CNV calls in `.rawcnv` format to the tab-delimited table. 123 | 124 | - `${PENNCNV}/bin/filter_cnv.pl`: extract sample-level summary statistics from log file. 125 | 126 | When the analysis is completed, you will find two files, which will be used by ensembleCNV, in the directory `${WKDIR}/01_initial_call/run_PennCNV/results`: 127 | 128 | - `CNV.PennCNV_new.txt`: CNV calls of all samples. 129 | - `CNV.PennCNV_qc_new.txt`: sample-level summary statistics. 130 | 131 | -------------------------------------------------------------------------------- /06_performance_assessment/step.1.performance.assessment.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(optparse)) 4 | suppressMessages(library(ggplot2)) 5 | suppressMessages(library(cowplot)) 6 | 7 | option_list <- list( 8 | make_option(c("-d", "--duplicates"), action = "store", default = NA, type = "character", 9 | help = "Path to duplicate pairs information."), 10 | make_option(c("-n", "--matrixCN"), action = "store", default = NA,type = "character", 11 | help = "Path to matrix of copy number (CN)"), 12 | make_option(c("-g", "--matrixGQ"), action = "store", default = NA,type = "character", 13 | help = "Path to matrix of genotyping quality (GQ) score."), 14 | make_option(c("-o", "--resultpath"), action = "store", default = NA,type = "character", 15 | help = "Path to directory for saving assessment results.") 16 | ) 17 | 18 | opt <- parse_args(OptionParser(option_list = option_list)) 19 | pars <- c(opt$duplicates, opt$matrixCN, opt$matrixGQ, opt$resultpath) 20 | 21 | if (any(is.na(pars))) { 22 | stop("All required parameters must be supplied. (--help for detail)") 23 | } 24 | 25 | file_duplicates <- opt$duplicates 26 | file_matrixcn <- opt$matrixCN 27 | file_matrixgq <- opt$matrixGQ 28 | path_result <- opt$resultpath 29 | 30 | dup_pairs <- read.delim(file = file_duplicates, as.is = TRUE) 31 | matrix_CN <- readRDS(file = file_matrixcn) 32 | matrix_gq <- readRDS(file = file_matrixgq) 33 | 34 | # functions --------------------------------------------------------------- 35 | 36 | generate_results <- function(mat, dup_pairs, gq1) { 37 | 38 | # clean cnvr with all CN = 2 or missing (denoted as -9) 39 | mat <- mat 40 | n_cnvr_raw <- nrow(mat) 41 | n_sample <- ncol(mat) 42 | 43 | # filter CNVR 44 | freqs <- unlist(lapply(1:n_cnvr_raw, FUN = function(k) { 45 | v1 <- as.vector(mat[k, ]) 46 | freq1 <- sum(v1 %in% c(0, 1, 3)) 47 | freq1 48 | })) 49 | 50 | idxs_del <- which(freqs == 0) 51 | mat1 <- mat 52 | if (length(idxs_del) >= 1) { 53 | mat1 <- mat[-idxs_del, ] 54 | } 55 | n_cnvr <- nrow(mat1) 56 | cat("After cleaning CNVRs with no CNV calls,", n_cnvr, "CNVRs remains from", n_cnvr_raw, "CNVRs.\n") 57 | mat_clean <- mat1 ## after cleaning nocall CNVR_ID 58 | 59 | ## sample level callRate 60 | freq_sample = unlist(lapply(1:n_sample, FUN = function(k) { 61 | v1 = as.vector( mat_clean[, k] ) 62 | sum(v1 %in% c(0, 1, 2, 3)) 63 | })) 64 | callRate_sample = freq_sample/n_cnvr 65 | ## CNVR level callRate 66 | freq_cnvr = unlist(lapply(1:n_cnvr, FUN = function(k) { 67 | v1 = as.vector( mat_clean[k, ] ) 68 | sum(v1 %in% c(0, 1, 2, 3)) 69 | })) 70 | callRate_cnvr = freq_cnvr/n_sample 71 | 72 | dat_callRate_cnvr = data.frame(callRate_cnvr = callRate_cnvr, 73 | cutoff_gq = gq1, 74 | stringsAsFactors = FALSE) 75 | dat_callRate_sample = data.frame(callRate_sample = callRate_sample, 76 | cutoff_gq = gq1, 77 | stringsAsFactors = FALSE) 78 | 79 | ## consistency rate 80 | consistency_rates <- c() 81 | for (i in 1:nrow(dup_pairs)) { 82 | 83 | sample1 <- dup_pairs$sample1.name[i] 84 | sample2 <- dup_pairs$sample2.name[i] 85 | 86 | cns1 <- as.vector(mat1[, sample1]) 87 | cns2 <- as.vector(mat1[, sample2]) # copy number of sample2 88 | 89 | # filter nocall cnvr 90 | idxs <- union(which(cns1 == -9), which(cns2 == -9)) 91 | if(length(idxs) >= 1) { 92 | cns1 <- cns1[-idxs] 93 | cns2 <- cns2[-idxs] 94 | } 95 | 96 | idxs_overlap <- which(cns1 != 2 & cns2 != 2 & cns1 == cns2) 97 | idxs_union <- union(which(cns1 != 2), which(cns2 != 2)) 98 | 99 | rate1 <- length(idxs_overlap)/length(idxs_union) 100 | consistency_rates <- c(consistency_rates, rate1) 101 | } 102 | 103 | res_consistency <- data.frame(consistency_rate = consistency_rates, 104 | sample1.name = dup_pairs$sample1.name, 105 | sample2.name = dup_pairs$sample2.name, 106 | n_cnvr = n_cnvr, 107 | cutoff_gq = gq1, 108 | stringsAsFactors = FALSE) 109 | 110 | res_ncnvr = data.frame(cutoff_gq = gq1, n_cnvr = n_cnvr, 111 | stringsAsFactors = FALSE) 112 | ## return list results 113 | return(list( 114 | res_consistency = res_consistency, 115 | res_callRate_cnvr = dat_callRate_cnvr, 116 | res_callRate_sample = dat_callRate_sample, 117 | res_ncnvr = res_ncnvr 118 | )) 119 | } 120 | 121 | summary_regenotype = function(mat_CN, mat_gq, cutoffs_gq, dup_pairs) { 122 | 123 | res_consistency = data.frame() ## output dat for plot 124 | res_callRate_cnvr = data.frame() 125 | res_callRate_sample = data.frame() 126 | res_ncnvr = data.frame() 127 | 128 | for (i in 1:length(cutoffs_gq)) { 129 | 130 | gq1 = cutoffs_gq[i] 131 | cat("gq_score:", gq1, "\n") 132 | idx1 = which(mat_gq < gq1) 133 | if (length(idx1) >= 1) { 134 | mat_CN[idx1] = -9 135 | } 136 | 137 | res_gq1 = generate_results(mat = mat_CN, 138 | dup_pairs = dup_pairs, 139 | gq1 = gq1) 140 | 141 | res_consistency = rbind(res_consistency, res_gq1$res_consistency) 142 | res_callRate_sample = rbind(res_callRate_sample, res_gq1$res_callRate_sample) 143 | res_callRate_cnvr = rbind(res_callRate_cnvr, res_gq1$res_callRate_cnvr) 144 | res_ncnvr = rbind(res_ncnvr, res_gq1$res_ncnvr) 145 | 146 | } 147 | 148 | res_consistency$cutoff_gq = factor(res_consistency$cutoff_gq, levels = cutoffs_gq) 149 | res_callRate_sample$cutoff_gq = factor(res_callRate_sample$cutoff_gq, levels = cutoffs_gq) 150 | res_callRate_cnvr$cutoff_gq = factor(res_callRate_cnvr$cutoff_gq, levels = cutoffs_gq) 151 | res_ncnvr$cutoff_gq = factor(res_ncnvr$cutoff_gq, levels = cutoffs_gq) 152 | 153 | ## return list results 154 | return(list( 155 | res_consistency = res_consistency, 156 | res_callRate_cnvr = res_callRate_cnvr, 157 | res_callRate_sample = res_callRate_sample, 158 | res_ncnvr = res_ncnvr 159 | )) 160 | } 161 | 162 | # main -------------------------------------------------------------------- 163 | 164 | cutoffs_gq <- c(0, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80) 165 | 166 | res <- summary_regenotype(mat_CN = matrix_CN, 167 | mat_gq = matrix_gq, 168 | cutoffs_gq = cutoffs_gq, 169 | dup_pairs = dup_pairs) 170 | 171 | res_consistency <- res$res_consistency 172 | res_ncnvr <- res$res_ncnvr 173 | res_callRate_sample <- res$res_callRate_sample 174 | res_callRate_cnvr <- res$res_callRate_cnvr 175 | 176 | saveRDS(res, file = file.path(path_result, "performance_assessment.rds")) 177 | 178 | # start plot -------------------------------------------------------------- 179 | 180 | p1 = ggplot(data = res_consistency, aes(cutoff_gq, consistency_rate)) + 181 | geom_boxplot() + 182 | theme_bw() + 183 | geom_hline(yintercept = 0.9, lty = 2, lwd = 1, col = "grey60") + 184 | theme(panel.background = element_blank(), 185 | plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), 186 | axis.title.x = element_text(size = 18), 187 | axis.title.y = element_text(size = 18), 188 | axis.text = element_text(size = 15), 189 | strip.text = element_text(size = 15)) + 190 | xlab("GQ score threshold") + 191 | ylab("Concordance rate") + 192 | ggtitle("Concrodance rate") 193 | 194 | p2 = ggplot(data = res_ncnvr, aes(cutoff_gq, n_cnvr)) + 195 | #geom_col() + 196 | geom_bar(stat = "identity") + 197 | theme_bw() + 198 | theme(panel.background = element_blank(), 199 | plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), 200 | axis.title.x = element_text(size = 18), 201 | axis.title.y = element_text(size = 18), 202 | axis.text = element_text(size = 15), 203 | strip.text = element_text(size = 15)) + 204 | xlab("GQ score threshold") + 205 | ylab("Number of CNVRs") + 206 | ggtitle("Number of CNVRs") 207 | 208 | p3 = ggplot(data = res_callRate_sample, aes(cutoff_gq, callRate_sample)) + 209 | geom_boxplot() + 210 | theme_bw() + 211 | geom_hline(yintercept = 0.9, lty = 2, lwd = 1, col = "grey60") + 212 | theme(panel.background = element_blank(), 213 | plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), 214 | axis.title.x = element_text(size = 18), 215 | axis.title.y = element_text(size = 18), 216 | axis.text = element_text(size = 15), 217 | strip.text = element_text(size = 15)) + 218 | xlab("GQ score threshold") + 219 | ylab("Sample-wise call rate") + 220 | ggtitle("Sample-wise call rate") 221 | 222 | 223 | p4 = ggplot(data = res_callRate_cnvr, aes(cutoff_gq, callRate_cnvr)) + 224 | geom_boxplot() + 225 | geom_hline(yintercept = 0.9, lty = 2, lwd = 1, col = "grey60") + 226 | theme_bw() + 227 | theme(panel.background = element_blank(), 228 | plot.title = element_text(size = 20, hjust = 0.5, face = "bold"), 229 | axis.title.x = element_text(size = 18), 230 | axis.title.y = element_text(size = 18), 231 | axis.text = element_text(size = 15), 232 | strip.text = element_text(size = 15)) + 233 | xlab("GQ score threshold") + 234 | ylab("CNVR-wise call rate") + 235 | ggtitle("CNVR-wise call rate") 236 | 237 | png(filename = file.path(path_result, "performance_assessment.png"), 238 | width = 12, height = 12, units = "in", res = 512) 239 | 240 | p = plot_grid(p1, p2, p3, p4, 241 | nrow = 2, ncol = 2, 242 | labels = LETTERS[1:4], 243 | label_size = 22, 244 | vjust = 1.2, align = "hv") 245 | print(p) 246 | 247 | dev.off() 248 | 249 | 250 | -------------------------------------------------------------------------------- /example/example_create_CNVR/data/iPattern_all_calls.txt: -------------------------------------------------------------------------------- 1 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.07 Sample86.1_3R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 2 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 16.2 Sample6.1_4R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 3 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.25 Sample143.1_5R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 4 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 27 Sample62.1_7R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 5 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 16.59 Sample35.1_9R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 6 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 17.84 Sample60.1_9R03C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 7 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 26.89 Samplec9.1_38R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 8 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 27.79 Sample289.1_16R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 9 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 14.83 Sample651.1_3R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 10 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 29.96 Sample496.1_40R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 11 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 17.4 Sample130.1_18R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 12 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 20.28 Sample217.1_18R08C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 13 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 22.59 Sample242.1_20R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 14 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 33.39 Sample520.1_41R08C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 15 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 31.81 Sample66.1_12R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 16 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.54 Sample177.1_21R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 17 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 20.9 Sample160.1_22R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 18 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.55 Samplec119.1_18R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 19 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 23.52 Sample138.1_23R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 20 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 17.26 Sample390.1_24R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 21 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 20.91 SampleY-37.1_11R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 22 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 26.63 Sample212.1_25R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 23 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 18.67 Sample686.1_9R03C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 24 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.34 Sample668.1_14R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 25 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.32 Sample256.1_28R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 26 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 32.4 Sample200.1_29R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 27 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 20.15 SampleY-70.1_4R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 28 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 17.06 Sample235.1_30R03C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 29 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 21.45 Sample283.1_30R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 30 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.78 SampleY-41.1_1R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 31 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 26.36 Sample556.1_43R03C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 32 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 28.9 Sample592.1_43R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 33 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 17.92 Sample527.1_44R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 34 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.96 Sample550.1_46R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 35 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.04 Sample433.1_48R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 36 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 23.29 Sample445.1_79R03C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 37 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.71 Sample522.1_78R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 38 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.97 Sample331.1_35R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 39 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 22.22 Sample560.1_78R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 40 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 12.07 Sample392.1_35R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 41 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 11.73 Sample501.1_75R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 42 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 22.09 Sample247.1_67R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 43 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 16.85 Sample439.1_55R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 44 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 17.91 Sample131.1_5R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 45 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 18.18 Sample72.1_8R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 46 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 18.32 Sample36.1_9R08C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 47 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 24.24 Sample724.1_12R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 48 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 18.17 Sample748.1_26R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 49 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 23.24 Sample516.1_37R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 50 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 21.96 Sample459.1_28R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 51 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 32.24 Sample533.1_39R08C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 52 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 18.96 Sample265.1_16R03C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 53 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 21.66 Sample654.1_23R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 54 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 23.52 Sample737.1_9R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 55 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 27.92 Sample611.1_4R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 56 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 27.15 SampleY-16.1_17R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 57 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 22.89 Sample569.1_41R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 58 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 22.17 Sample751.1_10R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 59 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 13.27 SampleY-53.1_21R08C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 60 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.16 Sample29.1_11R03C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 61 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.76 Sample694.1_19R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 62 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 30.73 Sample13.1_12R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 63 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 22.65 SampleY-59.1_28R08C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 64 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 21.73 Sample141.1_21R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 65 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 27.89 SampleY-47.1_4R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 66 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.43 Sample697.1_24R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 67 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.01 Sample400.1_9R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 68 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 21.14 Sample255.1_28R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 69 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 16.92 Samplec97.1_10R06C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 70 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 16.7 Sample296.1_30R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 71 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 14.27 SampleY-58.1_3R08C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 72 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.71 Sample371.1_31R02C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 73 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 12.81 SampleY-68.1_27R01C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 74 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 37.41 Sample580.1_43R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 75 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 17.87 Sample15.1_71R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 76 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 25.08 Sample110.1_62R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 77 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 17.28 Sample192.1_63R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 78 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 22.39 Sample379.1_34R03C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 79 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 20.82 Sample157.1_63R08C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 80 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 27.17 Samplec39.1_59R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 81 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 21.63 Samplec14.1_59R07C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 82 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 19.32 Sample252.1_33R05C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 83 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 15.49 Samplec44.1_65R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 84 | Loss 1 25598276 25642596 8 8 1_1 500 500 106 20.95 Sample285.1_67R04C01 cnve.1.25598276.25642596.1 cnvr.1.25572993.25674785 85 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 14.7 Sample21.1_6R08C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 86 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 13.82 Sample334.1_16R07C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 87 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 13.88 Sample38.1_71R08C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 88 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 16.36 Sample303.1_33R07C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 89 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 16.44 Sample422.1_52R08C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 90 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 12.77 Sample404.1_24R08C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 91 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 17.46 Sample345.1_36R03C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 92 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 15.15 Sample171.1_78R07C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 93 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 11.77 Sample424.1_53R08C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 94 | Loss 1 25598276 25669467 10 9 1_1 483.35 483.35 26 23.21 Sample526.1_57R04C01 cnve.1.25598276.25669467.2 cnvr.1.25572993.25674785 95 | -------------------------------------------------------------------------------- /example/example_create_CNVR/results/cnv.quantisnp.txt: -------------------------------------------------------------------------------- 1 | chr posStart posEnd CN Sample_ID conf numSNP avgConf length CNV_type method 2 | 1 25598276 25638253 3 Sample100.1_3R06C01 4.05203 7 0.578861428571429 39978 Gain QuantiSNP 3 | 1 25598276 25642596 3 Sample108.1_63R05C01 9.94526 8 1.2431575 44321 Gain QuantiSNP 4 | 1 25598276 25642596 3 Sample119.1_63R02C01 2.18403 8 0.27300375 44321 Gain QuantiSNP 5 | 1 25598276 25642596 3 Sample121.1_63R06C01 8.60815 8 1.07601875 44321 Gain QuantiSNP 6 | 1 25598276 25642596 3 Sample124.1_78R08C01 16.4803 8 2.0600375 44321 Gain QuantiSNP 7 | 1 25627470 25642596 3 Sample125.1_22R06C01 4.86867 7 0.695524285714286 15127 Gain QuantiSNP 8 | 1 25598276 25638253 3 Sample126.1_23R05C01 9.03692 7 1.29098857142857 39978 Gain QuantiSNP 9 | 1 25598276 25629950 1 Sample127.1_70R03C01 18.9998 5 3.79996 31675 Loss QuantiSNP 10 | 1 25598276 25629950 1 Sample139.1_70R04C01 12.9404 5 2.58808 31675 Loss QuantiSNP 11 | 1 25598276 25642596 3 Sample142.1_70R05C01 9.2058 8 1.150725 44321 Gain QuantiSNP 12 | 1 25598276 25638253 3 Sample161.1_23R01C01 4.33492 7 0.619274285714286 39978 Gain QuantiSNP 13 | 1 25598276 25642596 3 Sample166.1_18R03C01 17.3779 8 2.1722375 44321 Gain QuantiSNP 14 | 1 25627470 25638253 3 Sample167.1_44R03C01 11.4517 6 1.90861666666667 10784 Gain QuantiSNP 15 | 1 25629943 25642596 3 Sample169.1_62R01C01 13.5597 5 2.71194 12654 Gain QuantiSNP 16 | 1 25598276 25642596 3 Sample172.1_62R08C01 8.46983 8 1.05872875 44321 Gain QuantiSNP 17 | 1 25598276 25642596 3 Sample18.1_4R08C01 1.8337 8 0.2292125 44321 Gain QuantiSNP 18 | 1 25598276 25642596 3 Sample195.1_62R07C01 10.9461 8 1.3682625 44321 Gain QuantiSNP 19 | 1 25598276 25642596 3 Sample205.1_70R07C01 7.52285 8 0.94035625 44321 Gain QuantiSNP 20 | 1 25598276 25642596 3 Sample209.1_30R02C01 0.543627 8 0.067953375 44321 Gain QuantiSNP 21 | 1 25598276 25638253 3 Sample219.1_28R04C01 7.28972 7 1.04138857142857 39978 Gain QuantiSNP 22 | 1 25598276 25642596 3 Sample227.1_68R01C01 6.54151 8 0.81768875 44321 Gain QuantiSNP 23 | 1 25598276 25642596 3 Sample236.1_67R03C01 3.70644 8 0.463305 44321 Gain QuantiSNP 24 | 1 25598276 25638253 3 Sample237.1_25R08C01 2.80132 7 0.400188571428571 39978 Gain QuantiSNP 25 | 1 25598276 25642596 1 Sample243.1_20R08C01 16.0652 8 2.00815 44321 Loss QuantiSNP 26 | 1 25598276 25642596 3 Sample261.1_32R02C01 7.99707 8 0.99963375 44321 Gain QuantiSNP 27 | 1 25598276 25642596 3 Sample262.1_32R08C01 11.5614 8 1.445175 44321 Gain QuantiSNP 28 | 1 25598276 25629950 1 Sample266.1_20R03C01 14.7599 5 2.95198 31675 Loss QuantiSNP 29 | 1 25598276 25629950 1 Sample273.1_32R03C01 13.9294 5 2.78588 31675 Loss QuantiSNP 30 | 1 25598276 25629950 1 Sample275.1_33R04C01 13.5034 5 2.70068 31675 Loss QuantiSNP 31 | 1 25598276 25638253 1 Sample278.1_20R04C01 18.6362 7 2.66231428571429 39978 Loss QuantiSNP 32 | 1 25598276 25638253 3 Sample281.1_27R05C01 3.04208 7 0.434582857142857 39978 Gain QuantiSNP 33 | 1 25598276 25642596 3 Sample282.1_27R08C01 0.613086 8 0.07663575 44321 Gain QuantiSNP 34 | 1 25598276 25642596 3 Sample291.1_20R05C01 6.36673 8 0.79584125 44321 Gain QuantiSNP 35 | 1 25598276 25642596 3 Sample294.1_72R06C01 6.31589 8 0.78948625 44321 Gain QuantiSNP 36 | 1 25598276 25642596 3 Sample297.1_25R05C01 6.15803 8 0.76975375 44321 Gain QuantiSNP 37 | 1 25598276 25638253 3 Sample308.1_31R03C01 0.207963 7 0.029709 39978 Gain QuantiSNP 38 | 1 25598276 25642596 1 Sample309.1_66R07C01 20.5669 8 2.5708625 44321 Loss QuantiSNP 39 | 1 25629943 25642596 3 Sample314.1_80R08C01 12.3155 5 2.4631 12654 Gain QuantiSNP 40 | 1 25598276 25629950 1 Sample317.1_35R04C01 12.1177 5 2.42354 31675 Loss QuantiSNP 41 | 1 25598276 25638253 3 Sample320.1_16R06C01 4.8707 7 0.695814285714286 39978 Gain QuantiSNP 42 | 1 25598276 25642596 3 Sample323.1_69R02C01 4.10757 8 0.51344625 44321 Gain QuantiSNP 43 | 1 25598276 25638253 3 Sample328.1_13R02C01 10.3853 6 1.73088333333333 39978 Gain QuantiSNP 44 | 1 25598276 25638253 1 Sample332.1_73R01C01 19.6671 7 2.80958571428571 39978 Loss QuantiSNP 45 | 1 25598276 25642596 1 Sample349.1_19R03C01 20.8153 8 2.6019125 44321 Loss QuantiSNP 46 | 1 25598276 25638253 3 Sample356.1_35R06C01 7.99647 7 1.14235285714286 39978 Gain QuantiSNP 47 | 1 25598276 25642596 3 Sample357.1_36R04C01 13.6046 8 1.700575 44321 Gain QuantiSNP 48 | 1 25598276 25642596 1 Sample358.1_52R03C01 19.4456 8 2.4307 44321 Loss QuantiSNP 49 | 1 25627470 25638253 3 Sample360.1_31R06C01 4.52212 6 0.753686666666667 10784 Gain QuantiSNP 50 | 1 25598276 25629950 3 Sample363.1_17R05C01 8.85061 5 1.770122 31675 Gain QuantiSNP 51 | 1 25598276 25642596 3 Sample369.1_36R05C01 4.06593 8 0.50824125 44321 Gain QuantiSNP 52 | 1 25598276 25642596 3 Sample375.1_69R05C01 6.37298 8 0.7966225 44321 Gain QuantiSNP 53 | 1 25598276 25638253 3 Sample376.1_17R06C01 2.94036 7 0.420051428571429 39978 Gain QuantiSNP 54 | 1 25598276 25642596 3 Sample378.1_24R06C01 13.8549 8 1.7318625 44321 Gain QuantiSNP 55 | 1 25598276 25642596 3 Sample388.1_17R07C01 8.63671 8 1.07958875 44321 Gain QuantiSNP 56 | 1 25598276 25642596 3 Sample402.1_69R08C01 8.97555 8 1.12194375 44321 Gain QuantiSNP 57 | 1 25598276 25642596 3 Sample402.1_74R03C01 13.1224 8 1.6403 44321 Gain QuantiSNP 58 | 1 25627470 25642596 3 Sample407.1_36R02C01 5.65946 7 0.808494285714286 15127 Gain QuantiSNP 59 | 1 25598276 25642596 3 Sample417.1_77R05C01 19.0808 8 2.3851 44321 Gain QuantiSNP 60 | 1 25598276 25629950 1 Sample431.1_79R01C01 16.467 5 3.2934 31675 Loss QuantiSNP 61 | 1 25598276 25642596 3 Sample438.1_73R08C01 4.94542 8 0.6181775 44321 Gain QuantiSNP 62 | 1 25598276 25642596 3 Sample448.1_53R01C01 7.05554 8 0.8819425 44321 Gain QuantiSNP 63 | 1 25598276 25642596 3 Sample463.1_50R08C01 1.82352 8 0.22794 44321 Gain QuantiSNP 64 | 1 25598276 25642596 3 Sample464.1_55R06C01 22.5451 8 2.8181375 44321 Gain QuantiSNP 65 | 1 25598276 25638253 3 Sample47.1_9R02C01 4.78743 7 0.683918571428571 39978 Gain QuantiSNP 66 | 1 25598276 25638253 1 Sample472.1_48R03C01 21.8442 7 3.1206 39978 Loss QuantiSNP 67 | 1 25598276 25642596 3 Sample475.1_50R04C01 8.12018 8 1.0150225 44321 Gain QuantiSNP 68 | 1 25598276 25642596 3 Sample476.1_55R01C01 20.4309 8 2.5538625 44321 Gain QuantiSNP 69 | 1 25598276 25642596 3 Sample477.1_75R03C01 9.45939 8 1.18242375 44321 Gain QuantiSNP 70 | 1 25598276 25642596 3 Sample485.1_79R06C01 11.2973 8 1.4121625 44321 Gain QuantiSNP 71 | 1 25598276 25629950 3 Sample491.1_54R04C01 15.2998 5 3.05996 31675 Gain QuantiSNP 72 | 1 25627470 25642596 3 Sample492.1_49R01C01 0.663154 7 0.0947362857142857 15127 Gain QuantiSNP 73 | 1 25598276 25642596 3 Sample504.1_77R02C01 5.11496 8 0.63937 44321 Gain QuantiSNP 74 | 1 25598276 25642596 3 Sample505.1_77R04C01 16.027 8 2.003375 44321 Gain QuantiSNP 75 | 1 25598276 25642596 3 Sample506.1_77R07C01 8.47885 8 1.05985625 44321 Gain QuantiSNP 76 | 1 25598276 25638253 3 Sample510.1_47R04C01 8.68003 7 1.24000428571429 39978 Gain QuantiSNP 77 | 1 25598276 25642596 3 Sample529.1_76R05C01 8.63588 8 1.079485 44321 Gain QuantiSNP 78 | 1 25598276 25642596 3 Sample538.1_76R02C01 17.5925 8 2.1990625 44321 Gain QuantiSNP 79 | 1 25598276 25638253 3 Sample540.1_44R04C01 7.68229 7 1.09747 39978 Gain QuantiSNP 80 | 1 25598276 25629950 3 Sample542.1_42R03C01 20.0362 5 4.00724 31675 Gain QuantiSNP 81 | 1 25627470 25638253 3 Sample554.1_37R05C01 7.91008 6 1.31834666666667 10784 Gain QuantiSNP 82 | 1 25598276 25638253 3 Sample564.1_57R05C01 10.6977 7 1.52824285714286 39978 Gain QuantiSNP 83 | 1 25627470 25638253 3 Sample566.1_37R06C01 1.98509 6 0.330848333333333 10784 Gain QuantiSNP 84 | 1 25598276 25642596 3 Sample575.1_57R01C01 5.66285 8 0.70785625 44321 Gain QuantiSNP 85 | 1 25598276 25642596 3 Sample58.1_3R03C01 7.08843 8 0.88605375 44321 Gain QuantiSNP 86 | 1 25598276 25642596 3 Sample583.1_76R08C01 1.28627 8 0.16078375 44321 Gain QuantiSNP 87 | 1 25598276 25642596 3 Sample584.1_78R03C01 7.37858 8 0.9223225 44321 Gain QuantiSNP 88 | 1 25627470 25638253 3 Sample587.1_57R02C01 11.0545 6 1.84241666666667 10784 Gain QuantiSNP 89 | 1 25598276 25638253 3 Sample588.1_57R06C01 7.55628 7 1.07946857142857 39978 Gain QuantiSNP 90 | 1 25627470 25642596 3 Sample625.1_4R08C01 0.0820793 6 0.0136798833333333 15127 Gain QuantiSNP 91 | 1 25598276 25642596 1 Sample626.1_11R04C01 15.2668 7 2.18097142857143 44321 Loss QuantiSNP 92 | 1 25598276 25638253 1 Sample660.1_9R02C01 14.3275 6 2.38791666666667 39978 Loss QuantiSNP 93 | 1 25598276 25638253 3 Sample7.1_8R07C01 14.0671 7 2.00958571428571 39978 Gain QuantiSNP 94 | 1 25598276 25638253 3 Sample704.1_13R08C01 10.2507 6 1.70845 39978 Gain QuantiSNP 95 | 1 25598276 25642596 3 Sample709.1_7R05C01 0.342095 7 0.0488707142857143 44321 Gain QuantiSNP 96 | 1 25598276 25629950 3 Sample721.1_24R06C01 4.30825 5 0.86165 31675 Gain QuantiSNP 97 | 1 25598276 25642596 1 Sample739.1_13R07C01 15.0622 7 2.15174285714286 44321 Loss QuantiSNP 98 | 1 25598276 25638253 3 Sample74.1_6R04C01 1.76736 7 0.25248 39978 Gain QuantiSNP 99 | 1 25598276 25638253 3 Sample745.1_7R02C01 0.780356 6 0.130059333333333 39978 Gain QuantiSNP 100 | 1 25627470 25642596 3 Sample746.1_29R04C01 4.41411 6 0.735685 15127 Gain QuantiSNP 101 | 1 25598276 25638253 3 Sample75.1_9R04C01 14.5811 7 2.08301428571429 39978 Gain QuantiSNP 102 | 1 25598276 25638253 3 Sample83.1_4R05C01 4.46067 7 0.637238571428571 39978 Gain QuantiSNP 103 | 1 25598276 25629950 3 Sample94.1_12R08C01 0.698268 5 0.1396536 31675 Gain QuantiSNP 104 | 1 25598276 25642596 3 Sample98.1_4R06C01 9.87336 8 1.23417 44321 Gain QuantiSNP 105 | 1 25598276 25629950 3 SampleY-26.1_15R03C01 4.79464 5 0.958928 31675 Gain QuantiSNP 106 | 1 25598276 25642596 1 SampleY-34.1_17R06C01 28.4924 7 4.07034285714286 44321 Loss QuantiSNP 107 | 1 25627470 25642596 3 SampleY-46.1_3R07C01 1.20922 6 0.201536666666667 15127 Gain QuantiSNP 108 | 1 25598276 25642596 3 SampleY-55.1_6R03C01 5.26557 7 0.752224285714286 44321 Gain QuantiSNP 109 | 1 25598276 25642596 3 SampleY-60.1_23R04C01 1.83716 7 0.262451428571429 44321 Gain QuantiSNP 110 | 1 25598276 25642596 3 SampleY-66.1_13R04C01 6.33878 7 0.90554 44321 Gain QuantiSNP 111 | 1 25598276 25638253 1 Samplec103.1_7R04C01 10.4957 6 1.74928333333333 39978 Loss QuantiSNP 112 | 1 25627470 25638253 3 Samplec111.1_20R06C01 10.3684 5 2.07368 10784 Gain QuantiSNP 113 | 1 25627470 25642596 3 Samplec112.1_20R07C01 5.45517 6 0.909195 15127 Gain QuantiSNP 114 | 1 25598276 25638253 1 Samplec113.1_20R08C01 13.8458 6 2.30763333333333 39978 Loss QuantiSNP 115 | 1 25598276 25629950 3 Samplec115.1_12R07C01 0.489012 5 0.0978024 31675 Gain QuantiSNP 116 | 1 25598276 25642596 1 Samplec116.1_12R08C01 13.2119 7 1.88741428571429 44321 Loss QuantiSNP 117 | 1 25598276 25638253 3 Samplec124.1_8R05C01 0.783226 6 0.130537666666667 39978 Gain QuantiSNP 118 | 1 25598276 25638253 1 Samplec134.1_30R03C01 18.9541 6 3.15901666666667 39978 Loss QuantiSNP 119 | 1 25598276 25642596 1 Samplec16.1_26R06C01 28.4213 7 4.06018571428571 44321 Loss QuantiSNP 120 | 1 25598276 25629950 1 Samplec17.1_60R08C01 14.591 5 2.9182 31675 Loss QuantiSNP 121 | 1 25598276 25642596 3 Samplec20.1_65R07C01 0.319406 8 0.03992575 44321 Gain QuantiSNP 122 | 1 25598276 25642596 3 Samplec22.1_61R03C01 9.54339 8 1.19292375 44321 Gain QuantiSNP 123 | 1 25598276 25629950 3 Samplec27.1_78R06C01 0.0851005 5 0.0170201 31675 Gain QuantiSNP 124 | 1 25627470 25638253 3 Samplec31.1_58R06C01 0.593463 6 0.0989105 10784 Gain QuantiSNP 125 | 1 25598276 25642596 3 Samplec35.1_61R04C01 1.80077 8 0.22509625 44321 Gain QuantiSNP 126 | 1 25598276 25642596 3 Samplec55.1_53R03C01 0.173868 8 0.0217335 44321 Gain QuantiSNP 127 | 1 25598276 25638253 3 Samplec57.1_64R04C01 10.1527 7 1.45038571428571 39978 Gain QuantiSNP 128 | 1 25598276 25642596 3 Samplec58.1_61R01C01 12.8555 8 1.6069375 44321 Gain QuantiSNP 129 | 1 25598276 25638253 3 Samplec59.1_61R06C01 19.3747 7 2.76781428571429 39978 Gain QuantiSNP 130 | 1 25598276 25642596 1 Samplec6.1_7R08C01 14.5154 7 2.07362857142857 44321 Loss QuantiSNP 131 | 1 25598276 25629950 1 Samplec65.1_60R07C01 12.5598 5 2.51196 31675 Loss QuantiSNP 132 | 1 25598276 25642596 3 Samplec72.1_65R06C01 0.813839 8 0.101729875 44321 Gain QuantiSNP 133 | 1 25598276 25629950 3 Samplec76.1_12R05C01 9.24304 5 1.848608 31675 Gain QuantiSNP 134 | -------------------------------------------------------------------------------- /04_CNV_genotype/CNV.genotype.one.chr.one.batch.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript --vanilla 2 | 3 | # load packages 4 | suppressMessages({ 5 | require(optparse) 6 | require(dplyr) 7 | require(mixtools) 8 | require(ggplot2) 9 | require(cowplot) 10 | require(plyr) 11 | require(modeest) 12 | require(mclust) 13 | require(gridExtra) 14 | require(pheatmap) 15 | require(RColorBrewer) 16 | }) 17 | 18 | option_list = list( 19 | make_option(c("-c", "--chr"), action = "store", type = "character", default = NA, 20 | help = "Specify the chromosome on which the list of CNVRs to be genotyped is located."), 21 | make_option(c("-b", "--batch"), action = "store", type = "character", default = NA, 22 | help = "Specify the batch to which the list of CNVRs to be genotyped belongs."), 23 | make_option(c("-t", "--type"), action = "store", type = "character", default = NA, 24 | help = "Job submission type (0 - initial submission, 1 - resubmission of failed jobs)"), 25 | make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA, 26 | help = "Path to the directory containing necessary input data."), 27 | make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA, 28 | help = "Path to the directory for saving results."), 29 | make_option(c("-m", "--matrixpath"), action = "store", type = "character", default = NA, 30 | help = "Path to chromosome-wise LRR and BAF matrices."), 31 | make_option(c("-s", "--sourcefile"), action = "store", type = "character", default = NA, 32 | help = "Path to the scripts directory containing R scripts to be loaded into R."), 33 | make_option(c("-d", "--duplicates"), action = "store_true", default = FALSE, 34 | help = "[optional] Whether duplicate pairs information will be annotated in diagnosis plots."), 35 | make_option(c("-n", "--plot"), action = "store_true", default = FALSE, 36 | help = "[optional] Whether to generate diagnosis plots.") 37 | ) 38 | 39 | opt = parse_args(OptionParser(option_list = option_list)) 40 | pars = c(opt$chr, opt$batch, opt$type, opt$datapath, opt$resultpath, opt$matrixpath, opt$sourcefile) 41 | 42 | if ( any(is.na(pars)) ) { 43 | stop("All three parameters must be supplied. (--help for detail)") 44 | } 45 | 46 | chr1 <- as.integer( opt$chr ) 47 | batch1 <- as.integer( opt$batch ) 48 | type1 <- as.integer( opt$type ) 49 | 50 | path_data <- opt$datapath 51 | path_result <- opt$resultpath 52 | path_matrix <- opt$matrixpath 53 | path_sourcefile <- opt$sourcefile 54 | flag_png_plot <- opt$plot 55 | flag_duplicates <- opt$duplicates 56 | 57 | if ( type1 != 1 & type1 != 0) { 58 | stop("Job submission type must be 0 or 1. (--help for detail)") 59 | } 60 | ## print out parameters 61 | cat("Processing chr:", chr1, "batch:", batch1, "type:", type1, "\n") 62 | 63 | # source all the functions used in the pipeline 64 | source(file = file.path(path_sourcefile, 'fun_BAF.R')) 65 | source(file = file.path(path_sourcefile, 'fun_gatk.R')) 66 | source(file = file.path(path_sourcefile, 'fun_LRR.R')) 67 | source(file = file.path(path_sourcefile, 'fun_models.R')) 68 | source(file = file.path(path_sourcefile, 'fun_plot_steps.R')) 69 | source(file = file.path(path_sourcefile, 'fun_plot_diagnosis.R')) 70 | source(file = file.path(path_sourcefile, 'fun_plot_heatmap.R')) 71 | source(file = file.path(path_sourcefile, 'fun_pipeline_main.R')) 72 | 73 | ## use parameters 74 | ## PennCNV ( sample LRR mean and SD ) 75 | samples_LRR <- read.delim(file = file.path(path_data, "samples_QC.txt"), as.is = TRUE) 76 | samples_LRR$Sample_ID <- sub("\\.txt$", "", samples_LRR$File) 77 | 78 | ## dup pairs with column_name ( sample1.name sample2.name ) 79 | dup_pairs <- NULL # init 80 | if ( flag_duplicates ) { 81 | dup_pairs <- read.delim(file = file.path(path_data, "duplicate_pairs.txt"), as.is = TRUE) 82 | } 83 | 84 | ## paras_LRR ------------------------------------------------------ 85 | paras_LRR <- list(LRR_mean = list(CN_1 = -0.4156184, CN_3 = 0.1734862), 86 | LRR_sd = list(CN_1 = 0.2502591, CN_3 = 0.2249798)) ## sd for one SNP 87 | ## These parameters can be updated after the intial round of CNVgenotyping 88 | ## by selecting the CNVRs with well fitted GMM. 89 | 90 | # main part for runing on cluster ----------------------------------------- 91 | 92 | cat("read in CNV ...\n") 93 | dt_cnvs <- read.delim(file = file.path(path_data, "cnv_clean.txt"), as.is = TRUE) 94 | 95 | # PennCNV PFB information 96 | cat("read in PFB ...\n") 97 | dt_PFB <- read.table(file = file.path(path_data, "SNP.pfb"), sep = "\t", 98 | header = TRUE, as.is = TRUE, check.names = FALSE, 99 | comment.char = "") 100 | dt_PFB <- dt_PFB[, c("Name", "PFB", "Position")] # add Position information here 101 | 102 | # read in matrix dat of LRR and BAF 103 | cat("read in BAF matrix ...\n") 104 | file_BAF <- paste0("matrix_chr_", chr1, "_BAF.rds") 105 | dt_matrix_BAF <- readRDS(file = file.path( path_matrix, "BAF", file_BAF)) 106 | dt_matrix_BAF <- as.matrix(dt_matrix_BAF) 107 | 108 | cat("read in LRR matrix ...\n") 109 | file_LRR <- paste0("matrix_chr_", chr1, "_LRR.rds") 110 | dt_matrix_LRR <- readRDS(file = file.path( path_matrix, "LRR", file_LRR)) 111 | dt_matrix_LRR <- as.matrix(dt_matrix_LRR) 112 | 113 | samples <- rownames(dt_matrix_LRR) 114 | snps <- colnames(dt_matrix_LRR) 115 | n_snps <- length(snps) 116 | n_samples <- length(samples) 117 | 118 | # read in cnvrs dat ------------------------------------------------- 119 | create_path <- function(path_main, str_subpath) { 120 | 121 | path_sub = file.path(path_main, str_subpath) 122 | 123 | if ( !dir.exists(paths = path_sub) ) { 124 | dir.create(path = path_sub, showWarnings = FALSE, recursive = TRUE) 125 | } 126 | 127 | return( path_sub ) 128 | } 129 | 130 | # output pathsub_folder: summary/steps/diag/heatmap 131 | path_main <- path_result 132 | path_log <- create_path(path_main = path_main, str_subpath = "log") 133 | path_pred <- create_path(path_main = path_main, str_subpath = "pred") 134 | path_pars <- create_path(path_main = path_main, str_subpath = "pars") 135 | 136 | if ( flag_png_plot ) { 137 | path_png <- create_path(path_main = path_main, str_subpath = "png") 138 | path_heatmap <- create_path(path_main = path_png, str_subpath = "heatmap") 139 | } 140 | 141 | path_cnvrs_error <- create_path(path_main = path_main, str_subpath = "cnvrs_error") 142 | 143 | # add subfolders for each chr and each batch 144 | folder.name <- paste0("chr_", chr1, "_batch_", batch1) 145 | path_pred <- file.path(path_pred, folder.name) 146 | 147 | # test if folder exist 148 | if ( !dir.exists(path_pred) ) { 149 | dir.create(path = path_pred, showWarnings = FALSE, recursive = TRUE) 150 | } 151 | 152 | dt_cnvrs1 <- data.frame() 153 | cnvrs <- NULL 154 | if (type1 == 0) { 155 | 156 | file_cnvr <- "cnvr_batch.txt" ## with batch information 157 | dt_cnvrs <- read.delim(file = file.path(path_data, file_cnvr), as.is = TRUE) 158 | dt_cnvrs1 <- subset(dt_cnvrs, chr == chr1 & batch == batch1) 159 | cnvrs <- unique( dt_cnvrs1$CNVR_ID ) 160 | 161 | } else if (type1 == 1) { 162 | 163 | ## this path can be specified by users 164 | file_cnvr <- "cnvr_batch.txt" ## with batch information 165 | dt_cnvrs <- read.delim(file = file.path(path_data, file_cnvr), as.is = TRUE) 166 | 167 | cnvrs_error <- read.table(file = file.path(path_cnvrs_error, paste0("cnvrs_error_chr_", chr1, "_batch_", batch1, ".txt")), 168 | sep = "\t", header = T, check.names = F, stringsAsFactors = F) 169 | 170 | dt_cnvrs1 <- subset(dt_cnvrs, CNVR_ID %in% cnvrs_error$CNVR_ID) 171 | cnvrs <- unique( dt_cnvrs1$CNVR_ID ) 172 | } 173 | 174 | ## must be changed here to save each CNVRID data 175 | path_cnvr_stat <- file.path(path_result, "stats") 176 | dir.create(path = path_cnvr_stat, showWarnings = FALSE) 177 | 178 | res_pars_all <- data.frame() 179 | cnvrs_error <- c() 180 | # -------------------------------------------------------------------- 181 | for (i in 1:nrow(dt_cnvrs1)) { 182 | 183 | cnvr1 <- dt_cnvrs1$CNVR_ID[i] 184 | 185 | cat("cnvr1:", cnvr1, i, "in", nrow(dt_cnvrs1), "\n") 186 | 187 | snp_start <- dt_cnvrs1$start_snp[i] 188 | snp_end <- dt_cnvrs1$end_snp[i] 189 | 190 | ## snps have been sorted by their positions on chromosome 191 | ## when preparing chromosome-wise LRR and BAF matrices 192 | idx_start <- which(snps == snp_start) 193 | idx_end <- which(snps == snp_end) 194 | 195 | ## check idx_start and idx_end 196 | idxs <- c(idx_start, idx_end) 197 | if (length(idxs) != 2 | idx_start >= idx_end) { 198 | stop("CNVR boundaries are not consistency with SNP information.") 199 | } 200 | 201 | snps_name <- snps[idx_start:idx_end] # all snps in cnvr1 202 | 203 | # plot heatmap add 20 snps on the both side ---------------------------- 204 | ## idx_outer_start <- dt_cnvrs1$outer.start[i] 205 | ## idx_outer_end <- dt_cnvrs1$outer.end[i] 206 | idx_outer_start <- idx_start 207 | idx_outer_end <- idx_end 208 | idx_start_new <- ifelse((idx_outer_start - 20) <= 0, 1, idx_outer_start - 20) # new start and end for plot heatmap 209 | idx_end_new <- ifelse((idx_outer_end + 20) > n_snps, n_snps, idx_outer_end + 20) 210 | 211 | dt_lrr_heatmap = dt_matrix_LRR[, idx_start_new:idx_end_new] 212 | 213 | snps_name_heatmap <- snps[idx_start_new:idx_end_new] 214 | snps_name_all <- snps[idx_outer_start:idx_outer_end] 215 | # colnames(dt_lrr_heatmap) <- snps_name_heatmap 216 | 217 | snps_add <- setdiff(snps_name_heatmap, snps_name_all) 218 | snps_outer <- setdiff(snps_name_all, snps_name) 219 | snps_flag <- ifelse(snps_name_heatmap %in% snps_add, 0, 220 | ifelse(snps_name_heatmap %in% snps_name, 2, 1)) 221 | dt_snps_flag <- data.frame(snp_name = snps_name_heatmap, 222 | snp_flag = snps_flag, 223 | stringsAsFactors = FALSE) 224 | 225 | if ( flag_png_plot ) { 226 | filename_heatmap <- paste0("heatmap_", cnvr1, ".png") 227 | png(filename = file.path(file.path(path_png, "heatmap"), filename_heatmap), 228 | width = 12, height = 12, units = "in", res = 512) 229 | plot_heatmap(dt_lrr_heatmap = dt_lrr_heatmap, dt_snps_flag = dt_snps_flag) 230 | dev.off() 231 | } 232 | 233 | # ------------------------------------------------------------------- 234 | dt_baf = dt_matrix_BAF[, idx_start:idx_end] 235 | dt_lrr = dt_matrix_LRR[, idx_start:idx_end] 236 | 237 | numsnp <- idx_end - idx_start + 1 238 | samples_new <- rownames(dt_baf) ## need change in dt_cnvr_stat 239 | stopifnot( all(samples_new == samples) ) 240 | 241 | dt_cnvr_stat <- data.frame(CNVR_ID = cnvr1, 242 | Chr = chr1, 243 | BAF = as.vector(dt_baf), 244 | LRR = as.vector(dt_lrr), 245 | Sample_ID = rep(samples_new, numsnp), 246 | Name = rep(snps_name, each = length(samples_new)), 247 | numSNP = numsnp, 248 | stringsAsFactors = FALSE) 249 | 250 | dt_PFB1 <- subset(dt_PFB, Name %in% snps_name) 251 | dt_cnvr_stat <- merge(dt_cnvr_stat, dt_PFB1, all.x = TRUE) 252 | 253 | dt_samples_cn <- data.frame(Sample_ID = samples_new, stringsAsFactors = FALSE) 254 | 255 | dt_cnv <- subset(dt_cnvs, CNVR_ID == cnvr1) 256 | dt_cnv <- dt_cnv[, c("Sample_ID", "CN", "alg")] 257 | dt_samples_cn <- merge(dt_samples_cn, dt_cnv, all.x = TRUE) 258 | dt_samples_cn$CN[ which(is.na(dt_samples_cn$CN)) ] <- 2 259 | dt_samples_cn$alg[ which(is.na(dt_samples_cn$alg)) ] <- "other" 260 | 261 | dt_cnvr_stat <- merge(dt_cnvr_stat, dt_samples_cn, all.x = TRUE) 262 | 263 | ## save CNVR-stat data 264 | saveRDS(dt_cnvr_stat, file = file.path(path_cnvr_stat, paste0(cnvr1, "_stat.rds"))) 265 | 266 | # catch error and warning when calling CNVR 267 | res_pipeline_cnvr1 <- tryCatch({ 268 | pipeline_main(dt_cnvrs = dt_cnvr_stat, 269 | paras_LRR = paras_LRR, 270 | dup_pairs = dup_pairs, 271 | samples_LRR = samples_LRR, 272 | path_png = path_png, 273 | n.sample = n_samples, 274 | flag_png_plot = flag_png_plot) 275 | }, error = function(e) { 276 | NULL 277 | }, warning = function(w) { 278 | NULL 279 | }) 280 | 281 | if ( is.null(res_pipeline_cnvr1) ) { 282 | cnvrs_error <- c(cnvrs_error, cnvr1) 283 | next 284 | } 285 | 286 | res_gatk_pred_final <- res_pipeline_cnvr1$res_gatk_pred_final 287 | res_pars <- res_pipeline_cnvr1$res_pars 288 | 289 | cat( names(res_pars_all), "\n") 290 | cat( names(res_gatk_pred_final), "\n") 291 | 292 | # res_pred_all <- rbind(res_pred_all, res_gatk_pred_final) 293 | filename_cnvr1 <- paste0(cnvr1, "_pred.rds") 294 | saveRDS(res_gatk_pred_final, file = file.path(path_pred, filename_cnvr1)) 295 | 296 | res_pars_all <- rbind(res_pars_all, res_pars) 297 | } 298 | 299 | filename_pars <- paste0("CNVR_pars_chr_", chr1, "_batch_", batch1, ".rds") 300 | saveRDS(res_pars_all, file = file.path(path_pars, filename_pars)) # pars file 301 | 302 | if ( length(cnvrs_error) >= 1) { 303 | write.table(data.frame(CNVR_ID = cnvrs_error, stringsAsFactors = F), 304 | file = file.path(path_cnvrs_error, paste0("cnvrs_error_chr_", chr1, "_batch_", batch1, ".txt")), 305 | col.names = T, row.names = F, quote = F) 306 | } 307 | 308 | 309 | 310 | -------------------------------------------------------------------------------- /04_CNV_genotype/scripts/fun_pipeline_main.R: -------------------------------------------------------------------------------- 1 | 2 | # add GQ_score as parameter 3 | 4 | # transform model to data.frame 5 | trans_model <- function(model) { 6 | 7 | mus = model$mu 8 | sigmas = model$sigma 9 | lambdas = model$lambda 10 | 11 | res <- data.frame(mu0 = mus[1], mu1 = mus[2], mu2 = mus[3], mu3 = mus[4], 12 | sigma0 = sigmas[1], sigma1 = sigmas[2], sigma2 = sigmas[3], sigma3 = sigmas[4], 13 | lambda0 = lambdas[1], lambda1 = lambdas[2], lambda2 = lambdas[3], lambda3 = lambdas[4]) 14 | 15 | } 16 | 17 | # main pipeline function 18 | pipeline_main <- function(dt_cnvrs, paras_LRR, dup_pairs, samples_LRR, 19 | plot_steps = TRUE, path_png, GQ_score = 0, 20 | n.sample, flag_png_plot) { 21 | 22 | if (flag_png_plot) { 23 | path_png_diag <- create_path(path_main = path_png, str_subpath = "diag") 24 | path_png_steps <- create_path(path_main = path_png, str_subpath = "steps") 25 | path_png_summary <- create_path(path_main = path_png, str_subpath = "summary") 26 | } 27 | 28 | cnvr_id <- unique(dt_cnvrs$CNVR_ID) 29 | numsnp <- unique(dt_cnvrs$numSNP) # numsnp 30 | dt_cnvr <- process_cnvr_LRR(dt_cnvrs = dt_cnvrs, samples_LRR = samples_LRR) 31 | 32 | n_sample = nrow(dt_cnvr) 33 | stopifnot(n_sample == n.sample) ## change here for other dataset 34 | # set CN = 0 cutoff = -0.8 35 | dt_cnvr0 <- subset(dt_cnvr, LRR_median <= -0.8) 36 | n0 <- nrow(dt_cnvr0) # set cutoff of n0 is 5 37 | 38 | dt_cnvr_train <- subset(dt_cnvr, LRR_median > - 0.8 & CN != 0) # all confirmed CN = 1/2/3 39 | 40 | # if dt_cnvr_train 41 | if (nrow(dt_cnvr_train) == 0) { 42 | 43 | res_pars <- NULL 44 | 45 | res_gatk_pred_final <- data.frame(Sample_ID = dt_cnvr$Sample_ID, CN = dt_cnvr$CN, 46 | CNVR_ID = cnvr_id, CN_gatk_pred = 0, 47 | value_GQ = 100, stringsAsFactors = FALSE) 48 | 49 | 50 | res <- list(res_gatk_pred_final = res_gatk_pred_final, 51 | res_pars = res_pars) ## paras for each CNVR_ID 52 | 53 | } else { 54 | 55 | res_paras <- train_model_zz(dt_cnvr = dt_cnvr_train, paras_LRR = paras_LRR) 56 | 57 | paras_all <- res_paras$paras_all # final paras from gmm model 58 | paras_model <- res_paras$paras_model # step paras for plot diagnosis 59 | 60 | ## all predict result 61 | mu1 <- paras_all$mus[1] 62 | sigma1 <- paras_all$sigmas[1] 63 | mu2 <- paras_all$mus[2] 64 | sigma2 <- paras_all$sigmas[2] 65 | mu3 <- paras_all$mus[3] 66 | sigma3 <- paras_all$sigmas[3] 67 | 68 | 69 | cat("parameter_gmm:\n") 70 | cat(paras_all$mus, "\n") 71 | cat(paras_all$sigmas, "\n") 72 | cat(paras_all$lambdas, "\n") 73 | # save diagnosis png 74 | ## plot diagnosis 75 | if ( flag_png_plot ) { 76 | file_diagnosis <- paste0("diag_", cnvr_id, ".png") 77 | png(filename = file.path(path_png_diag, file_diagnosis), width = 12, height = 12, units = "in", res = 512) 78 | plot_gmm_diagnosis(dt_cnvr = dt_cnvr_train, paras_model = paras_model) 79 | dev.off() 80 | } 81 | 82 | # set CN = 0 83 | mu0 <- -3 84 | # sigma0 <- sigma1*10 ## MUST BE CHANGED 85 | sigma0 <- 0.8*0.8 86 | if (n0 != 0) { 87 | if (n0 >= 5) { 88 | mu0 <- median(dt_cnvr0$LRR_median) 89 | sigma0 <- sd(dt_cnvr0$LRR_median) 90 | } 91 | } 92 | 93 | model1 <- list() 94 | model1$mu <- c(mu0, paras_all$mus) 95 | model1$sigma <- c(sigma0, paras_all$sigmas) ## 2212 samples number 96 | model1$lambda <- c(n0/n_sample, ((n_sample - n0)/n_sample)*paras_all$lambdas) 97 | 98 | # add cutoff of pbf to select SNP 99 | dt_cnvrs_BAF <- subset(dt_cnvrs, PFB <= 0.99 & PFB >= 0.01) 100 | 101 | cat("nrow BAF data:", nrow(dt_cnvrs_BAF), "\n") 102 | flag_BAF <- ifelse(nrow(dt_cnvrs_BAF) == 0, 0, 1) 103 | 104 | numsnp.used <- 0 105 | numsnp.raw <- unique(dt_cnvrs$numSNP) 106 | # calculate BAF 107 | if (flag_BAF == 1) { 108 | dt_BAF1 <- calculate_BAF_gatk_whole(dt_cnvrs = dt_cnvrs_BAF) 109 | numsnp.used <- length(unique(dt_cnvrs_BAF$Name)) ## 110 | cat("numsnp.used:", numsnp.used, "\n") 111 | } 112 | 113 | # calculate LRR 114 | dt_LRR1 <- output_LRR_gatk(dt_cnvr = dt_cnvr, model = model1) 115 | 116 | # add to deal with 0 value in each column ----------------- 117 | idx0.inf <- which(dt_LRR1$LRR0 == 0) 118 | if (length(idx0.inf) >= 1) { 119 | dt_LRR1$LRR0[idx0.inf] <- 1e-10 120 | } 121 | 122 | idx1.inf <- which(dt_LRR1$LRR1 == 0) 123 | if (length(idx1.inf) >= 1) { 124 | dt_LRR1$LRR1[idx1.inf] <- 1e-10 125 | } 126 | 127 | idx2.inf <- which(dt_LRR1$LRR2 == 0) 128 | if (length(idx2.inf) >= 1) { 129 | dt_LRR1$LRR2[idx2.inf] <- 1e-10 130 | } 131 | 132 | idx3.inf <- which(dt_LRR1$LRR3 == 0) 133 | if (length(idx3.inf) >= 1) { 134 | dt_LRR1$LRR3[idx3.inf] <- 1e-10 135 | } 136 | 137 | # ========================================== 138 | 139 | dt_LRRBAF1 <- data.frame() 140 | res_gatk_pred1 <- data.frame() 141 | mean1_GQ <- 0 142 | if (flag_BAF == 1) { 143 | dt_LRRBAF1 <- merge(dt_LRR1, dt_BAF1) 144 | res_gatk_pred1 <- output_gatk_result(dt_LRRBAF = dt_LRRBAF1) # dt_LRRBAF = dt_LRR1 145 | mean1_GQ <- mean(res_gatk_pred1$value_GQ) 146 | cat("mean1_GQ:", mean1_GQ, "nrow gatk_pred1:", nrow(res_gatk_pred1),"\n") 147 | } else { 148 | dt_LRRBAF1 <- dt_LRR1 149 | res_gatk_pred1 <- output_gatk_result_LRR(dt_LRRBAF = dt_LRRBAF1) 150 | mean1_GQ <- mean(res_gatk_pred1$value_GQ) 151 | } 152 | 153 | 154 | ## save steps_1_ png 155 | if (flag_BAF == 1 & flag_png_plot) { 156 | file_steps <- paste0("steps_1_", cnvr_id, ".png") 157 | png(filename = file.path(path_png_steps, file_steps), width = 12, height = 12, units = "in", res = 512) 158 | plot_steps(dt_cnvr_train = dt_cnvr_train, dup_pairs = dup_pairs, dt_cnvr_raw = dt_cnvr, 159 | paras = paras_all, dt_LRRBAF = res_gatk_pred1) ## here 160 | dev.off() 161 | } 162 | 163 | # if number of CN = 1 <= 2* CN= 0 164 | # re_model 165 | n0_new <- sum(res_gatk_pred1$CN_gatk_pred == 0) 166 | n1_new <- sum(res_gatk_pred1$CN_gatk_pred == 1) 167 | # n0_new 168 | # n1_new 169 | 170 | # hardy weinberg test 171 | res_gatk_pred_final <- NULL 172 | model_final <- NULL 173 | if (n1_new >= n0_new | (paras_all$lambdas[2] >= 0.9)) { 174 | res_gatk_pred_final <- res_gatk_pred1 175 | model_final <- model1 ## final model 176 | } else { 177 | mu1 <- paras_all$mus[2] 178 | sigma1 <- paras_all$sigmas[2] 179 | mu2 <- paras_all$mus[3] 180 | sigma2 <- paras_all$sigmas[3] 181 | mu3 <- paras_all$mus[3] + paras_LRR$LRR_mean$CN_3 182 | sigma3 <- paras_LRR$LRR_sd$CN_3/sqrt(numsnp) 183 | 184 | if (n0 == 0) { 185 | model2 <- normalmixEM(x = dt_cnvr$LRR_median, k = 3, 186 | mean.constr = c(mu1, mu2, mu3), 187 | sd.constr = c(sigma1, sigma2, sigma3)) 188 | # add CN = 0 parameters 189 | model2$mu <- c(mu0, model2$mu) 190 | model2$sigma <- c(sigma0, model2$sigma) 191 | model2$lambda <- c(0, model2$lambda) 192 | } else { 193 | model2 <- normalmixEM(x = dt_cnvr$LRR_median, k = 4, 194 | mean.constr = c(mu0, mu1, mu2, mu3), 195 | sd.constr = c(sigma0, sigma1, sigma2, sigma3)) 196 | } 197 | 198 | # calculate LRR 199 | dt_LRR2 <- output_LRR_gatk(dt_cnvr = dt_cnvr, model = model2) 200 | 201 | idx0.inf <- which(dt_LRR2$LRR0 == 0) 202 | if (length(idx0.inf) >= 1) { 203 | dt_LRR2$LRR0[idx0.inf] <- 1e-10 204 | } 205 | 206 | idx1.inf <- which(dt_LRR2$LRR1 == 0) 207 | if (length(idx1.inf) >= 1) { 208 | dt_LRR2$LRR1[idx1.inf] <- 1e-10 209 | } 210 | 211 | idx2.inf <- which(dt_LRR2$LRR2 == 0) 212 | if (length(idx2.inf) >= 1) { 213 | dt_LRR2$LRR2[idx2.inf] <- 1e-10 214 | } 215 | 216 | idx3.inf <- which(dt_LRR2$LRR3 == 0) 217 | if (length(idx3.inf) >= 1) { 218 | dt_LRR2$LRR3[idx3.inf] <- 1e-10 219 | } 220 | 221 | res_gatk_pred2 <- data.frame() 222 | mean2_GQ <- 0 223 | cat("calculate mean2_GQ.\n") 224 | # calculate BAF 225 | if (flag_BAF == 1) { 226 | dt_BAF2 <- dt_BAF1 ## save as dt_BAF1 227 | dt_LRRBAF2 <- merge(dt_LRR2, dt_BAF2) 228 | res_gatk_pred2 <- output_gatk_result(dt_LRRBAF = dt_LRRBAF2) 229 | mean2_GQ <- mean(res_gatk_pred2$value_GQ) 230 | } else { 231 | dt_LRRBAF2 <- dt_LRR2 232 | res_gatk_pred2 <- output_gatk_result_LRR(dt_LRRBAF = dt_LRRBAF2) 233 | mean2_GQ <- mean(res_gatk_pred2$value_GQ) 234 | } 235 | 236 | cat(mean1_GQ, mean2_GQ, "\n") 237 | 238 | res_gatk_pred_final <- res_gatk_pred2 239 | model_final <- model2 ## model final 240 | } 241 | 242 | ## save steps_2 png 243 | if (flag_BAF == 1 & flag_png_plot) { 244 | file_steps <- paste0("steps_2_", cnvr_id, ".png") 245 | png(filename = file.path(path_png_steps, file_steps), width = 12, height = 12, units = "in", res = 512) 246 | plot_steps(dt_cnvr_train = dt_cnvr_train, dup_pairs = dup_pairs, dt_cnvr_raw = dt_cnvr, 247 | paras = paras_all, dt_LRRBAF = res_gatk_pred_final) ## here 248 | dev.off() 249 | } 250 | 251 | 252 | # add GQ_score cutoff here 253 | idxs_nocall <- which(res_gatk_pred_final$value_GQ <= GQ_score) 254 | call_rate <- 1 - length(idxs_nocall)/nrow(res_gatk_pred_final) 255 | if (length(idxs_nocall) >= 1) { 256 | res_gatk_pred_final$CN_gatk_pred[idxs_nocall] <- 4 257 | } 258 | if ( flag_png_plot ) { 259 | 260 | # plot for final 261 | # for new input must ordered as follow 262 | # dt_pfb <- dt_cnvrs[order(dt_cnvrs$Sample_ID, dt_cnvrs$Position), ] 263 | dt_pfb <- dt_cnvrs[1:numsnp, ] 264 | dt_pfb$MAF <- pmin(dt_pfb$PFB, 1 - dt_pfb$PFB) ## 265 | dt_pfb <- dt_pfb[, c("Name", "MAF")] 266 | 267 | plot_MAF <- ggplot(data = dt_pfb, aes(Name, MAF)) + 268 | geom_col() + 269 | ggtitle(label = paste("snps MAF in Position order", "numSNP:", numsnp)) + 270 | labs(x = "SNP Name") + 271 | theme_bw(base_size = 9) + 272 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 273 | 274 | # final 275 | 276 | cat("model_final_parameter:\n") 277 | cat(model_final$mu, "\n") 278 | cat(model_final$sigma, "\n") 279 | cat(model_final$lambda, "\n") 280 | plot_final <- plot_model_final(paras = model_final, dt_cnvr = dt_cnvr_train, 281 | title = paste("final model for", cnvr_id, "numSNP:", numsnp)) 282 | 283 | 284 | # scatter plot LRR_median 285 | dt_pred <- res_gatk_pred_final[, c("Sample_ID", "CN_gatk_pred")] 286 | dt_cnvr_scatter <- merge(dt_cnvr, dt_pred) 287 | dt_cnvr_scatter <- dt_cnvr_scatter[order(dt_cnvr_scatter$CN), ] 288 | dt_cnvr_scatter$idx <- 1:nrow(dt_cnvr_scatter) 289 | myColors <- brewer.pal(4, "Set1") 290 | plot_raw <- ggplot() + 291 | geom_point(data = subset(dt_cnvr_scatter, CN == 0), aes(idx, LRR_median), col = "black") + 292 | geom_point(data = subset(dt_cnvr_scatter, CN == 1), aes(idx, LRR_median), col = "red") + 293 | geom_point(data = subset(dt_cnvr_scatter, CN == 2), aes(idx, LRR_median), col = "green") + 294 | geom_point(data = subset(dt_cnvr_scatter, CN == 3), aes(idx, LRR_median), col = "blue") + 295 | theme_bw(base_size = 10) + 296 | ggtitle(label = "CNV call from IPQ") 297 | 298 | # plot_raw 299 | # add gray color point here 300 | plot_gatk <- ggplot() + 301 | geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 0), aes(idx, LRR_median), col = "black") + 302 | geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 1), aes(idx, LRR_median), col = "red") + 303 | geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 2), aes(idx, LRR_median), col = "green") + 304 | geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 3), aes(idx, LRR_median), col = "blue") + 305 | geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 4), aes(idx, LRR_median), col = "gray") + 306 | theme_bw(base_size = 10) + 307 | ggtitle(label = "CNV call from gatk similary method", 308 | subtitle = paste("GQ score:", GQ_score, "call rate:", round(call_rate, 3), 309 | "numsnp.raw:", numsnp.raw, "numsnp.used:", numsnp.used)) 310 | 311 | # plot_gatk 312 | filefinal <- paste0("summary_", cnvr_id, ".png") 313 | png(filename = file.path(path_png_summary, filefinal), 314 | width = 12, height = 12, units = "in", res = 512) 315 | grid.arrange(plot_MAF, plot_final, plot_raw, plot_gatk, nrow = 2) 316 | dev.off() 317 | 318 | } 319 | 320 | res_pars <- trans_model(model = model_final) 321 | res_pars$CNVR_ID = cnvr_id 322 | res_pars$numSNP = numsnp 323 | 324 | # res_gatk_pred_final ## final result 325 | res <- list(res_gatk_pred_final = res_gatk_pred_final, 326 | res_pars = res_pars) ## paras for each CNVR_ID 327 | 328 | } 329 | 330 | } 331 | 332 | 333 | 334 | --------------------------------------------------------------------------------