├── example
    ├── example_create_CNVR
    │   ├── results
    │   │   ├── cnvr_create.txt
    │   │   ├── cnvr_boundary.txt
    │   │   ├── cnvr_clean.txt
    │   │   ├── cnv.penncnv.txt
    │   │   ├── cnv.ipattern.txt
    │   │   └── cnv.quantisnp.txt
    │   ├── data
    │   │   ├── centromere_hg19.txt
    │   │   ├── CNV.PennCNV_new.txt
    │   │   └── iPattern_all_calls.txt
    │   └── README.md
    ├── example_CNV_genotype
    │   ├── RDS
    │   │   ├── BAF
    │   │   │   └── matrix_chr_1_BAF.rds
    │   │   └── LRR
    │   │   │   └── matrix_chr_1_LRR.rds
    │   ├── results
    │   │   ├── pars
    │   │   │   └── CNVR_pars_chr_1_batch_2.rds
    │   │   ├── stats
    │   │   │   └── CNVR_79_r1_chr1_p_stat.rds
    │   │   ├── png
    │   │   │   ├── diag
    │   │   │   │   └── diag_CNVR_79_r1_chr1_p.png
    │   │   │   ├── steps
    │   │   │   │   ├── steps_1_CNVR_79_r1_chr1_p.png
    │   │   │   │   └── steps_2_CNVR_79_r1_chr1_p.png
    │   │   │   ├── heatmap
    │   │   │   │   └── heatmap_CNVR_79_r1_chr1_p.png
    │   │   │   └── summary
    │   │   │   │   └── summary_CNVR_79_r1_chr1_p.png
    │   │   └── pred
    │   │   │   └── chr_1_batch_2
    │   │   │       └── CNVR_79_r1_chr1_p_pred.rds
    │   ├── data
    │   │   ├── cnvr_batch.txt
    │   │   └── duplicate_pairs.txt
    │   └── README.md
    └── example_boundary_refinement
    │   ├── RDS
    │       └── LRR
    │       │   └── matrix_chr_2_LRR.rds
    │   ├── results
    │       ├── res_refine
    │       │   └── chr2
    │       │   │   ├── data
    │       │   │       └── CNVR_refine_chr_2_detail.rds
    │       │   │   └── png
    │       │   │       └── CNVR_163_r1_chr2_q_boundary_refinement.png
    │       └── cnvr_refine.txt
    │   ├── data
    │       ├── centromere_hg19.txt
    │       └── SNP_pos.txt
    │   └── README.md
├── 01_initial_call
    ├── run_iPattern
    │   ├── ref_files_hg19
    │   │   └── pq.txt
    │   ├── prepare_input_files_for_iPattern.R
    │   └── README.md
    ├── run_QuantiSNP
    │   ├── step.3.combine.QuantiSNP.pl
    │   ├── README.md
    │   ├── step.1.prepare.QuantiSNP.R
    │   └── step.2.check.QuantiSNP.R
    ├── run_PennCNV
    │   ├── step.4.combine.PennCNV.res.pl
    │   ├── step.5.clean.PennCNV.res.R
    │   ├── step.3.check.PennCNV.jobs.R
    │   ├── step.2.run.PennCNV.jobs.R
    │   └── README.md
    └── finalreport_to_matrix_LRR_and_BAF
    │   └── transform_from_tab_to_rds.R
├── 04_CNV_genotype
    ├── scripts
    │   ├── fun_plot_heatmap.R
    │   ├── fun_gatk.R
    │   ├── fun_plot_diagnosis.R
    │   ├── fun_LRR.R
    │   ├── fun_BAF.R
    │   ├── fun_plot_steps.R
    │   └── fun_pipeline_main.R
    ├── step.1.split.cnvrs.into.batches.R
    ├── step.4.prediction.results.R
    ├── step.2.submit.jobs.R
    ├── step.3.check.and.resubmit.jobs.R
    └── CNV.genotype.one.chr.one.batch.R
├── 02_batch_effect
    ├── PCA_on_LRR
    │   ├── step.1.down.sampling.R
    │   ├── step.3.LRR.pca.R
    │   └── step.2.LRR.matrix.pl
    └── PCA_on_summary_stats
    │   ├── step.2.stats.PCA.R
    │   └── step.1.prepare.stats.R
├── 05_boundary_refinement
    ├── refine.cpp
    ├── step.4.update.genotype.matrix.R
    ├── step.1.common.CNVR.to.refine.R
    ├── step.2.submit.jobs.R
    └── step.3.clean.results.R
├── create_new_project.sh
├── 06_performance_assessment
    ├── step.2.set.GQ.generate.results.R
    └── step.1.performance.assessment.R
└── 03_create_CNVR
    └── step.1.CNV.data.R


/example/example_create_CNVR/results/cnvr_create.txt:
--------------------------------------------------------------------------------
1 | CNVR_ID	outer.start	outer.end	nCNV	chr	arm
2 | CNVR_1_r1_chr1_p	11259	11268	233	1	p
3 | 


--------------------------------------------------------------------------------
/example/example_CNV_genotype/RDS/BAF/matrix_chr_1_BAF.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/RDS/BAF/matrix_chr_1_BAF.rds


--------------------------------------------------------------------------------
/example/example_CNV_genotype/RDS/LRR/matrix_chr_1_LRR.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/RDS/LRR/matrix_chr_1_LRR.rds


--------------------------------------------------------------------------------
/example/example_boundary_refinement/RDS/LRR/matrix_chr_2_LRR.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_boundary_refinement/RDS/LRR/matrix_chr_2_LRR.rds


--------------------------------------------------------------------------------
/example/example_CNV_genotype/results/pars/CNVR_pars_chr_1_batch_2.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/pars/CNVR_pars_chr_1_batch_2.rds


--------------------------------------------------------------------------------
/example/example_CNV_genotype/results/stats/CNVR_79_r1_chr1_p_stat.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/stats/CNVR_79_r1_chr1_p_stat.rds


--------------------------------------------------------------------------------
/example/example_CNV_genotype/results/png/diag/diag_CNVR_79_r1_chr1_p.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/diag/diag_CNVR_79_r1_chr1_p.png


--------------------------------------------------------------------------------
/example/example_CNV_genotype/results/png/steps/steps_1_CNVR_79_r1_chr1_p.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/steps/steps_1_CNVR_79_r1_chr1_p.png


--------------------------------------------------------------------------------
/example/example_CNV_genotype/results/png/steps/steps_2_CNVR_79_r1_chr1_p.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/steps/steps_2_CNVR_79_r1_chr1_p.png


--------------------------------------------------------------------------------
/example/example_CNV_genotype/results/png/heatmap/heatmap_CNVR_79_r1_chr1_p.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/heatmap/heatmap_CNVR_79_r1_chr1_p.png


--------------------------------------------------------------------------------
/example/example_CNV_genotype/results/png/summary/summary_CNVR_79_r1_chr1_p.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/png/summary/summary_CNVR_79_r1_chr1_p.png


--------------------------------------------------------------------------------
/example/example_CNV_genotype/results/pred/chr_1_batch_2/CNVR_79_r1_chr1_p_pred.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_CNV_genotype/results/pred/chr_1_batch_2/CNVR_79_r1_chr1_p_pred.rds


--------------------------------------------------------------------------------
/example/example_create_CNVR/results/cnvr_boundary.txt:
--------------------------------------------------------------------------------
1 | CNVR_ID	outer.start	outer.end	nCNV	chr	arm	posStart	posEnd	start_snp	end_snp	nPeak
2 | CNVR_1_r1_chr1_p	11259	11268	233	1	p	25598276	25642596	rs2517979	rs28393458	1
3 | 


--------------------------------------------------------------------------------
/example/example_create_CNVR/results/cnvr_clean.txt:
--------------------------------------------------------------------------------
1 | CNVR_ID	outer.start	outer.end	nCNV	chr	arm	posStart	posEnd	start_snp	end_snp	nPeak	Freq
2 | CNVR_1_r1_chr1_p	11259	11268	233	1	p	25598276	25642596	rs2517979	rs28393458	1	233
3 | 


--------------------------------------------------------------------------------
/example/example_CNV_genotype/data/cnvr_batch.txt:
--------------------------------------------------------------------------------
1 | CNVR_ID	outer.start	outer.end	nCNV	chr	arm	posStart	posEnd	start_snp	end_snp	nPeak	Freq	batch
2 | CNVR_79_r1_chr1_p	11259	11268	326	1	p	25598276	25642596	rs2517979	rs28393458	1	233	2
3 | 


--------------------------------------------------------------------------------
/example/example_boundary_refinement/results/res_refine/chr2/data/CNVR_refine_chr_2_detail.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_boundary_refinement/results/res_refine/chr2/data/CNVR_refine_chr_2_detail.rds


--------------------------------------------------------------------------------
/example/example_boundary_refinement/results/res_refine/chr2/png/CNVR_163_r1_chr2_q_boundary_refinement.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HaoKeLab/ensembleCNV/HEAD/example/example_boundary_refinement/results/res_refine/chr2/png/CNVR_163_r1_chr2_q_boundary_refinement.png


--------------------------------------------------------------------------------
/example/example_boundary_refinement/results/cnvr_refine.txt:
--------------------------------------------------------------------------------
1 | CNVR_ID	outer.start	outer.end	nCNV	chr	arm	posStart	posEnd	start_snp	end_snp	nPeak	raw_Freq	batch	genotype	Freq
2 | CNVR_163_r1_chr2_q	73662	73669	2	2	q	242929164	242971871	rs13432136	rs13431327	1	2	1	1	79
3 | 


--------------------------------------------------------------------------------
/01_initial_call/run_iPattern/ref_files_hg19/pq.txt:
--------------------------------------------------------------------------------
 1 | 1	125000000
 2 | 2	93300000
 3 | 3	91000000
 4 | 4	50400000
 5 | 5	48400000
 6 | 6	61000000
 7 | 7	59900000
 8 | 8	45600000
 9 | 9	49000000
10 | 10	40200000
11 | 11	53700000
12 | 12	35800000
13 | 13	17900000
14 | 14	17600000
15 | 15	19000000
16 | 16	36600000
17 | 17	24000000
18 | 18	17200000
19 | 19	26500000
20 | 20	27500000
21 | 21	13200000
22 | 22	14700000
23 | X	60600000
24 | Y	12500000
25 | 


--------------------------------------------------------------------------------
/example/example_create_CNVR/data/centromere_hg19.txt:
--------------------------------------------------------------------------------
 1 | chr	position
 2 | 1	125000000
 3 | 2	93300000
 4 | 3	91000000
 5 | 4	50400000
 6 | 5	48400000
 7 | 6	61000000
 8 | 7	59900000
 9 | 8	45600000
10 | 9	49000000
11 | 10	40200000
12 | 11	53700000
13 | 12	35800000
14 | 13	17900000
15 | 14	17600000
16 | 15	19000000
17 | 16	36600000
18 | 17	24000000
19 | 18	17200000
20 | 19	26500000
21 | 20	27500000
22 | 21	13200000
23 | 22	14700000
24 | X	60600000
25 | Y	12500000
26 | 


--------------------------------------------------------------------------------
/example/example_boundary_refinement/data/centromere_hg19.txt:
--------------------------------------------------------------------------------
 1 | chr	position
 2 | 1	125000000
 3 | 2	93300000
 4 | 3	91000000
 5 | 4	50400000
 6 | 5	48400000
 7 | 6	61000000
 8 | 7	59900000
 9 | 8	45600000
10 | 9	49000000
11 | 10	40200000
12 | 11	53700000
13 | 12	35800000
14 | 13	17900000
15 | 14	17600000
16 | 15	19000000
17 | 16	36600000
18 | 17	24000000
19 | 18	17200000
20 | 19	26500000
21 | 20	27500000
22 | 21	13200000
23 | 22	14700000
24 | X	60600000
25 | Y	12500000
26 | 


--------------------------------------------------------------------------------
/example/example_create_CNVR/data/CNV.PennCNV_new.txt:
--------------------------------------------------------------------------------
1 | 1	25598276	25638253	1	Sample536.1_46R06C01.txt	rs2517979	rs2427759	16.242	7
2 | 1	25598276	25642596	3	Sample353.1_80R07C01.txt	rs2517979	rs28393458	11.247	8
3 | 1	25598276	25638253	1	Sample384.1_66R04C01.txt	rs2517979	rs2427759	17.075	7
4 | 1	25598276	25642596	1	Samplec82.1_16R05C01.txt	rs2517979	rs28393458	12.154	8
5 | 1	25598276	25638253	1	Samplec134.1_8R08C01.txt	rs2517979	rs2427759	12.341	7
6 | 1	25598276	25629950	1	Sample292.1_70R08C01.txt	rs2517979	exm32925	13.511	5
7 | 1	25598276	25638253	1	Sample541.1_37R04C01.txt	rs2517979	rs2427759	12.084	7
8 | 


--------------------------------------------------------------------------------
/example/example_CNV_genotype/data/duplicate_pairs.txt:
--------------------------------------------------------------------------------
 1 | sample1.name	sample2.name
 2 | Sample167.1_44R03C01	Sample167.1_5R08C01
 3 | Sample282.1_27R08C01	Sample282.1_2R03C01
 4 | Sample312.1_64R03C01	Sample312.1_80R01C01
 5 | Sample402.1_69R08C01	Sample402.1_74R03C01
 6 | Sample408.1_15R03C01	Sample408.1_36R08C01
 7 | Sample563.1_25R03C01	Sample563.1_56R08C01
 8 | Sample713.1_15R08C01	Sample713.1_25R03C01
 9 | Samplec134.1_30R03C01	Samplec134.1_8R08C01
10 | Samplec55.1_53R03C01	Samplec55.1_58R08C01
11 | Sample170.1_15R08C01	Sample203.1_18R06C01
12 | Sample697.1_24R05C01	Samplec97.1_10R06C01
13 | SampleY-27.1_1R01C01	SampleY-6.1_6R01C01
14 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/scripts/fun_plot_heatmap.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # plot heatmap
 3 | 
 4 | plot_heatmap <- function(dt_lrr_heatmap, dt_snps_flag) {
 5 |   
 6 |   cor1 <- cor(dt_lrr_heatmap, use = "na.or.complete")
 7 |   
 8 |   groups1 <- ifelse(dt_snps_flag$snp_flag == 0, "snps_add", "snps_raw")
 9 |   groups2 <- ifelse(dt_snps_flag$snp_flag == 2, "inner_boundary", "outer_boundary")
10 |   annotation_col1 <- data.frame(
11 |     group1 = groups1,
12 |     group2 = groups2
13 |   )
14 |   rownames(annotation_col1) <- colnames(dt_lrr_heatmap)
15 |   
16 |   pheatmap(cor1, 
17 |            cluster_cols = FALSE,
18 |            cluster_rows = FALSE,
19 |            annotation_col = annotation_col1)
20 |   
21 | }
22 | 


--------------------------------------------------------------------------------
/example/example_create_CNVR/results/cnv.penncnv.txt:
--------------------------------------------------------------------------------
1 | chr	posStart	posEnd	CN	Sample_ID	conf	numSNP	avgConf	length	CNV_type	method
2 | 1	25598276	25638253	1	Sample536.1_46R06C01	16.242	7	2.32028571428571	39978	Loss	PennCNV
3 | 1	25598276	25642596	3	Sample353.1_80R07C01	11.247	8	1.405875	44321	Gain	PennCNV
4 | 1	25598276	25638253	1	Sample384.1_66R04C01	17.075	7	2.43928571428571	39978	Loss	PennCNV
5 | 1	25598276	25642596	1	Samplec82.1_16R05C01	12.154	8	1.51925	44321	Loss	PennCNV
6 | 1	25598276	25638253	1	Samplec134.1_8R08C01	12.341	7	1.763	39978	Loss	PennCNV
7 | 1	25598276	25629950	1	Sample292.1_70R08C01	13.511	5	2.7022	31675	Loss	PennCNV
8 | 1	25598276	25638253	1	Sample541.1_37R04C01	12.084	7	1.72628571428571	39978	Loss	PennCNV
9 | 


--------------------------------------------------------------------------------
/02_batch_effect/PCA_on_LRR/step.1.down.sampling.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # generate the list of 100000 randomly selected SNPs
 4 | # Input is SNP_pos.txt
 5 | # including 3 columns: Name, Chr, Position
 6 | 
 7 | suppressMessages({
 8 |   require( data.table, quietly = TRUE)
 9 | })
10 | 
11 | args <- commandArgs( trailingOnly = TRUE )
12 | file_snps <- args[1]   ## SNP_pos.txt
13 | path_output <- args[2] ## path to save randomly selected SNPs
14 | 
15 | ## sampleing from chr: 1-22
16 | dat_snps <- fread( input = file_snps )
17 | dat_snps <- as.data.frame(dat_snps, stringsAsFactors = FALSE)
18 | dat_snps <- subset(dat_snps, Chr %in% 1:22)
19 | 
20 | snps <- sample( dat_snps$Name )
21 | snps.selected <- snps[ 1:100000 ]
22 | 
23 | write.table(snps.selected, file = file.path(path_output, "snps.down.sample.txt"),
24 |             sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
25 | 


--------------------------------------------------------------------------------
/01_initial_call/run_QuantiSNP/step.3.combine.QuantiSNP.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | ## The script was used to run QuantiSNP on Minerva high performance cluster.
 4 | ## You need to modifiy it according to the system you are using if you would like to use it.
 5 | ## Please refer to original QuantiSNP documents (https://sites.google.com/site/quantisnp/) for more information 
 6 | 
 7 | use strict;
 8 | use Getopt::Long;
 9 | 
10 | my $in_dir="";   ## input directory
11 | my $out_dir="";  ## output directory
12 | 
13 | GetOptions("in_dir=s" => \$in_dir,
14 | 		   "out_dir=s" => \$out_dir);
15 | 
16 | my $out_file=$out_dir."/quantisnp.cnv";
17 | 
18 | opendir(DIR, $in_dir) or die "cannot open $in_dir: $!"; ## 2018-12-11
19 | open(OUT1, ">", $out_file) or die $!;
20 | 
21 | my $flag = 1;
22 | while (defined(my $folder = readdir(DIR))) {
23 | 	
24 | 	next if ($folder=~/^\./);
25 | 	##next if ($folder=~/^INTERNAL/);
26 | 	##next if ($folder=~/^CONTROL/);
27 | 	
28 | 	my $filename=$folder.".cnv"; ## $folder is Sample ID
29 | 	my $file=$in_dir."/".$folder."/".$filename;
30 | 	
31 | 	#print "$flag", "$file\n";
32 | 	open(IN1, "<$file") or die $!;
33 | 	while (my $line=<IN1>) {
34 | 		next if ($line=~/^Sample/);
35 | 		print OUT1 $line;
36 | 	}
37 | 
38 | 	close IN1;
39 | 	$flag = $flag + 1;
40 | }
41 | 
42 | close OUT1;
43 | 
44 | print "Analysis completed!\n";
45 | 


--------------------------------------------------------------------------------
/example/example_boundary_refinement/data/SNP_pos.txt:
--------------------------------------------------------------------------------
1 | Name	Chr	Positionexm2014254	2	242756160exm2269245	2	242918203exm285145	2	242743532exm285162	2	242755734exm285169	2	242755776exm285173	2	242755877exm285174	2	242755880exm285179	2	242756152exm285195	2	242756338exm285206	2	242757460exm285208	2	242757466exm285210	2	242757494exm285218	2	242757680exm285235	2	242758176exm285237	2	242758187exm285245	2	242758210exm285249	2	242758301exm285263	2	242793287exm285264	2	242793290exm285267	2	242793362exm285272	2	242793433exm285283	2	242794356exm285293	2	242794796exm285299	2	242794902exm285320	2	242811915exm285321	2	242811935exm285323	2	242811961exm285334	2	242813886exm285359	2	242814360exm285376	2	242814536exm285384	2	242814639exm285386	2	242814702exm285387	2	242814705exm285391	2	242814758exm285404	2	242814983exm285409	2	242815059exm285418	2	242815157exm285422	2	242815175kgp14294579	2	242937311kgp14399551	2	243017485rs12468297	2	242996474rs12469535	2	243044147rs12472007	2	242932909rs12620346	2	242962791rs12987998	2	242917734rs13390284	2	242919764rs13431327	2	242971871rs13432136	2	242929164rs28528975	2	242750743rs35399295	2	242795942rs3892357	2	242763542rs3934981	2	242926381rs3934982	2	242926558rs4072221	2	242809415rs4973649	2	243034519rs4973686	2	243020723rs6605267	2	242824974rs6712567	2	242929233rs6737774	2	242918157rs6737791	2	242918203rs6740738	2	243007368rs7421861	2	242795350rs7423746	2	242937388rs7573042	2	242996589rs7587805	2	242942878


--------------------------------------------------------------------------------
/01_initial_call/run_PennCNV/step.4.combine.PennCNV.res.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | ## The script was used to run PennCNV on Minerva high performance cluster.
 4 | ## You need to modifiy it according to the system you are using if you would like to use it.
 5 | ## Please refer to original PennCNV documents (http://penncnv.openbioinformatics.org/en/latest/) for more information
 6 | 
 7 | use strict;
 8 | use Getopt::Long;
 9 | 
10 | my $in_dir="";   ## input directory
11 | my $out_dir="";  ## output directory
12 | 
13 | GetOptions("in_dir=s" => \$in_dir,
14 | 	   "out_dir=s" => \$out_dir);
15 | 
16 | my $out_file=$out_dir."/"."CNV.PennCNV.rawcnv";
17 | my $out_log=$out_dir."/"."CNV.PennCNV.log";
18 | 
19 | opendir(DIR, $in_dir) or die "cannot open $in_dir: $!"; ## 2018-12-10
20 | open(OUT1, ">", $out_file) or die $!;
21 | open(OUT2, ">", $out_log) or die $!;
22 | 
23 | while (defined(my $folder = readdir(DIR))) {
24 | 	
25 | 	next if ($folder=~/^\./);
26 | 
27 | 	my $filename=$folder.".rawcnv";
28 | 	my $logname=$folder.".log";
29 | 	my $file=$in_dir."/".$folder."/".$filename;
30 | 	my $logfile=$in_dir."/".$folder."/".$logname;
31 | 	## print "$file\n";
32 | 	open(IN1, "<$file") or die $!;
33 | 	open(IN2, "<$logfile") or die $!;
34 | 	while (my $line=<IN1>) {
35 | 		print OUT1 $line;
36 | 	}
37 | 
38 | 	while (my $line=<IN2>) {
39 | 		print OUT2 $line;
40 | 	}
41 | 
42 | 	close IN1;
43 | 	close IN2;
44 | }
45 | 
46 | close OUT1;
47 | close OUT2;
48 | 
49 | print "Analysis completed!\n";
50 | 


--------------------------------------------------------------------------------
/05_boundary_refinement/refine.cpp:
--------------------------------------------------------------------------------
 1 | #include <RcppArmadillo.h>
 2 | 
 3 | //[[Rcpp::depends(RcppArmadillo)]]
 4 | 
 5 | using namespace Rcpp;
 6 | 
 7 | //[[Rcpp::export]]
 8 | List refine_step1(arma::mat Yt, int min_len = 5) {
 9 |   
10 |   int n = Yt.n_cols;
11 |   arma::mat stat1(n, n); stat1.zeros();
12 |   double max_value = - arma::datum::inf;
13 |   int max_l = 0; int max_r = 0;
14 |   
15 |   arma::mat Mcorr = arma::cor(Yt);
16 |   double sumS = arma::accu(arma::trimatl(Mcorr)) - n;
17 |   int n2 = (1+n)*n/2 - n;
18 |   double tmp = 0;
19 |   for (int i=0; i<=(n-1); i++) {
20 |     int k = i+min_len-1;
21 |     for (int j=k; j<=(n-1); j++) {
22 |       
23 |       arma::mat M = Mcorr.submat(i, i, j, j);
24 |       arma::mat x = arma::trimatl(M);
25 |       int xcol = x.n_cols;
26 |       int n1 = (1+xcol)*xcol/2 - xcol;
27 |       double xsum = arma::accu(x) - xcol;
28 |       double xmean = xsum/n1;
29 |       
30 |       if (n1 == n2){
31 |         tmp = xmean/sqrt(1/double(n1));
32 |       } else {
33 |         
34 |         tmp = (xmean - (sumS - xsum)/double(n2-n1))/sqrt(1/double(n1)+1/double(n2-n1));
35 |       }
36 |       stat1(i,j) = tmp;
37 |       
38 |       if (tmp > max_value) {
39 |         max_value = tmp;
40 |         max_l = i+1;
41 |         max_r = j+1;
42 |       }        
43 |     }
44 |     
45 |   }
46 |   
47 |   return List::create(
48 |     _["max.value"] = max_value,
49 |     _["max.l"] = max_l,
50 |     _["max.r"] = max_r 
51 |     );
52 | }
53 | 
54 | 
55 | 
56 | 


--------------------------------------------------------------------------------
/example/example_CNV_genotype/README.md:
--------------------------------------------------------------------------------
 1 | ## Example: CNV genotyping
 2 | 
 3 | Here is a demo of the main script `CNV.genotype.one.chr.one.batch.R` for [CNV genotyping](https://github.com/HaoKeLab/ensembleCNV#4-cnv-genotyping-for-each-cnvr) using example data from one CNVR.
 4 | 
 5 | Please specify where the git clone of ensembleCNV is located.
 6 | ```sh
 7 | ENSEMBLECNV=</path/to/ensembleCNV>
 8 | ```
 9 | 
10 | Then run the following code for a demo of the main script `CNV.genotype.one.chr.one.batch.R` for CNV genotyping.
11 | ```sh
12 | Rscript ${ENSEMBLECNV}/04_CNV_genotype/CNV.genotype.one.chr.one.batch.R \
13 | --chr 1 \
14 | --batch 2 \
15 | --type 0 \
16 | --sourcefile ${ENSEMBLECNV}/04_CNV_genotype/scripts/ \
17 | --datapath ${ENSEMBLECNV}/example/example_CNV_genotype/data \
18 | --matrixpath ${ENSEMBLECNV}/example/example_CNV_genotype/RDS \
19 | --resultpath ${ENSEMBLECNV}/example/example_CNV_genotype/results \
20 | --duplicates \
21 | --plot
22 | ```
23 | 
24 | Note: When the analysis is successfully completed, in the directory `${ENSEMBLECNV}/example/example_CNV_genotype/results`, you will find a directory structure and outputs similar to those of a real project. In particular,
25 | 
26 | - in the subfolders of the `pred` folder, you will find `*_pred.rds`, each corresponding to the CN genotype and GQ score for a CNVR. They are stored in `.rds` format in order to save space and improve I/O time.
27 | 
28 | - in the subfolders of the `png` folder, you will find different diagnostic plots for each CNVR.
29 | 


--------------------------------------------------------------------------------
/example/example_boundary_refinement/README.md:
--------------------------------------------------------------------------------
 1 | ## Example: Boundary refinement
 2 | 
 3 | Here is a demo of the main script `CNVR.boundary.refinement.R` for [boundary refinement](https://github.com/HaoKeLab/ensembleCNV#5-boundary-refinement) using example data from one CNVR.
 4 | 
 5 | Please specify where the git clone of ensembleCNV is located.
 6 | ```sh
 7 | ENSEMBLECNV=</path/to/ensembleCNV>
 8 | ```
 9 | 
10 | Then run the following code for a demo of the main script `CNVR.boundary.refinement.R` for boundary refinement.
11 | ```sh
12 | Rscript ${ENSEMBLECNV}/05_boundary_refinement/CNVR.boundary.refinement.R \
13 | --chr 2 \
14 | --rcppfile ${ENSEMBLECNV}/05_boundary_refinement/refine.cpp \
15 | --datapath ${ENSEMBLECNV}/example/example_boundary_refinement/data \
16 | --matrixpath ${ENSEMBLECNV}/example/example_boundary_refinement/RDS \
17 | --centromere ${ENSEMBLECNV}/example/example_boundary_refinement/data/centromere_hg19.txt \
18 | --resultpath ${ENSEMBLECNV}/example/example_boundary_refinement/results \
19 | --plot
20 | ```
21 | 
22 | Note: 
23 | 
24 | - When the analysis is successfully completed, the output will be stored at the directory `${ENSEMBLECNV}/example/example_boundary_refinement/results/res_refine`.
25 | 
26 | - In practice, the list of common CNVRs in `cnvr_refine.txt`, whose boundaries are to be refined, is selected by the step `${ENSEMBLECNV}/05_boundary_refinement/step.1.common.CNVR.to.refine.R` based on the frequency cut-off specified by the user, before boundary refinement is actually performed (see step (1) of [boundary refinement](https://github.com/HaoKeLab/ensembleCNV#5-boundary-refinement)). Therefore, `cnvr_refine.txt` is expected to be in the directory `${ENSEMBLECNV}/example/example_boundary_refinement/results` (instead of the `data` folder) as input for the subsequent `CNVR.boundary.refinement.R`. 
27 | 


--------------------------------------------------------------------------------
/create_new_project.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ## create new project
 4 | wkdir=$1
 5 | 
 6 | ## create working directory
 7 | mkdir -p $wkdir
 8 | 
 9 | ## data: final report, sample table, centromere position, and duplicate pairs [optional]
10 | ## put in this directory
11 | mkdir -p ${wkdir}/data
12 | 
13 | ## 01_initial_call
14 | cp -ru ./01_initial_call $wkdir
15 | mkdir -p ${wkdir}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/RDS
16 | 
17 | mkdir -p ${wkdir}/01_initial_call/run_iPattern/data
18 | mkdir -p ${wkdir}/01_initial_call/run_iPattern/data_aux
19 | mkdir -p ${wkdir}/01_initial_call/run_iPattern/results
20 | 
21 | mkdir -p ${wkdir}/01_initial_call/run_PennCNV/data
22 | mkdir -p ${wkdir}/01_initial_call/run_PennCNV/data_aux
23 | mkdir -p ${wkdir}/01_initial_call/run_PennCNV/results
24 | 
25 | mkdir -p ${wkdir}/01_initial_call/run_QuantiSNP/data
26 | mkdir -p ${wkdir}/01_initial_call/run_QuantiSNP/results
27 | mkdir -p ${wkdir}/01_initial_call/run_QuantiSNP/results/res
28 | 
29 | ## 02_batch_effect
30 | cp -ru ./02_batch_effect $wkdir
31 | 
32 | ## 03_create_CNVR      
33 | cp -ru ./03_create_CNVR $wkdir
34 | 
35 | ## 04_CNV_genotype
36 | cp -ru ./04_CNV_genotype $wkdir
37 | mkdir -p ${wkdir}/04_CNV_genotype/data
38 | mkdir -p ${wkdir}/04_CNV_genotype/results
39 | 
40 | ## 05_boundary_refinement 
41 | cp -ru ./05_boundary_refinement $wkdir
42 | mkdir -p ${wkdir}/05_boundary_refinement/data
43 | mkdir -p ${wkdir}/05_boundary_refinement/results
44 | 
45 | ## 05a_regenotype_after_refinement
46 | mkdir -p ${wkdir}/05a_regenotype_after_refinement
47 | mkdir -p ${wkdir}/05a_regenotype_after_refinement/data
48 | mkdir -p ${wkdir}/05a_regenotype_after_refinement/results
49 | 
50 | ## 06_performance_assessment
51 | cp -ru ./06_performance_assessment $wkdir
52 | 
53 | echo "New project directory has been created at: $wkdir"
54 | echo "Please put (or create symbolic link to) input data in the directory: $wkdir/data"
55 | 
56 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/scripts/fun_gatk.R:
--------------------------------------------------------------------------------
 1 | 
 2 | output_gatk_result_LRR <- function(dt_LRRBAF) {
 3 | 
 4 |   dt_LRRBAF$LRRBAF0 <- -10*log(dt_LRRBAF$LRR0)/log(10)
 5 |   dt_LRRBAF$LRRBAF1 <- -10*log(dt_LRRBAF$LRR1)/log(10)  ## save V1
 6 |   dt_LRRBAF$LRRBAF2 <- -10*log(dt_LRRBAF$LRR2)/log(10)
 7 |   dt_LRRBAF$LRRBAF3 <- -10*log(dt_LRRBAF$LRR3)/log(10)
 8 |   
 9 |   ## must be deal with
10 |   
11 |   dt_sub <- dt_LRRBAF[, c("LRRBAF0", "LRRBAF1", "LRRBAF2", "LRRBAF3")]
12 |   
13 |   value_GQs <- unlist(lapply(1:nrow(dt_sub), FUN = function(k) {
14 |     v1 <- unlist(dt_sub[k, ])
15 |     v1 <- sort(v1)
16 |     gq1 <- v1[2] - v1[1]
17 |     gq1
18 |   }))
19 |   
20 |   # mean(GQs)
21 |   dt_LRRBAF$value_GQ <- value_GQs
22 |   
23 |   CN_gatk_preds <- unlist(lapply(1:nrow(dt_sub), FUN = function(k) {
24 |     v1 <- unlist(dt_sub[k, ])
25 |     idx1 <- which.min(v1)
26 |     return(idx1 - 1)
27 |   }))
28 |   
29 |   dt_LRRBAF$CN_gatk_pred <- CN_gatk_preds
30 |   
31 |   ## add BAF0 1 2 3 column
32 |   dt_LRRBAF$BAF0 = NA
33 |   dt_LRRBAF$BAF1 = NA
34 |   dt_LRRBAF$BAF2 = NA
35 |   dt_LRRBAF$BAF3 = NA
36 |   
37 |   dt_LRRBAF # return result
38 | }
39 | 
40 | # calculate gatk result
41 | output_gatk_result <- function(dt_LRRBAF) {
42 |   
43 |   dt_LRRBAF$LRRBAF0 <- -10*log(dt_LRRBAF$LRR0*dt_LRRBAF$BAF0)/log(10)
44 |   dt_LRRBAF$LRRBAF1 <- -10*log(dt_LRRBAF$LRR1*dt_LRRBAF$BAF1)/log(10)  ## save V1
45 |   dt_LRRBAF$LRRBAF2 <- -10*log(dt_LRRBAF$LRR2*dt_LRRBAF$BAF2)/log(10)
46 |   dt_LRRBAF$LRRBAF3 <- -10*log(dt_LRRBAF$LRR3*dt_LRRBAF$BAF3)/log(10)
47 |   
48 |   dt_sub <- dt_LRRBAF[, c("LRRBAF0", "LRRBAF1", "LRRBAF2", "LRRBAF3")]
49 |   
50 |   value_GQs <- unlist(lapply(1:nrow(dt_sub), FUN = function(k) {
51 |     v1 <- unlist(dt_sub[k, ])
52 |     v1 <- sort(v1)
53 |     gq1 <- v1[2] - v1[1]
54 |     gq1
55 |   }))
56 |   
57 |   # mean(GQs)
58 |   dt_LRRBAF$value_GQ <- value_GQs
59 |   
60 |   CN_gatk_preds <- unlist(lapply(1:nrow(dt_sub), FUN = function(k) {
61 |     v1 <- unlist(dt_sub[k, ])
62 |     idx1 <- which.min(v1)
63 |     return(idx1 - 1)
64 |   }))
65 |   
66 |   dt_LRRBAF$CN_gatk_pred <- CN_gatk_preds
67 |   
68 |   dt_LRRBAF # return result
69 | }
70 | 


--------------------------------------------------------------------------------
/02_batch_effect/PCA_on_summary_stats/step.2.stats.PCA.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscipt
 2 | 
 3 | args <- commandArgs( trailingOnly = TRUE )
 4 | wk_dir  <- args[1] ## path to IPQ.stats.txt generated in step (1)
 5 | 
 6 | suppressMessages({
 7 |   require(ggplot2)
 8 |   require(cowplot)
 9 | })
10 | 
11 | # PCA --------------------------------------------------------------------
12 | 
13 | dat <- read.delim(file = file.path(wk_dir, "IPQ.stats.txt"), as.is = TRUE)
14 | 
15 | idx1 <- which( names(dat) == "Sample_ID" )
16 | dat_pca <- dat[, -idx1]
17 | mat <- as.matrix(dat_pca)
18 | rownames(mat) <- dat$Sample_ID
19 | 
20 | PCA <- prcomp(mat, scale. = TRUE)
21 | PC  <- predict(PCA)
22 | 
23 | PC  <- data.frame(Sample_ID = rownames(PC), 
24 |                   PC, 
25 |                   stringsAsFactors = FALSE)
26 | 
27 | write.table(PC, file = file.path(wk_dir, "IPQ_stats_PCA_res.txt"),
28 |             quote = F, row.names = F, sep = "\t")
29 | 
30 | p12 <- ggplot() + 
31 |   geom_point(data = PC, aes(PC1, PC2), shape = 1, size = 3) + 
32 |   xlab("PC1") + 
33 |   ylab("PC2") + 
34 |   theme_bw(base_size = 9)+
35 |   theme(axis.text = element_text(size = 15),
36 |         axis.title = element_text(size = 15),
37 |         plot.title = element_text(size = 20, hjust = 0.5)) + 
38 |   ggtitle("PC2 ~ PC1")
39 | 
40 | p13 <- ggplot() + 
41 |   geom_point(data = PC, aes(PC1, PC3), shape = 1, size = 3) + 
42 |   xlab("PC1") + 
43 |   ylab("PC3") + 
44 |   theme_bw(base_size = 9)+
45 |   theme(axis.text = element_text(size = 15),
46 |         axis.title = element_text(size = 15),
47 |         plot.title = element_text(size = 20, hjust = 0.5)) + 
48 |   ggtitle("PC3 ~ PC1")
49 | 
50 | p23 <- ggplot() + 
51 |   geom_point(data = PC, aes(PC2, PC3), shape = 1, size = 3) + 
52 |   xlab("PC2") + 
53 |   ylab("PC3") + 
54 |   theme_bw(base_size = 9)+
55 |   theme(axis.text = element_text(size = 15),
56 |         axis.title = element_text(size = 15),
57 |         plot.title = element_text(size = 20, hjust = 0.5)) + 
58 |   ggtitle("PC3 ~ PC2")
59 | 
60 | png(filename = file.path(wk_dir, "IPQ_stats_PCA_plots.png"),
61 |     width = 12, height = 12, units = "in", res = 512)
62 | p <- plot_grid(p12, p13, p23, nrow = 2, labels = LETTERS[1:3])
63 | print(p)
64 | dev.off()
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/scripts/fun_plot_diagnosis.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | library(gridExtra)
 4 | 
 5 | plot_model <- function(paras, dt_cnvr, title) {
 6 |   
 7 |   mu1 <- paras$mu[1]
 8 |   sigma1 <- paras$sigma[1]
 9 |   lambda1 <- paras$lambda[1]
10 |   
11 |   mu2 <- paras$mu[2]
12 |   sigma2 <- paras$sigma[2]
13 |   lambda2 <- paras$lambda[2]
14 |   
15 |   mu3 <- paras$mu[3]
16 |   sigma3 <- paras$sigma[3]
17 |   lambda3 <- paras$lambda[3]
18 |   
19 |   
20 |   x <- dt_cnvr$LRR_median
21 |   range_x <- range(x)
22 |   
23 |   xs <- seq(range_x[1], range_x[2], length.out = 800)
24 |   dt <- data.frame(x = xs, stringsAsFactors = F)
25 |   
26 |   dt1 <- data.frame(x = xs, d = lambda1*dnorm(xs, mean = mu1, sd = sigma1), CN = 1)
27 |   
28 |   dt3 <- data.frame(x = xs, d = lambda3*dnorm(xs, mean = mu3, sd = sigma3), CN = 3)
29 | 
30 |   dt2 <- data.frame(x = xs, d = lambda2*dnorm(xs, mean = mu2, sd = sigma2), CN = 2)
31 |   dt123 <- rbind(dt1, dt2, dt3)
32 |   dt123$CN <- as.factor(dt123$CN)
33 |   
34 |   
35 |   p <- ggplot(data = dt_cnvr, aes(LRR_median, y = ..density..)) +
36 |     geom_histogram(bins = 50, fill = NA, color = "black") + 
37 |     geom_line(data = dt123, aes(x, d, col = CN), lwd = 1.5) + 
38 |     theme_bw(base_size = 10) + 
39 |     labs(title = title,
40 |          subtitle = paste("mu1:", round(mu1, 2), "mu2:", round(mu2, 2), "mu3:", round(mu3, 2), "\n",
41 |                           "sd1:", round(sigma1, 2), "sd2:", round(sigma2, 2), "sd3:", round(sigma3, 2)))
42 |   p
43 | }
44 | 
# Draw a 2 x 2 diagnostic panel: initial and fitted parameters of stage 1
# (top row) and of stage 2 (bottom row), each rendered by plot_model().
#
# Args:
#   dt_cnvr:     data passed unchanged to plot_model().
#   paras_model: list with elements stage1 and stage2, each holding the
#                parameter sets `init` and `model`.
#
# Returns:
#   The object returned by gridExtra::grid.arrange().
plot_gmm_diagnosis <- function(dt_cnvr, paras_model) {

  stage1 <- paras_model$stage1
  stage2 <- paras_model$stage2

  panel_specs <- list(
    list(paras = stage1$init,  title = "stage1 init"),
    list(paras = stage1$model, title = "stage1 model"),
    list(paras = stage2$init,  title = "stage2 init"),
    list(paras = stage2$model, title = "stage2 model")
  )

  panels <- lapply(panel_specs, function(spec) {
    plot_model(paras = spec$paras, dt_cnvr = dt_cnvr, title = spec$title)
  })

  arranged <- gridExtra::grid.arrange(panels[[1]], panels[[2]],
                                      panels[[3]], panels[[4]], nrow = 2)
  return(arranged)
}


--------------------------------------------------------------------------------
/example/example_create_CNVR/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Example: Create CNVR
 3 | 
 4 | Here is a demo of [creating CNVR](https://github.com/HaoKeLab/ensembleCNV#3-create-cnvr) using example data of CNVs clumping around one CNVR.
 5 | 
 6 | Please specify where the git clone of ensembleCNV is located.
 7 | ```sh
 8 | ENSEMBLECNV=</path/to/ensembleCNV>
 9 | ```
10 | 
11 | Step 1: reformat the CNV calls generated from individual CNV callers: iPattern, PennCNV and QuantiSNP
12 | ```sh
13 | Rscript ${ENSEMBLECNV}/03_create_CNVR/step.1.CNV.data.R \
14 | ${ENSEMBLECNV}/example/example_create_CNVR/results/ \
15 | ${ENSEMBLECNV}/example/example_create_CNVR/data/iPattern_all_calls.txt \
16 | ${ENSEMBLECNV}/example/example_create_CNVR/data/CNV.PennCNV_new.txt \
17 | ${ENSEMBLECNV}/example/example_create_CNVR/data/quantisnp.cnv \
18 | ${ENSEMBLECNV}/example/example_create_CNVR/data/Samples_Table.txt
19 | ```
20 | Note:
21 | 
22 | - `iPattern_all_calls.txt`, `CNV.PennCNV_new.txt`, and `quantisnp.cnv` are examples of what raw CNV calls generated from iPattern, PennCNV and QuantiSNP look like.
23 | 
24 | - We do not include `Gender` column in `Samples_Table.txt` as gender information is not relevant for creating CNVR in this example.
25 | 
26 | - When this step is successfully completed, you will find in `${ENSEMBLECNV}/example/example_create_CNVR/results/` directory `cnv.ipattern.txt`, `cnv.penncnv.txt`, and `cnv.quantisnp.txt`, which are reformatted CNV calls from the 3 CNV callers.
27 | 
28 | Step 2: create CNVR
29 | ```sh
30 | Rscript ${ENSEMBLECNV}/03_create_CNVR/step.2.create.CNVR.R \
31 | --icnv ${ENSEMBLECNV}/example/example_create_CNVR/results/cnv.ipattern.txt \
32 | --pcnv ${ENSEMBLECNV}/example/example_create_CNVR/results/cnv.penncnv.txt \
33 | --qcnv ${ENSEMBLECNV}/example/example_create_CNVR/results/cnv.quantisnp.txt \
34 | --snp ${ENSEMBLECNV}/example/example_create_CNVR/data/SNP_pos.txt \
35 | --centromere ${ENSEMBLECNV}/example/example_create_CNVR/data/centromere_hg19.txt \
36 | --output ${ENSEMBLECNV}/example/example_create_CNVR/results/
37 | ```
38 | 
39 | Note: When this step is successfully completed, you will find some intermediate outputs and the final results in the directory `${ENSEMBLECNV}/example/example_create_CNVR/results/`, including
40 | - `cnv_clean.txt`: the table of merged CNV events from iPattern, PennCNV and QuantiSNP; the `CNVR_ID` in the table indicates which CNVR each CNV belongs to.
41 | - `cnvr_clean.txt`: the table of constructed CNVRs with each assigned a `CNVR_ID`.
42 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/step.1.split.cnvrs.into.batches.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript --vanilla
 2 | 
 3 | ## Split the list of created CNVRs in each chromosome into batches, such that
 4 | ## the CNVRs can be processed in parallel. 
 5 | 
 6 | suppressMessages(require(optparse))
 7 | 
 8 | option_list = list(
 9 |   make_option(c("-i", "--input"), action = "store", type = "character", default = NA,
10 |               help = "CNVR dataset input"),
11 |   make_option(c("-o", "--output"), action = "store", type = "character", default = NA,
12 |               help = "CNVR dataset output"),
13 |   make_option(c("-c", "--cnv"), action = "store", type = "character", default = NA,
14 |               help = "CNV after cleaning dataset"),
15 |   make_option(c("-n", "--num"), action = "store", type = "integer", default = 200,
16 |               help = "number of CNVRs in each batch")
17 | )
18 | 
19 | opt = parse_args( OptionParser(option_list = option_list) )
20 | 
21 | pars = c(opt$input, opt$output, opt$num) ##opt$cnv
22 | if ( any(is.na(pars)) ) {
23 |   stop("All parameter must be supplied.(--help for detail)")
24 | }
25 | 
26 | 
27 | # main  -------------------------------------------------------------------
28 | 
29 | dt_cnvr = read.delim( file = opt$input, as.is = TRUE )
30 | n_cnvr  = nrow(dt_cnvr)
31 | 
32 | dt_cnvr = dt_cnvr[order(dt_cnvr$chr, dt_cnvr$posStart, dt_cnvr$posEnd), ]
33 | 
34 | cat('total cnvr number:', n_cnvr, "\n")
35 | 
36 | number_each_batch = as.integer( opt$num )  ## 200 default
37 | 
38 | # add raw Freq information (This has been done in create CNVR step?)
39 | # dt_cnv = readRDS(file = opt$cnv)
40 | # nrow(dt_cnv)
41 | # tbl <- table(dt_cnv$CNVR_ID)
42 | # freqs <- as.vector(tbl)
43 | # dt_freq <- data.frame(CNVR_ID = names(tbl), Freq = freqs, stringsAsFactors = FALSE)
44 | 
45 | # dt_cnvr <- merge(dt_cnvr, dt_freq, by = "CNVR_ID")
46 | # stopifnot( nrow(dt_cnvr) == n_cnvr)
47 | 
48 | # split batches in each chr
49 | chrs <- sort(unique(dt_cnvr$chr))
50 | 
51 | dt_cnvr_new <- data.frame()
52 | for (chr1 in chrs) {
53 |   
54 |   dt_cnvr1 <- subset(dt_cnvr, chr == chr1)
55 |   idxs_batch <- 1:nrow(dt_cnvr1)
56 |   
57 |   n1 <- nrow(dt_cnvr1)
58 |   n2 <- ceiling(n1/number_each_batch) 
59 |   
60 |   cat("chr:", chr1, "number of cnvrs:", n1, "\n")
61 |   if (n2 == 1) {
62 |     
63 |     dt_cnvr1$batch <- 1
64 |     dt_cnvr_new <- rbind(dt_cnvr_new, dt_cnvr1)
65 |     
66 |   } else {
67 |     
68 |     cuts <- cut(idxs_batch, breaks = n2, include.lowest = TRUE)
69 |     cuts_index <- as.integer(cuts)
70 |     dt_cnvr1$batch <- cuts_index
71 |     
72 |     dt_cnvr_new <- rbind(dt_cnvr_new, dt_cnvr1)
73 |   }
74 |   
75 | }
76 | 
77 | write.table(dt_cnvr_new, 
78 |             file = opt$output,
79 |             quote = F, row.names = F, sep = "\t")
80 | 
81 | 
82 | 


--------------------------------------------------------------------------------
/02_batch_effect/PCA_on_LRR/step.3.LRR.pca.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | ## PCA
 4 | args <- commandArgs( trailingOnly = TRUE )
 5 | wk_dir <- args[1]            ## working directory where the LRR matrix is located for PCA
 6 | filename_matrix <- args[2]   ## the LRR matrix generated in step 2
 7 | 
 8 | suppressMessages({
 9 |   require(data.table)
10 |   require(tibble)
11 |   require(cowplot)
12 |   require(ggplot2)
13 | })
14 | 
15 | 
16 |   dat_LRR <- fread(input = file.path( wk_dir, filename_matrix) )  
17 |   dat_LRR <- as.data.frame(dat_LRR, stringsAsFactors = FALSE)
18 |   dat_LRR <- column_to_rownames(dat_LRR, var = "V1")
19 |   
20 |   sampleID <- rownames( dat_LRR )
21 |   
22 |   ## deal with NA values in matrix
23 |   mat <- as.matrix(dat_LRR)
24 |   rownames(mat) <- sampleID
25 |   colnames(mat) <- NULL
26 |   
27 |   col_mean <- colMeans(mat, na.rm = TRUE)
28 |   for (i in 1:nrow(mat)) {
29 |     v1 <- as.vector(mat[i, ])
30 |     idx1 <- which(is.na(v1))
31 |     if (length(idx1) >= 1) {
32 |       mat[i, idx1] <- col_mean[idx1]
33 |     }
34 |   }
35 |   
36 |   ## check which SNPs with all values being NA
37 |   idxs.na.snps <- which( is.na(col_mean) )
38 |   if (length(idxs.na.snps)>0) mat <- mat[, -idxs.na.snps] ##***
39 |   
40 |   dat.pca <- as.data.frame( mat )
41 |   rownames(dat.pca) <- sampleID
42 |   
43 |   PCA <- prcomp(dat.pca)
44 |   PC  <- predict(PCA)
45 |   PC  <- data.frame(Sample_ID = rownames(PC), 
46 |                     PC[, c("PC1", "PC2", "PC3")], 
47 |                     stringsAsFactors = FALSE)
48 |   
49 |   write.table(PC, file = file.path(wk_dir, "LRR_PCA_res.txt"),
50 |               quote = F, row.names = F, sep = "\t")
51 |   
52 |   ## plot PCA results
53 |   p12 <- ggplot(data = PC, aes(PC1, PC2)) + 
54 |     geom_point(size = 1) + 
55 |     theme_bw() +
56 |     theme(plot.title = element_text(size = 20, hjust = 0.5),
57 |           axis.title = element_text(size = 15, face = "bold"),
58 |           axis.text = element_text(size = 15, face = "bold")) + 
59 |     ggtitle("PC2 ~ PC1")
60 |   
61 |   p13 <- ggplot(data = PC, aes(PC1, PC3)) + 
62 |     geom_point(size = 1) + 
63 |     theme_bw() +
64 |     theme(plot.title = element_text(size = 20, hjust = 0.5),
65 |           axis.title = element_text(size = 15, face = "bold"),
66 |           axis.text = element_text(size = 15, face = "bold")) + 
67 |     ggtitle("PC3 ~ PC1")
68 |   
69 |   p23 <- ggplot(data = PC, aes(PC2, PC3)) + 
70 |     geom_point(size = 1) + 
71 |     theme_bw() +
72 |     theme(plot.title = element_text(size = 20, hjust = 0.5),
73 |           axis.title = element_text(size = 15, face = "bold"),
74 |           axis.text = element_text(size = 15, face = "bold")) + 
75 |     ggtitle("PC3 ~ PC2")
76 |   
77 |   png(filename = file.path(wk_dir, "LRR_PCA_plots.png"),
78 |       width = 12, height = 12, units = "in", res = 512)  
79 |     p <- plot_grid(p12, p13, p23, nrow = 2)
80 |     print(p)
81 |   dev.off()
82 | 
83 | 
84 | 


--------------------------------------------------------------------------------
/01_initial_call/run_iPattern/prepare_input_files_for_iPattern.R:
--------------------------------------------------------------------------------
 1 | #!/urs/bin/env Rscript
 2 | 
 3 | ## The script was used to prepare auxiliary input files for iPattern
 4 | ## the auxiliary input files will be stored at ${WKDIR}/01_initial_call/run_iPattern/data_aux
 5 | 
 6 | args <- commandArgs( trailingOnly = TRUE )
 7 | 
 8 | ## working directory
 9 | path_wkdir <- args[1]
10 | 
11 | ## project name for running iPattern
12 | project_name <- args[2]
13 | 
14 | ## path to the directory for running iPattern
15 | path_run_ipattern <- file.path(path_wkdir, "01_initial_call", "run_iPattern")
16 | 
17 | ##--------------------------------------------------------------------------------
18 | ## 1) data_file: list of splitted final report files for each sample
19 | ## the directory contains the input files prepared by finalreport_to_iPattern.pl
20 | path_ipattern_prepare_data <- file.path(path_run_ipattern, "data")
21 | fls_all <- list.files(path = path_ipattern_prepare_data, pattern = ".txt$", full.names = TRUE)
22 | 
23 | data_file <- data.frame(data_file = fls_all, stringsAsFactors = FALSE)
24 | write.table( data_file, file = file.path( path_run_ipattern, "data_aux", 
25 |                                           paste0(project_name, "_data_file.txt")),
26 |              sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
27 | 
28 | 
29 | ##--------------------------------------------------------------------------------
30 | ## 2) gener_file: tab-delimied file which lists geneder information for each sample
31 | ## the file consists of two columns, Sample_ID and Gender,
32 | ## which may be retrieved from Samples_Table.txt (see Data section of ensembleCNV README.md)
33 | ## Samples_Table.txt is supposed to be at ${WKDIR}/data
34 | ## the gender_file does NOT have column names in the header, for example
35 | # Sample_1	M
36 | # Sample_2	F
37 | # Sample_3	F
38 | 
39 | gender_file <- read.delim(file = file.path(path_wkdir, "data", "Samples_Table.txt"), as.is = TRUE)
40 | gender_file$Gender <- toupper( substr(gender_file$Gender, 1, 1) )
41 | write.table( gender_file, file = file.path( path_run_ipattern, "data_aux", 
42 |                                             paste0(project_name, "_gender_file.txt")),
43 |              sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
44 | 
45 | 
46 | ##--------------------------------------------------------------------------------
47 | ## 3) bad_samples: file lists sample IDs of poor quality to be excluded from iPattern analysis, for example
48 | # bad_sample_1
49 | # bad_sample_2
50 | # bad_sample_3
51 | 
52 | ## We prepare an empty file. The user can type in bad samples.
53 | write.table(NULL, file = file.path( path_run_ipattern, "data_aux",
54 |                                     paste0(project_name, "_bad_samples.txt")),
55 |             sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
56 | 
57 | cat("Processing is completed.\n")
58 | cat("Three files are generated:\n")
59 | cat(file.path( path_run_ipattern, "data_aux", paste0(project_name, "_data_file.txt")), "\n")
60 | cat(file.path( path_run_ipattern, "data_aux", paste0(project_name, "_gender_file.txt")), "\n")
61 | cat(file.path( path_run_ipattern, "data_aux", paste0(project_name, "_bad_samples.txt")), "\n")
62 | 
63 | 


--------------------------------------------------------------------------------
/05_boundary_refinement/step.4.update.genotype.matrix.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressMessages( library(optparse) )
 4 | # before running this script 
 5 | # you need to regenotype CNVRs in cnvr_regenotype_after_refine.txt
 6 | # which are generated from step3.clean.results.R
 7 | 
 8 | option_list <- list(
 9 |   make_option(c("-b", "--matrixbeforerefine"), action = "store", type = "character", default = NA,
10 |               help = "Path to CN and GQ matrices generated in first round of 
11 |                       CNV genotyping step before boundary refinement."),
12 |   make_option(c("-f", "--matrixrefine"), action = "store", type = "character", default = NA, 
13 |               help = "Path to CN and GQ matrices generated in CNV regenotyping for 
14 |                       CNVRs with updated boundaries after refinement as well as CNVR information."),
15 |   make_option(c("-p", "--refinepath"), action = "store", type = "character", default = NA,
16 |               help = "Path to cnvr_kept_after_refine.txt."),
17 |   make_option(c("-o", "--output"), action = "store", type = "character", default = NA,
18 |               help = "Path to the directory for saving final CN and GQ matrices")
19 | )
20 | 
21 | opt <- parse_args(OptionParser(option_list = option_list))
22 | pars <- c(opt$matrixbeforerefine, opt$matrixrefine, 
23 |           opt$refinepath, opt$output)
24 | if ( any(is.na(pars)) ) {
25 |   stop("All parameters must be supplied. (--help for detail)")
26 | }
27 | 
28 | path_matrix_before_refine <- opt$matrixbeforerefine
29 | path_matrix_refine <- opt$matrixrefine
30 | path_refine <- opt$refinepath
31 | path_output <- opt$output
32 | 
33 | mat_CN_before_refine <- readRDS( file = file.path(path_matrix_before_refine, "matrix_CN.rds"))
34 | mat_GQ_before_refine <- readRDS( file = file.path(path_matrix_before_refine, "matrix_GQ.rds"))
35 | 
36 | cnvrs   <- rownames( mat_CN_before_refine )
37 | samples <- colnames( mat_CN_before_refine )
38 | 
39 | # keep cnvrs after refinement
40 | dat_cnvr_keep <- read.delim( file = file.path(path_refine, "cnvr_kept_after_refine.txt"), as.is = TRUE)
41 | 
42 | mat_CN_keep <- mat_CN_before_refine[dat_cnvr_keep$CNVR_ID, ]
43 | mat_GQ_keep <- mat_GQ_before_refine[dat_cnvr_keep$CNVR_ID, ]
44 | 
45 | # regenotyped CNVRs with updated boundaries ----------------------------------------------
46 | dat_cnvr_refine <- read.delim( file = file.path(path_matrix_refine, "cnvr_genotype.txt"), as.is = TRUE)
47 | 
48 | mat_CN_refine <- readRDS( file = file.path(path_matrix_refine, "matrix_CN.rds"))
49 | mat_GQ_refine <- readRDS( file = file.path(path_matrix_refine, "matrix_GQ.rds"))
50 | 
51 | samples_refine <- colnames( mat_CN_refine )
52 | stopifnot( sum(samples_refine %in% samples) == length(samples))
53 | 
54 | mat_CN_refine <- mat_CN_refine[, samples]
55 | mat_GQ_refine <- mat_GQ_refine[, samples]
56 | 
57 | ## final results
58 | mat_CN_final <- rbind( mat_CN_keep, mat_CN_refine )
59 | mat_GQ_final <- rbind( mat_GQ_keep, mat_GQ_refine )
60 | 
61 | common.cols <- intersect(names(dat_cnvr_keep), names(dat_cnvr_refine))
62 | common.cols <- setdiff(common.cols, c("batch", "genotype", "identicalID"))
63 | dat_cnvr <- rbind(dat_cnvr_keep[, common.cols], dat_cnvr_refine[, common.cols])
64 | 
65 | saveRDS( mat_CN_final, file = file.path(path_output, "matrix_CN_final.rds"))
66 | saveRDS( mat_GQ_final, file = file.path(path_output, "matrix_GQ_final.rds"))
67 | 
68 | write.table(dat_cnvr,
69 |             file = file.path(path_output, "cnvr_final.txt"),
70 |             quote = F, row.names = F, sep = "\t")
71 | 
72 | 


--------------------------------------------------------------------------------
/05_boundary_refinement/step.1.common.CNVR.to.refine.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressMessages(library(optparse))
 4 | 
 5 | option_list <- list(
 6 |   make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA,
 7 |               help = "Path to the directory containing necessary input data."),
 8 |   make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA,
 9 |               help = "Path to the directory for saving results."),
10 |   make_option(c("-c", "--freq"), action = "store", type = "double", default = NA,
11 |               help = "Frequency cut-off to select CNVRs with common CNVs for boundary refinement.")
12 | )
13 | 
14 | 
15 | opt = parse_args(OptionParser(option_list = option_list))
16 | pars = c(opt$datapath, opt$resultpath, opt$freq)
17 | 
18 | if ( any(is.na(pars)) ) {
19 |   stop("All three parameters must be supplied. (--help for detail)")
20 | }
21 | 
22 | cutoff_freq <- as.numeric( opt$freq )
23 | path_data   <- opt$datapath
24 | path_result <- opt$resultpath
25 | 
26 | path_output <- file.path( path_result ) ##"cnvr_refinement"
27 | if (!dir.exists(paths = path_output) ) dir.create(path = path_output, showWarnings = F, recursive = T)
28 | 
29 | # the copy number matrix generated from CNV genotyping step
30 | mat_CN <- readRDS( file = file.path(path_data, "matrix_CN.rds"))
31 | n.sample <- ncol( mat_CN )
32 | n.CNVR   <- nrow( mat_CN )
33 | 
34 | cnvrs <- rownames( mat_CN )
35 | 
36 | freqs_CNVR <- unlist( lapply(1:n.CNVR, FUN = function(i) {
37 |   v1 <- as.integer( mat_CN[i, ])
38 |   n1 <- sum( v1 %in% c(0, 1, 3))
39 |   n1
40 | }))
41 | 
42 | idxs.refine <- which( freqs_CNVR >= n.sample*cutoff_freq)
43 | 
44 | dat_freq <- data.frame(CNVR_ID = cnvrs, 
45 |                        Freq = freqs_CNVR,
46 |                        stringsAsFactors = F)
47 | 
48 | if (length(idxs.refine) > 0) {
49 |   cnvrs_refine <- cnvrs[ idxs.refine ]
50 |   cnvrs_keep   <- cnvrs[ -idxs.refine ]
51 | } else {
52 |   cnvrs_refine <- NULL
53 |   cnvrs_keep   <- cnvrs
54 | }
55 | 
56 | # write.table( data.frame(CNVR_ID = cnvrs_refine, stringsAsFactors = FALSE), 
57 |              # file = file.path(path_output, "cnvrs_refine.txt"),
58 |              # quote = F, row.names = F, col.names = F, sep = "\t")
59 | # write.table( data.frame(CNVR_ID = cnvrs_keep, stringsAsFactors = FALSE),
60 |              # file = file.path(path_output, "cnvrs_keep.txt"),
61 |              # quote = F, row.names = F, col.names = F, sep = "\t")
62 | 
63 | file_cnvr <- "cnvr_genotype.txt"  ## with CNV genotype information
64 | dat_cnvrs <- read.delim(file = file.path(path_data, file_cnvr), as.is = TRUE)
65 | nms <- names(dat_cnvrs)
66 | names(dat_cnvrs)[nms == "Freq"] <- "raw_Freq"
67 | dat_cnvrs <- subset(dat_cnvrs, genotype == 1)
68 | 
69 | dat_cnvrs <- merge( dat_cnvrs, dat_freq, by = "CNVR_ID", all = FALSE)
70 | dat_cnvrs <- dat_cnvrs[order(dat_cnvrs$chr, dat_cnvrs$posStart, dat_cnvrs$posEnd), ]
71 | stopifnot( nrow(dat_cnvrs) == nrow(dat_freq) )
72 | 
73 | if (length(cnvrs_refine) > 0) {
74 |   dat_cnvrs_refine <- subset( dat_cnvrs, CNVR_ID %in% cnvrs_refine )
75 |   dat_cnvrs_keep   <- subset( dat_cnvrs, CNVR_ID %in% cnvrs_keep )
76 | } else {
77 |   dat_cnvrs_refine <- data.frame(NULL)
78 |   dat_cnvrs_keep   <- dat_cnvrs
79 | }
80 | 
81 | write.table( dat_cnvrs_keep, 
82 |              file = file.path(path_output, "cnvr_keep.txt"),
83 |              quote = F, row.names = F, sep = "\t")
84 | write.table( dat_cnvrs_refine, 
85 |              file = file.path(path_output, "cnvr_refine.txt"),
86 |              quote = F, row.names = F, sep = "\t")
87 | 
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/05_boundary_refinement/step.2.submit.jobs.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system
 4 | 
 5 | suppressMessages(library(optparse))
 6 | 
 7 | option_list <- list(
 8 |   make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA,
 9 |               help = "Path to the directory containing necessary input data."),
10 |   make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA,
11 |               help = "Path to the directory for saving results."),
12 |   make_option(c("-m", "--matrixpath"), action = "store", type = "character", default = NA, 
13 |               help = "Path to chromosome-wise LRR and BAF matrices."),
14 |   make_option(c("-i", "--refinescript"), action = "store", type = "character", default = NA,
15 |               help = "Path to the main script CNVR.boundary.refinement.R."),
16 |   make_option(c("-s", "--rcppfile"), action = "store", type = "character", default = NA,
17 |               help = "Path to refine.rcpp to be used in this R script."),
18 |   make_option(c("-r", "--centromere"), action = "store", type = "character", default = NA,
19 |               help = "Path to file with centromere position of each chromosome."),
20 |   make_option(c("-n", "--plot"), action = "store_true", default = FALSE,
21 |               help = "[optional] Whether to generate diagnosis plots.")
22 | )
23 | 
24 | opt <- parse_args(OptionParser(option_list = option_list))
25 | pars = c(opt$datapath, opt$resultpath, opt$matrixpath,
26 |          opt$rcppfile, opt$centromere, opt$refinescript)
27 | 
28 | if ( any(is.na(pars)) ) {
29 |   stop("All parameters must be supplied. (--help for detail)")
30 | }
31 | 
32 | script_refine   <- opt$refinescript
33 | path_result     <- opt$resultpath
34 | path_matrix     <- opt$matrixpath
35 | path_data       <- opt$datapath
36 | script_rcpp     <- opt$rcppfile
37 | file_centromere <- opt$centromere
38 | flag_plot       <- opt$plot
39 | 
40 | # cnvrs refinement
41 | dat_cnvrs_refine <- read.delim( file = file.path(path_result, "cnvr_refine.txt"), as.is = TRUE ) 
42 | stopifnot( nrow(dat_cnvrs_refine) > 0 )
43 | 
44 | chrs <- sort( unique(dat_cnvrs_refine$chr))
45 | 
46 | cmd <- paste("Rscript", script_refine,
47 |              "--datapath", path_data,
48 |              "--resultpath", path_result,
49 |              "--matrixpath", path_matrix,
50 |              "--rcppfile", script_rcpp,
51 |              "--centromere", file_centromere)
52 | 
53 | if ( flag_plot ) {
54 |   cmd <- paste(cmd, "--plot")
55 | } 
56 | 
57 | for (chr1 in chrs) {
58 |   
59 |   cmd.chr1 <- paste(cmd, 
60 |                     "--chr", chr1)
61 | 
62 |   path_log <- file.path(path_result, "res_refine/chr", chr1, "log")
63 |   if (!dir.exists(path_log)) dir.create(path = path_log, recursive = TRUE)
64 | 
65 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
66 | ## configure based on your system
67 |   bsub.cmd.chr1 <- paste("bsub -n 2 -W 10:00",
68 |                          "-R 'rusage[mem=10000]'",
69 |                          "-P <account>",
70 |                          "-J", paste0("chr", chr1),
71 |                          "-q premium",
72 |                          "-e", file.path(path_log, paste0("boundary_refine_chr", chr1, ".err")),
73 |                          "-o", file.path(path_log, paste0("boundary_refine_chr", chr1, ".log")),
74 |                          shQuote( cmd.chr1 ))
75 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
76 | 
77 |   cat("chr:", chr1, bsub.cmd.chr1, "\n")
78 |   system( bsub.cmd.chr1 )
79 |   Sys.sleep(0.1)
80 | }
81 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/scripts/fun_LRR.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # process cnvrs LRR from snps to sample_based
  4 | process_cnvr_LRR <- function(dt_cnvrs, samples_LRR) {
  5 |   
  6 |   samples <- unique(dt_cnvrs$Sample_ID)
  7 |   # subset samples_LRR
  8 |   samples_LRR1 <- subset(samples_LRR, Sample_ID %in% samples)
  9 |   median_samples_LRR_SD <- median(samples_LRR1$LRR_SD, na.rm = TRUE)
 10 |   # test 
 11 |   stopifnot(nrow(samples_LRR1) == length(samples))
 12 |   
 13 |   res <- data.frame(Sample_ID = samples, CNVR_ID = unique(dt_cnvrs$CNVR_ID),
 14 |                     LRR_median = 0, Chr = unique(dt_cnvrs$Chr), alg = "other",
 15 |                     CN = 2, numSNP = unique(dt_cnvrs$numSNP), stringsAsFactors = FALSE)
 16 |   
 17 |   for (i in 1:length(samples)) {
 18 |     
 19 |     sample1 <- samples[i]
 20 |     
 21 |     idx1 <- which(samples_LRR1$Sample_ID == sample1)
 22 |     sample1_LRR_SD <- samples_LRR1$LRR_SD[idx1]  ##
 23 |     
 24 |     dt1 <- subset(dt_cnvrs, Sample_ID == sample1)
 25 |     
 26 |     LRR_median1 <- median(dt1$LRR, na.rm = TRUE)
 27 |     CN1 <- unique(dt1$CN)
 28 |     alg1 <- unique(dt1$alg)
 29 |     
 30 |     res$LRR_median[i] <- (LRR_median1/sample1_LRR_SD)*median_samples_LRR_SD ## transform
 31 |     # res$LRR_median[i] <- LRR_median1
 32 |     res$CN[i] <- CN1
 33 |     res$alg[i] <- alg1
 34 |   }
 35 |   
 36 |   res
 37 | }
 38 | 
 39 | 
 40 | 
 41 | # calculate LRR gatk whole with pi
 42 | calculate_LRR_gatk_whole <- function(dt_cnvr, mu1, sigma1, lambda1, cn_type) {
 43 |   if(cn_type == 2) { # for all CN = 2 type
 44 |     
 45 |     dt_cnvr$LRR2 <- sapply(1:nrow(dt_cnvr), FUN = function(k) {
 46 |       LRR1 <- dt_cnvr$LRR_median[k]
 47 |       prop1 <- lambda1*dnorm(x = LRR1, mean = mu1, sd = sigma1)
 48 |       prop1
 49 |     })
 50 |     
 51 |   } else if(cn_type == 1) {
 52 |     
 53 |     dt_cnvr$LRR1 <- sapply(1:nrow(dt_cnvr), FUN = function(k) {
 54 |       LRR1 <- dt_cnvr$LRR_median[k]
 55 |       prop1 <- lambda1*dnorm(x = LRR1, mean = mu1, sd = sigma1)
 56 |       prop1
 57 |     })
 58 |     
 59 |   } else if(cn_type == 3) {
 60 |     
 61 |     dt_cnvr$LRR3 <- sapply(1:nrow(dt_cnvr), FUN = function(k) {
 62 |       LRR1 <- dt_cnvr$LRR_median[k]
 63 |       prop1 <- lambda1*dnorm(x = LRR1, mean = mu1, sd = sigma1)
 64 |       prop1
 65 |     })
 66 |     
 67 |   } else if(cn_type == 0) {
 68 |     
 69 |     dt_cnvr$LRR0 <- sapply(1:nrow(dt_cnvr), FUN = function(k) {
 70 |       LRR1 <- dt_cnvr$LRR_median[k]
 71 |       prop1 <- lambda1*dnorm(x = LRR1, mean = mu1, sd = sigma1)
 72 |       prop1
 73 |     })
 74 |     
 75 |   }
 76 |   
 77 |   return(dt_cnvr)
 78 | }
 79 | 
 # Weighted LRR densities of every sample under all four copy-number states.
 #
 # `model$mu`, `model$sigma` and `model$lambda` are read at positions 1..4 for
 # CN = 0..3. One table per state is produced by calculate_LRR_gatk_whole() and
 # the four tables are merged on their shared columns, giving columns
 # LRR0, LRR1, LRR2 and LRR3 next to the original ones.
 output_LRR_gatk <- function(dt_cnvr, model) {
 
   per_state <- lapply(0:3, function(cn) {
     pos <- cn + 1  # model parameters are stored 1-based
     calculate_LRR_gatk_whole(dt_cnvr = dt_cnvr,
                              mu1 = model$mu[pos],
                              sigma1 = model$sigma[pos],
                              lambda1 = model$lambda[pos],
                              cn_type = cn)
   })
 
   # successive merge(): ((CN0 + CN1) + CN2) + CN3
   Reduce(merge, per_state)
 }
105 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/step.4.prediction.results.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressMessages(library(optparse))
  4 | 
  ## Collect per-CNVR prediction files (<resultpath>/pred/chr_<chr>_batch_<batch>/
  ## <CNVR_ID>_pred.rds) into two CNVR x sample matrices (copy number and GQ),
  ## and flag in cnvr_batch.txt which CNVRs were genotyped.
  
  option_list <- list(
    make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA,
                help = "Path to the directory containing necessary input data."),
    make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA,
                help = "Path to the directory for saving results.")
  )
  
  opt <- parse_args(OptionParser(option_list = option_list))
  pars <- c(opt$datapath, opt$resultpath)
  
  if (any(is.na(pars))) {
    stop("All parameters must be supplied. (--help for detail)")
  }
  
  path_data   <- opt$datapath
  path_result <- opt$resultpath
  
  path_pred <- file.path(path_result, "pred")
  
  file_cnvr <- "cnvr_batch.txt"  ## with batch information
  dt_cnvr_raw <- read.delim(file = file.path(path_data, file_cnvr), as.is = TRUE)
  
  ## every (chr, batch) combination that contains at least one CNVR
  tbl_raw <- table(dt_cnvr_raw$chr, dt_cnvr_raw$batch)
  dt_freq_raw <- as.data.frame(tbl_raw)
  names(dt_freq_raw) <- c("chr", "batch", "Freq")
  dt_freq_raw <- subset(dt_freq_raw, Freq != 0)
  
  n_groups <- nrow(dt_freq_raw)
  
  ## prediction directory and prediction files of every (chr, batch) group;
  ## a group whose job produced no output simply yields character(0)
  dirs_pred <- vapply(seq_len(n_groups), function(i) {
    file.path(path_pred,
              paste0("chr_", dt_freq_raw$chr[i], "_batch_", dt_freq_raw$batch[i]))
  }, character(1))
  
  ## anchored pattern: only "<CNVR_ID>_pred.rds" files are prediction results
  pattern_pred <- "_pred\\.rds$"
  preds_by_group <- lapply(dirs_pred, list.files, pattern = pattern_pred)
  
  has_pred <- lengths(preds_by_group) > 0
  if (!any(has_pred)) {
    stop("No prediction files (*_pred.rds) found under: ", path_pred)
  }
  
  ## initialize sample list using the first available prediction file
  ## (the first group is not guaranteed to contain any result)
  first_group <- which(has_pred)[1]
  dat1 <- readRDS(file = file.path(dirs_pred[first_group],
                                   preds_by_group[[first_group]][1]))
  
  samples   <- dat1$Sample_ID
  n_samples <- length(samples)
  
  # row: CNVRs; column: samples -- one matrix per group, bound together once
  blocks_GQ   <- vector("list", n_groups)
  blocks_CN   <- vector("list", n_groups)
  cnvrs_group <- vector("list", n_groups)
  
  for (i in seq_len(n_groups)) {
  
    preds1 <- preds_by_group[[i]]
    if (length(preds1) == 0) {
      next  ## nothing was genotyped for this chr/batch
    }
  
    cnvrs1 <- sub(pattern_pred, "", preds1)
    cnvrs_group[[i]] <- cnvrs1
  
    res1_GQ <- matrix(nrow = length(cnvrs1), ncol = n_samples)
    rownames(res1_GQ) <- cnvrs1
    colnames(res1_GQ) <- samples
    res1_CN <- res1_GQ
  
    for (k in seq_along(preds1)) {
      dat1 <- readRDS(file = file.path(dirs_pred[i], preds1[k]))
  
      ## sort the results according to the order of samples
      dat1 <- dat1[match(samples, dat1$Sample_ID), ]
      stopifnot( all(dat1$Sample_ID == samples) )
  
      res1_GQ[k, ] <- dat1$value_GQ
      res1_CN[k, ] <- dat1$CN_gatk_pred
    }
  
    blocks_GQ[[i]] <- res1_GQ
    blocks_CN[[i]] <- res1_CN
  }
  
  cnvrs <- unlist(cnvrs_group)
  
  ## rbind() skips the NULL entries left by empty groups
  mat_GQ <- do.call(rbind, blocks_GQ)
  mat_CN <- do.call(rbind, blocks_CN)
  
  stopifnot( all(rownames(mat_GQ) == cnvrs) )
  stopifnot( all(colnames(mat_GQ) == samples) )
  stopifnot( all(rownames(mat_CN) == cnvrs) )
  stopifnot( all(colnames(mat_CN) == samples) )
  
  rownames(mat_GQ) <- cnvrs
  rownames(mat_CN) <- cnvrs
  colnames(mat_GQ) <- samples
  colnames(mat_CN) <- samples
  
  ## mark on successfully CNV-genotyped CNVRs
  dt_cnvr_raw$genotype <- 0
  dt_cnvr_raw$genotype[ dt_cnvr_raw$CNVR_ID %in% cnvrs ] <- 1
  
  write.table(dt_cnvr_raw,
              file = file.path(path_result, "cnvr_genotype.txt"),
              quote = FALSE, row.names = FALSE, sep = "\t")
  
  write.table(data.frame(Sample_ID = samples),
              file = file.path(path_result, "sample_genotype.txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
  
  saveRDS(mat_GQ, file = file.path(path_result, "matrix_GQ.rds"))
  saveRDS(mat_CN, file = file.path(path_result, "matrix_CN.rds"))
112 | 
113 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/step.2.submit.jobs.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system
 4 | 
 5 | suppressMessages(library(optparse))
 6 | 
 ## Submit one CNV-genotyping job per (chromosome, batch) listed in
 ## <datapath>/cnvr_batch.txt. Each job runs CNV.genotype.one.chr.one.batch.R,
 ## which is looked up inside the directory given by --script.
 
 option_list <- list(
   make_option(c("-t", "--type"), action = "store", type = "character", default = NA,
               help = "Job submission type (0 - initial submission, 1 - resubmission of failed jobs)"),
   make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA,
               help = "Path to the directory containing necessary input data."),
   make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA,
               help = "Path to the directory for saving results."),
   make_option(c("-m", "--matrixpath"), action = "store", type = "character", default = NA,
               help = "Path to chromosome-wise LRR and BAF matrices."),
   make_option(c("-s", "--sourcefile"), action = "store", type = "character", default = NA,
               help = "Path to the scripts directory containing R scripts to be loaded into R."),
   make_option(c("-d", "--duplicates"), action = "store_true", default = FALSE,
               help = "[optional] Whether duplicate pairs information will be annotated in diagnosis plots."),
   make_option(c("-n", "--plot"), action = "store_true", default = FALSE,
               help = "[optional] Whether to generate diagnosis plots."),
   make_option(c("-r", "--script"), action = "store", type = "character", default = NA,
               help = "Path to the directory containing the main script CNV.genotype.one.chr.one.batch.R."),
   make_option(c("-l", "--joblog"), action = "store", type = "character", default = NA,
               help = "Path to the directory saving job logs.")
 )
 
 opt <- parse_args(OptionParser(option_list = option_list))
 pars <- c(opt$type, opt$datapath, opt$resultpath, opt$joblog,
           opt$matrixpath, opt$sourcefile, opt$script)
 
 if (any(is.na(pars))) {
   stop("All required parameters must be supplied. (--help for detail)")
 }
 
 ## command shared by all jobs; --chr/--batch are appended per job
 script <- file.path(opt$script, "CNV.genotype.one.chr.one.batch.R")
 cmd    <- paste("Rscript", script,
                 "--type", opt$type,
                 "--datapath", opt$datapath,
                 "--resultpath", opt$resultpath,
                 "--matrixpath", opt$matrixpath,
                 "--sourcefile", opt$sourcefile)
 
 if (opt$duplicates) cmd <- paste(cmd, "--duplicates")
 if (opt$plot) cmd <- paste(cmd, "--plot")
 
 ## log directories (recursive creation also creates the job-log root)
 path_joblog    <- opt$joblog
 path_job_error <- file.path(path_joblog, "job", "ERROR")
 path_job_out   <- file.path(path_joblog, "job", "OUT")
 dir.create(path = path_job_error, showWarnings = FALSE, recursive = TRUE)
 dir.create(path = path_job_out, showWarnings = FALSE, recursive = TRUE)
 
 file_cnvr <- "cnvr_batch.txt"  ## with batch information
 dat_cnvr  <- read.delim(file = file.path(opt$datapath, file_cnvr), as.is = TRUE)
 chrs <- sort( unique(dat_cnvr$chr) )
 
 n_failed <- 0
 
 for (chr1 in chrs) {
 
   ## chr1 comes from dat_cnvr itself, so this subset always has rows
   dat_cnvr_chr1 <- subset(dat_cnvr, chr == chr1)
   batch_chr1 <- sort( unique(dat_cnvr_chr1$batch) )
 
   for (batch1 in batch_chr1) {
 
     cat("chr:", chr1, "batch1:", batch1, "\n")
     cmd1 <- paste(cmd, "--chr", chr1, "--batch", batch1)
 
 ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 ## configure based on your system
     bsub.cmd <- paste("bsub -n 2 -W 10:00 -R 'rusage[mem=20000]' -P <account>",
                       "-e", file.path(path_job_error, paste0("chr_", chr1, "_batch_", batch1, ".e")),
                       "-o", file.path(path_job_out, paste0("chr_", chr1, "_batch_", batch1, ".o")),
                       "-q premium", shQuote(cmd1))
 ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 
     cat(bsub.cmd, "\n")
 
     ## system() returns the exit status of the submission command;
     ## report failures instead of silently moving on
     status <- system(bsub.cmd)
     if (status != 0) {
       n_failed <- n_failed + 1
       warning("Job submission failed (exit status ", status, ") for chr ",
               chr1, " batch ", batch1, call. = FALSE, immediate. = TRUE)
     }
 
     Sys.sleep(0.1)
   }
 
 }
 
 if (n_failed > 0) {
   cat("Number of failed job submissions:", n_failed, "\n")
 }
88 | 
89 | 
90 | 
91 | 


--------------------------------------------------------------------------------
/01_initial_call/run_QuantiSNP/README.md:
--------------------------------------------------------------------------------
 1 | ## QuantiSNP
 2 | 
 3 | ### Installation
 4 | 
 5 | To download and install QuantiSNP (version 2), please follow the detailed instructions at the [page](https://sites.google.com/site/quantisnp/downloads), which provides links to download MATLAB Run-Time Component Libraries, QuantiSNP package and GC content data. For more information about QuantiSNP, please refer to their original [QuantiSNP website](https://sites.google.com/site/quantisnp/home).
 6 | 
 7 | After installation, set up environment variable QUANTISNP: `export QUANTISNP='/path/to/quantisnp'`
 8 | 
 9 | Please organize the installation folder in the following way:
10 | 
11 | - MATLAB Run-Time Component Libraries root directory: `${QUANTISNP}/v79/`
12 | - QuantiSNP root directory: `${QUANTISNP}/quantisnp/`
13 | - GC content data (take b37/hg19 data for example) directory: `${QUANTISNP}/data/b37/`
14 | 
15 | Note:
16 | 
 17 | - Running QuantiSNP does not require MATLAB; instead, the developers provide self-contained MATLAB Run-Time Component Libraries to accompany QuantiSNP.
18 | 
19 | - Package libxp6 (e.g. https://packages.ubuntu.com/trusty/libxp6) needs to be installed.
20 | 
 21 | - We have checked that the installation of MATLAB Run-Time Component Libraries and QuantiSNP worked properly on two versions of Linux: CentOS 6.9 with openjdk 6 (the system used on the [Minerva](https://hpc.mssm.edu/) cluster) and Ubuntu 16.04 with openjdk 8. Installing the two components on other systems will probably require some further tweaking.
22 | 
23 | ### Analysis workflow
24 | 
25 | Note: 
26 | 
 27 | - QuantiSNP was originally designed to analyze one sample at a time or a batch of samples sequentially. Please refer to the original QuantiSNP [usage](https://sites.google.com/site/quantisnp/howto) for more details. Here, we provide scripts to run the analysis on multiple samples in parallel via a job submission system (one sample per job) in a cluster environment.
28 | 
 29 | - In the following steps (1) and (2), the job-submission code enclosed by "##<<<... ##>>>..." in the scripts needs to be adapted by the users to the system they are using.
30 | 
31 | We run QuantiSNP analysis with the following 3 steps:
32 | 
 33 | (1) Run QuantiSNP for each sample in parallel (through a job submission system). The `--data` directory is the one generated with `finalreport_to_QuantiSNP.pl`.
 34 | ```sh
 35 | Rscript ${WKDIR}/01_initial_call/run_QuantiSNP/step.1.prepare.QuantiSNP.R \
 36 | --quantisnp ${QUANTISNP} \
 37 | --data ${WKDIR}/01_initial_call/run_QuantiSNP/data \
38 | --sample ${WKDIR}/data/Samples_Table.txt \
39 | --result ${WKDIR}/01_initial_call/run_QuantiSNP/results/res
40 | ```
41 | Note: For details about `Samples_Table.txt`, please check the section [data](https://github.com/HaoKeLab/ensembleCNV#data).
42 | 
 43 | When the analysis is completed, there will be subfolders named after sample IDs, one per sample, created in the directory `${WKDIR}/01_initial_call/run_QuantiSNP/results/res`. Within each sample subfolder, two files (among others) will be generated and used in downstream analysis:
44 | - `<Sample_ID>.qc`: chromosome-level summary statistics, which will be summarized later at sample level and used in checking [batch effect](https://github.com/HaoKeLab/ensembleCNV#pca-on-summary-statistics). 
45 | - `<Sample_ID>.cnv`: raw CNV calls for each sample.
46 | 
 47 | (2) Check job status and resubmit unfinished jobs. The `--data` directory is the one generated with `finalreport_to_QuantiSNP.pl`.
 48 | ```sh
 49 | Rscript ${WKDIR}/01_initial_call/run_QuantiSNP/step.2.check.QuantiSNP.R \
 50 | --quantisnp ${QUANTISNP} \
 51 | --data ${WKDIR}/01_initial_call/run_QuantiSNP/data \
52 | --sample ${WKDIR}/data/Samples_Table.txt \
53 | --result ${WKDIR}/01_initial_call/run_QuantiSNP/results/res
54 | ```
 55 | This step checks whether the jobs submitted for each sample in step (1) completed successfully and resubmits failed jobs if there are any.
56 | 
 57 | (3) Combine QuantiSNP results from each sample, including the content in ".cnv" files
58 | ```sh
59 | perl ${WKDIR}/01_initial_call/run_QuantiSNP/step.3.combine.QuantiSNP.pl \
60 | --in_dir ${WKDIR}/01_initial_call/run_QuantiSNP/results/res \
61 | --out_dir ${WKDIR}/01_initial_call/run_QuantiSNP/results
62 | ```
63 | When the analysis is completed, you will find `quantisnp.cnv`, which will be used by ensembleCNV, in the directory `${WKDIR}/01_initial_call/run_QuantiSNP/results`. `quantisnp.cnv` combines the CNV calls from all samples generated in steps (1) and (2).
64 | 


--------------------------------------------------------------------------------
/02_batch_effect/PCA_on_LRR/step.2.LRR.matrix.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | ## build matrix file for all samples using 100000 selected SNPs
  4 | use strict;
  5 | use Carp;
  6 | 
  ## input: three positional arguments
  @ARGV >= 3 or die "Usage: $0 <selected_snps_file> <finalreport_file> <output_LRR_matrix_file>\n";
  my $file_snps_selected  = $ARGV[0];   ## selected SNPs file from the first step
  my $reportfile          = $ARGV[1];   ## finalreport from Genome Studio
  my $file_matrix_LRR     = $ARGV[2];   ## output LRR matrix file
  
  ## read in selected snps (one SNP name per line)
  open(IN, "<", $file_snps_selected) or die "Error: can't open snps file $file_snps_selected: $!";
  my %snps=();
  while (my $line=<IN>) {
  	## strip both \n and \r so that a CRLF file still matches the SNP names
  	## of the report, whose lines are stripped the same way below
  	$line =~ s/[\r\n]+$//;
  	next if $line eq '';
  	$snps{$line}++;
  }
  close IN;
  
  my @snps=(keys %snps);
  print "total number of SNPs: ".scalar(@snps)."\n";
  
  ## parse the header of final report
  open(REPORT, "<", $reportfile) or die "Error: can't open finalreport $reportfile: $!";
  
  my (@field);
  my ($count_line, $sample_index, $name_index, $LRR_index) = (0); ## only $count_line is initialized
  
  ## skip everything up to and including the "[Data]" marker line
  my $found_data = 0;
  while (<REPORT>) {
  	$count_line++;
  	if (m/^\[Data\]/) {
  		$found_data = 1;
  		last;
  	}
  	$count_line > 1000 and confess "Error: after reading 1000 lines in $reportfile, still cannot find [Data] section. The $reportfile file may not be in Illumina report format.\n";
  }
  $found_data or confess "Error: reached the end of $reportfile without finding [Data] section. The $reportfile file may not be in Illumina report format.\n";
  
  ## the line right after "[Data]" is the column header
  $_ = <REPORT>;
  defined $_ or confess "Error: no header line found after [Data] section in report file $reportfile\n";
  s/[\r\n]+$//;
  $count_line++;
  @field = split (/\t/, $_);
  @field >= 3 or confess "Error: invalid header line (at least 3 tab-delimited fields, including 'SNP Name', 'Sample ID', 'Log R Ratio' expected) in report file $reportfile: <$_>\n";
  
  ## locate the three required columns by name
  for my $i (0 .. @field-1) {
  	$field[$i] eq 'SNP Name' and $name_index = $i;
  	$field[$i] eq 'Sample ID' and $sample_index = $i;
  	$field[$i] eq 'Log R Ratio' and $LRR_index = $i;
  }
  
  defined $name_index or confess "Error: the 'SNP Name' field is not found in header line in report file $reportfile: <$_>\n";
  defined $sample_index or confess "Error: the 'Sample ID' field is not found in header line in report file $reportfile: <$_>\n";
  defined $LRR_index or confess "Error: the 'Log R Ratio' field is not found in header line in report file $reportfile: <$_>\n";
 52 | 
 ## parse data part of final report
 ##
 ## Rows are read one at a time. LRR values of the selected SNPs are appended to
 ## $lrrsample while the rows belong to the sample currently being collected;
 ## the finished string is stored in %hash when the next sample starts or when
 ## the end of the file is reached.
 ## NOTE(review): this presumably relies on all rows of one sample being
 ## contiguous in the report -- confirm against the Genome Studio export.
 my %samples = (); ## sample IDs seen so far (keys are added when a sample is initialized below)
 my %hash = ();    ## sample ID => tab-delimited LRR values at the selected SNPs
 
 my $flagsample = 0;    ## 0 until the first value is appended to an already initialized sample; 1 afterwards
 my $lrrsample = ();    ## tab-delimited LRR values for one sample
 my $SampleIDraw = ();  ## ID of the sample being collected (set only in the append branch)
 my $total = 0;         ## counter of current number of LRR values recorded in $lrrsample
 my $flageof = 0;       ## indicator of eof: =0 not EOF; =1 EOF
 
 while (my $line = <REPORT>) {
 	
 	$flageof = 1 if eof; ## bare eof tests the filehandle last read, i.e. REPORT: true on the last line
 	$line =~ s/[\r\n]+$//; # HC
 
 	my @line=split(/\t/, $line);	
 
 	## transform Log R Ratio
 	## (%snps is still available here: the array @snps built earlier is a separate variable)
 	if (exists($samples{$line[$sample_index]}) && exists($snps{$line[$name_index]})) { ## known sample, selected SNP: append
 
 		my $lrrvalue = $line[$LRR_index];
 		$lrrvalue =~ tr/\015//d;
 		$lrrsample = $lrrsample."\t".$lrrvalue;
 		$flagsample = 1;
 		$SampleIDraw = $line[$sample_index];	
 		$total++;
 
 		## last line of the file: store the sample being collected
 		if ($flageof == 1) {
 			$hash{$SampleIDraw} = $lrrsample;
 			print "SampleID: $SampleIDraw\t".scalar(keys %samples)."\t$total\n";
 			last;
 		}
 
 	} elsif (exists($samples{$line[$sample_index]})) { ## known sample, SNP not selected: skip the row
 			
 		## ... unless it is the last line, in which case the sample is stored first
 		if ($flageof == 1) {
 			$hash{$SampleIDraw} = $lrrsample;
 			print "SampleID: $SampleIDraw\t".scalar(keys %samples)."\t$total\n";
 			last;
 		} else {
 			next;
 		}
 			
 	} else { ## sample ID not seen yet
 
 		## rows of an unseen sample are ignored until its first selected SNP appears
 		## NOTE(review): this branch has no EOF handling, so a sample that starts
 		## on the very last line of the report is never written to %hash.
 		if ($flagsample == 0) {
 			
 			## initialize the first sample
 			## NOTE(review): $flagsample stays 0 until the append branch has run once;
 			## if another new sample starts before that, $lrrsample is overwritten
 			## here without the previous value being stored.
 			if (exists($snps{$line[$name_index]})) {
 				$samples{$line[$sample_index]}++;
 				my $lrrvalue = $line[$LRR_index];
 				$lrrsample = $lrrvalue;
 				$total++;
 			}
 		} elsif ($flagsample == 1) {
 
 			if (exists($snps{$line[$name_index]})) {
 
 				## complete the previous sample
 				$hash{$SampleIDraw} = $lrrsample;
 				print "SampleID: $SampleIDraw\t".scalar(keys %samples)."\t$total\n";
 
 				## initialize another new sample
 				$samples{$line[$sample_index]}++;
 				$lrrsample = ();
 				my $lrrvalue = $line[$LRR_index];
 				$lrrsample = $lrrvalue;
 				$total = 1;
 			}
 		}
 	}
 }
125 | 
## the data loop reads from REPORT (IN was already closed after the SNP list)
close REPORT;

## save LRR matrix: one line per sample, "<sample ID>\t<LRR values>"
open(OUT, ">", $file_matrix_LRR) or die "Error: can't open file $file_matrix_LRR: $!";
foreach my $item (keys %hash) {
	print OUT "$item\t$hash{$item}\n";
}
## buffered write errors (e.g. disk full) only surface when the handle is closed
close OUT or die "Error: can't write file $file_matrix_LRR: $!";
134 | 


--------------------------------------------------------------------------------
/01_initial_call/finalreport_to_matrix_LRR_and_BAF/transform_from_tab_to_rds.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressMessages({
  4 |   require(data.table, quietly = TRUE)
  5 |   require(tibble, quietly = TRUE)
  6 |   require(optparse, quietly = TRUE)
  7 | })
  8 | 
  ## Convert the chromosome-wise tab-delimited LRR/BAF tables found under --input
  ## into .rds files (samples in rows, SNPs in columns ordered by position)
  ## written to <output>/LRR and <output>/BAF.
  
  option_list <- list(
    make_option(c("-i", "--input"), action = "store", default = NA, type = "character",
                help = "path of perl code output"),
    make_option(c("-o", "--output"), action = "store", default = NA, type = "character",
                help = "path to save .rds file"),
    make_option(c("-s", "--startChr"), action = "store", default = 1, type = "integer",
                help = "start Chr name [default %default]"),
    make_option(c("-d", "--endChr"), action = "store", default = 22, type = "integer",
                help = "end Chr name [default %default]")
  )
  
  opt <- parse_args(OptionParser(option_list = option_list))
  
  if (anyNA(c(opt$input, opt$output, opt$startChr, opt$endChr))) {
    stop("All parameters must be supplied. (--help for detail)")
  }
  
  dir_in    <- opt$input
  dir_out   <- opt$output
  chr_first <- opt$startChr
  chr_last  <- opt$endChr
  
  if (!is.integer(chr_first) || !is.integer(chr_last)) {
    stop("parameters startChr and endChr must be integer.")
  }
  
  if (chr_first > chr_last || chr_first < 1 || chr_last > 22) {
    stop("parameters startChr and endChr should satisfy 1 <= startChr <= endChr <= 22.")
  }
  
  # the two signal types are handled identically
  signals <- c("LRR", "BAF")
  
  # create LRR/BAF folder ---------------------------------------------------/
  for (sig in signals) {
    dir_sig <- file.path(dir_out, sig)
    if (!dir.exists(dir_sig)) {
      dir.create(path = dir_sig, showWarnings = FALSE, recursive = TRUE)
    }
  }
  
  # read in annotate files --------------------------------------------------/
  # fread() one annotation table from the input folder and name its columns
  read_annotation <- function(file_name, has_header, col_names) {
    tbl <- fread(input = file.path(dir_in, file_name), header = has_header)
    tbl <- as.data.frame(tbl, stringsAsFactors = FALSE)
    names(tbl) <- col_names
    tbl
  }
  
  snp_names  <- read_annotation("snps_name.txt", FALSE, c("Chr", "SNPs"))
  snp_counts <- read_annotation("snps_number.txt", FALSE, c("Chr", "number"))
  snp_pos    <- read_annotation("SNP_pos.txt", TRUE, c("name", "chr", "position"))
  
  samples_order <- read.table(file = file.path(dir_in, "samples_order.txt"),
                              sep = "\t", header = FALSE, stringsAsFactors = FALSE)
  names(samples_order) <- c("sampleID", "order")
  samples_order <- samples_order[order(samples_order$order), ]
  
  for (chr1 in seq(chr_first, chr_last)) {
  
    cat("chr:", chr1, "\n")
  
    # SNP names in the column order of the .tab files ("___"-separated string)
    snps_in_file <- unlist(strsplit(snp_names$SNPs[snp_names$Chr == chr1],
                                    split = "___", fixed = TRUE))
  
    n_expected <- snp_counts$number[snp_counts$Chr == chr1]
    stopifnot(length(snps_in_file) == n_expected)
  
    # the same SNPs ordered by position
    pos_chr <- snp_pos[snp_pos$name %in% snps_in_file, , drop = FALSE]
    pos_chr <- pos_chr[order(pos_chr$position), ]
  
    snps_by_position <- pos_chr$name
    stopifnot(nrow(pos_chr) == n_expected)
  
    # read in LRR/BAF, validate, and reorder the columns by position
    tables <- lapply(signals, function(sig) {
      tab <- fread(input = file.path(dir_in, sig, paste0(chr1, ".tab")), header = FALSE)
      tab <- as.data.frame(tab, stringsAsFactors = FALSE)
      rownames(tab) <- NULL
  
      # first column holds the sample IDs
      tab <- column_to_rownames(tab, var = "V1")
      stopifnot(ncol(tab) == n_expected)
  
      names(tab) <- snps_in_file
      tab <- tab[, snps_by_position, drop = FALSE]
  
      ## check samples_order
      stopifnot(all(rownames(tab) == samples_order$sampleID))
      tab
    })
    names(tables) <- signals
  
    # nothing is written unless both tables passed their checks
    for (sig in signals) {
      saveRDS(tables[[sig]],
              file = file.path(dir_out, sig, paste0("matrix_chr_", chr1, "_", sig, ".rds")))
    }
  
  }
  
  cat("Analysis completed! The output files are at:", dir_out, "\n")
111 | 
112 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/scripts/fun_BAF.R:
--------------------------------------------------------------------------------
  1 | 
  # BAF emission probability (defined in PennCNV paper)
  #
  # Args:
  #   b:  observed B allele frequency (may be a vector; NA gives NA).
  #   z:  hidden state, a single value:
  #         1 = CN 0 (two copy deletion), 2 = CN 1 (one copy deletion),
  #         3 = CN 2 (normal),            4 = CN 2 (CN-LOH),
  #         5 = CN 3 (one copy duplication).
  #   pB: population frequency of the B allele.
  #
  # Returns:
  #   Plain numeric emission value(s), one per element of `b`.
  #   Stops with an explicit error for any other `z`.
  eBAF <- function (b, z, pB) {
    pib  <- 0.01      # constant term added to every state except z = 1
  
    # component means (B-allele fraction of each genotype class)
    mu0  <- 0.00
    mu13 <- 1.0/3.0
    mu12 <- 0.5
    mu23 <- 2.0/3.0
    mu1  <- 1.00
  
    # component standard deviations
    sd0  <- 0.016372
    sd13 <- 0.045126
    sd12 <- 0.034982
    sd23 <- 0.045126
    sd1  <- 0.016372
  
    # point masses at exactly b == 0 and b == 1
    M0   <- 0.5
    M1   <- 0.5
  
    sd5 <- 0.304243 ## for calculate CN = 0
  
    # Homozygous components: a point mass at the boundary plus a truncated
    # normal inside (0, 1). Logical comparisons act as 0/1 indicators.
    inside <- b > 0 & b < 1
    hom_A <- (b == 0)*M0 + inside*(1-M0)*dnorm(b, mu0, sd0)/(1-pnorm(0,mu0,sd0))
    hom_B <- (b == 1)*M1 + inside*(1-M1)*dnorm(b, mu1, sd1)/pnorm(1,mu1,sd1)
  
    if (z == 1) {
      ## z=1, CN = 0, two copy deletion state
      e <- dnorm(b, mean = mu12, sd = sd5)
    } else if (z == 2 || z == 4) {
      ## z=2, CN=1, one copy deletion state
      ## z=4, CN=2, CN-LOH state (same emission: only homozygous genotypes)
      e <- pib +
        (1 - pib) * (1-pB) * hom_A +
        (1 - pib) * pB     * hom_B
    } else if (z == 3) {
      ## z=3, CN=2, normal copy number state
      e <- pib +
        (1 - pib) * 2*pB*(1-pB) * dnorm(b, mu12, sd12) +
        (1 - pib) * (1-pB)^2    * hom_A +
        (1 - pib) * pB^2        * hom_B
    } else if (z == 5) {
      ## z=5, CN=3, one copy duplication state
      e <- pib +
        (1 - pib) * 3*pB*(1-pB)^2 * dnorm(b, mu13, sd13) +
        (1 - pib) * 3*pB^2*(1-pB) * dnorm(b, mu23, sd23) +
        (1 - pib) * (1-pB)^3      * hom_A +
        (1 - pib) * pB^3          * hom_B
    } else {
      stop("eBAF: unsupported state z = ", z, " (expected 1, 2, 3, 4 or 5)")
    }
  
    return(e)
  }
 64 | 
 # BAF emission value for an integer copy number.
 #
 # Translates CN (0, 1, 2, 3) into the hidden-state index expected by eBAF()
 # (1, 2, 3, 5 respectively) and forwards `b` and `pB1` to it. Any other CN
 # yields an invisible NULL.
 baf_gatk_whole <- function(b, pB1, CN) {
 
   state <- if (CN == 2) {
     3
   } else if (CN == 1) {
     2
   } else if (CN == 3) {
     5
   } else if (CN == 0) {
     1
   } else {
     NULL
   }
 
   if (is.null(state)) {
     return(invisible(NULL))
   }
 
   eBAF(b = b, z = state, pB = pB1)
 }
 79 | 
 # calculate_BAF_gatk_whole CN = 0, 1, 2, 3
 #
 # For every sample, multiplies the per-SNP BAF emission values (from
 # baf_gatk_whole()) across all SNPs, once for each copy-number state.
 #
 # Args:
 #   dt_cnvrs: long table with columns Sample_ID, Name (SNP), PFB, BAF and CN,
 #             holding one row per sample and SNP.
 #
 # Returns:
 #   data frame with one row per sample: Sample_ID, CN, BAF0, BAF1, BAF2, BAF3.
 #   A zero-row input gives a zero-row result.
 calculate_BAF_gatk_whole <- function(dt_cnvrs) {
 
   if (nrow(dt_cnvrs) == 0) {
     return(data.frame(Sample_ID = dt_cnvrs$Sample_ID,
                       CN = dt_cnvrs$CN,
                       BAF0 = numeric(0), BAF1 = numeric(0),
                       BAF2 = numeric(0), BAF3 = numeric(0),
                       stringsAsFactors = FALSE))
   }
 
   dt_cnvrs <- arrange(dt_cnvrs, Sample_ID, Name)
   samples <- unique(dt_cnvrs$Sample_ID)
 
   # after sorting, the first block of rows belongs to the first sample and
   # lists every SNP once, together with its PFB
   idx_first <- seq_len(length(unique(dt_cnvrs$Name)))
   snps <- dt_cnvrs$Name[idx_first]  ## snps
   pfbs <- dt_cnvrs$PFB[idx_first]   ## PFB
 
   cn_states <- 0:3
 
   # one samples x SNPs matrix of emission values per copy-number state
   eps <- lapply(cn_states, function(cn) {
     matrix(data = NA_real_, nrow = length(samples), ncol = length(snps))
   })
 
   for (i in seq_along(snps)) {
 
     snp1 <- snps[i]
     pfb1 <- pfbs[i]
 
     samples_snp <- subset(dt_cnvrs, Name == snp1)
 
     # each SNP must contribute exactly one row per sample, in sample order;
     # otherwise the matrix column would be misaligned or silently recycled
     if (!identical(as.character(samples_snp$Sample_ID), as.character(samples))) {
       stop("SNP ", snp1, " does not have exactly one row per sample")
     }
 
     samples_new <- samples_snp$Sample_ID
     cns_new     <- samples_snp$CN  ## CN
 
     for (s in seq_along(cn_states)) {
       ep <- vapply(samples_snp$BAF, FUN = function(x) {
         as.numeric(baf_gatk_whole(b = x, pB1 = pfb1, CN = cn_states[s]))
       }, FUN.VALUE = numeric(1))
 
       ## NA/NaN emission values are replaced by the median over samples
       ep[is.na(ep)] <- median(ep, na.rm = TRUE)
 
       eps[[s]][, i] <- ep
     }
   }
 
   dt_BAF <- data.frame(Sample_ID = samples_new,
                        CN = cns_new, stringsAsFactors = FALSE)
 
   # product over SNPs for every sample
   dt_BAF$BAF0 <- apply(eps[[1]], MARGIN = 1, prod)
   dt_BAF$BAF1 <- apply(eps[[2]], MARGIN = 1, prod)
   dt_BAF$BAF2 <- apply(eps[[3]], MARGIN = 1, prod)
   dt_BAF$BAF3 <- apply(eps[[4]], MARGIN = 1, prod)
 
   dt_BAF
 }
137 | 
138 | 
139 | 
140 | 


--------------------------------------------------------------------------------
/01_initial_call/run_PennCNV/step.5.clean.PennCNV.res.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## The script was used to run PennCNV on Minerva high performance cluster.
  4 | ## You need to modifiy it according to the system you are using if you would like to use it.
  5 | ## Please refer to original PennCNV documents (http://penncnv.openbioinformatics.org/en/latest/) for more information
  6 | 
  7 | suppressMessages({
  8 |   require( optparse, quietly = TRUE)
  9 | })
 10 | 
 11 | option_list <- list(
 12 |   make_option(c("-p", "--penncnv"), action = "store", default = NA, type = "character",
 13 |               help = "path to PennCNV installation folder."),
 14 |   make_option(c("-i", "--input"), action = "store", default = NA, type = "character",
 15 |               help = "input path for combined PennCNV result."),
 16 |   make_option(c("-f", "--pfb"), action = "store", default = NA, type = "character",
 17 |               help = "pfb file."),
 18 |   make_option(c("-n", "--name"), action = "store", default = "CNV.PennCNV", type = "character",
 19 |               help = "rawcnv filename generated in step (4).")
 20 | )
 21 | 
 22 | opt <- parse_args(OptionParser(option_list = option_list))
 23 | 
 24 | path_penncnv <- opt$penncnv
 25 | path_input   <- opt$input
 26 | file_pfb     <- opt$pfb
 27 | name_rawcnv  <- opt$name
 28 | 
 29 | if (any(is.na(c(path_input, file_pfb, name_rawcnv)))) {
 30 |   stop("All parameters must be supplied.( --help for details )")
 31 | }
 32 | 
 33 | # clean CNV ---------------------------------------------------------------
 34 | 
 35 | setwd(dir = path_input)
 36 | path_clean <- path_input
 37 | name_project <- name_rawcnv
 38 | 
 39 | file_rawcnv <- paste(name_project, "rawcnv", sep = ".")
 40 | file_pfb <- file_pfb
 41 | 
 42 | n_rawcnv <- system(paste("cat", file_rawcnv, "| wc -l"), intern = TRUE)
 43 | n_rawcnv <- as.integer(n_rawcnv)
 44 | 
 45 | cat("CNV number before clean:", n_rawcnv, "\n")
 46 | 
 47 | flag = 0
 48 | idx = 1
 49 | cnv1_in <- file_rawcnv
 50 | while(flag == 0) {
 51 |   
 52 |   n_rawcnv <- as.integer(system(paste("cat", cnv1_in, "|", "wc -l"), intern = TRUE))
 53 |   cnv1_out <- paste(name_project, idx, "rawcnv", sep = ".")
 54 |   
 55 |   cmd1 <- paste(file.path(path_penncnv, "bin/clean_cnv.pl"),
 56 |                 "combineseg", cnv1_in, "--signalfile", file_pfb, 
 57 |                 "--fraction 0.2", "--bp >", cnv1_out)
 58 |   
 59 |   cat("Start run Time:", idx, cmd1, "...\n")
 60 |   system(cmd1)
 61 |   cat("End run ......\n")
 62 |   
 63 |   cmd2 <- paste("cat", cnv1_out, "|", "wc -l")
 64 |   n_newcnv <- system(cmd2, intern = TRUE)
 65 |   n_newcnv <- as.integer(n_newcnv)
 66 |   
 67 |   cat("raw number:", n_rawcnv, "\n")
 68 |   cat("new number:", n_newcnv, "\n")
 69 |   
 70 |   if (n_rawcnv == n_newcnv) {
 71 |     flag = 1
 72 |   } else {
 73 |     cnv1_in <- cnv1_out
 74 |     idx <- idx + 1
 75 |   }
 76 |   
 77 | }
 78 | 
 79 | ## convert final PennCNV results to tab-delimit text file
 80 | cnv_penncnv <- paste(name_project, idx, "rawcnv", sep = ".")
 81 | cnv_tab <- paste(name_project, "txt", sep = ".")
 82 | cat("Convert final PennCNV results to tab-delimit text file.\n")
 83 | cmd.transform <- paste(file.path(path_penncnv, "bin/convert_cnv.pl"),
 84 |                        "--intype", "penncnv", "--outtype", "tab", cnv_penncnv, ">", cnv_tab)
 85 | system(cmd.transform)
 86 | 
 87 | ## extract individual level statistics for QC 
 88 | cat("Extract individual level statistics for QC.\n")
 89 | cnv_log <- paste(name_project, "log", sep = ".")
 90 | cnv_qc <- paste0(name_project, "_qc.txt")
 91 | cmd.extract <- paste(file.path(path_penncnv, "bin/filter_cnv.pl"), cnv_penncnv,
 92 |                      "-qclogfile", cnv_log, "-qcsumout", cnv_qc, ">", "step5.log")
 93 | system(cmd.extract)
 94 | 
 95 | # Change SampleID column information --------------------------------------
 96 | # remove the path before Sample_ID to get a "clean" Sample_ID
 97 | 
 98 | ## CNV results
 99 | dat_CNV <- read.table(file = cnv_tab, sep = "\t",
100 |                       header = FALSE, comment.char = "", check.names = FALSE, stringsAsFactors = FALSE)
101 | samples_path <- dat_CNV$V5
102 | sampleIDs <- unlist(lapply(1:length(samples_path), FUN = function(k) {
103 |   sample1 <- samples_path[k]
104 |   str1 <- unlist(strsplit(sample1, split = "/", fixed = TRUE))
105 |   str1[length(str1)]
106 | }))
107 | dat_CNV$V5 <- sampleIDs  ## change
108 | 
109 | write.table(dat_CNV, file = paste0(name_project, "_new.txt"),
110 |             sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
111 | 
112 | 
113 | ## Sample-wise summary statistics
114 | dat_Sample_Stat <- read.table(file = cnv_qc, sep = "\t",
115 |                               header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
116 | files <- dat_Sample_Stat$File
117 | files_new <- unlist(lapply(1:length(files), FUN = function(k) {
118 |   file1 <- files[k]
119 |   str1  <- unlist(strsplit(file1, split = "/", fixed = TRUE))
120 |   str1[length(str1)]
121 | }))
122 | 
123 | dat_Sample_Stat$File <- files_new
124 | 
125 | write.table(dat_Sample_Stat, file = paste0(name_project, "_qc_new.txt"),
126 |             sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
127 | 
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/06_performance_assessment/step.2.set.GQ.generate.results.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressMessages(library(optparse))
  4 | suppressMessages(library(plyr))
  5 | 
  6 | option_list <- list(
  7 |   make_option(c("-n", "--matrixCN"), action = "store", default = NA,type = "character",
  8 |               help = "Path to matrix of copy number (CN)"),
  9 |   make_option(c("-g", "--matrixGQ"), action = "store", default = NA,type = "character",
 10 |               help = "Path to matrix of genotyping quality (GQ) score."),
 11 |   make_option(c("-c", "--cnvrfile"), action = "store", default = NA, type = "character",
 12 |               help = "Path to CNVR information after boundary refinement."),
 13 |   make_option(c("-o", "--resultpath"), action = "store", default = NA,type = "character",
 14 |               help = "Path to directory for saving assessment results."),
 15 |   make_option(c("-s", "--gqscore"), action = "store", default = NA, type = "integer",
 16 |               help = "Set GQ score threshold.")
 17 | )
 18 | 
 19 | opt <- parse_args(OptionParser(option_list = option_list))
 20 | pars <- c(opt$matrixCN, opt$matrixGQ, opt$cnvrfile,
 21 |           opt$resultpath, opt$gqscore)
 22 | 
 23 | if (any(is.na(pars))) {
 24 |   stop("All required parameters must be supplied. (--help for detail)")
 25 | }
 26 | 
 27 | file_matrixcn <- opt$matrixCN
 28 | file_matrixgq <- opt$matrixGQ
 29 | file_cnvr     <- opt$cnvrfile
 30 | path_result   <- opt$resultpath
 31 | gqscore       <- as.numeric(opt$gqscore)
 32 | 
 33 | matrix_CN <- readRDS(file = file_matrixcn)
 34 | matrix_gq <- readRDS(file = file_matrixgq)
 35 | dat_cnvr  <- read.delim(file = file_cnvr, check.names = FALSE, as.is = TRUE)
 36 | 
 37 | # main --------------------------------------------------------------------
 38 | 
 39 | idxs.nocall = which(matrix_gq < gqscore)
 40 | 
 41 | if (length(idxs.nocall) >= 1) matrix_CN[idxs.nocall] = -9
 42 | 
 43 | cnvrs   <- rownames( matrix_CN )
 44 | samples <- colnames( matrix_CN )
 45 | 
 46 | n_cnvr   <- nrow(matrix_CN)
 47 | n_sample <- ncol(matrix_CN)
 48 | 
 49 | ## cnvr freq
 50 | list_freqs_cnvr <- lapply(1:n_cnvr, FUN = function(k) {
 51 |   v1 <- as.vector(matrix_CN[k, ])
 52 |   data.frame(n = length(v1), 
 53 |              n0 = sum(v1 == 0),
 54 |              n1 = sum(v1 == 1),
 55 |              n2 = sum(v1 == 2),
 56 |              n3 = sum(v1 == 3),
 57 |              n_nocall = sum(v1 == -9))
 58 | })
 59 | 
 60 | freqs_cnvr <- do.call(rbind, list_freqs_cnvr)
 61 | 
 62 | dat_freqs_cnvr <- data.frame(freqs_cnvr, stringsAsFactors = F, check.names = F)
 63 | dat_freqs_cnvr$CNVR_ID <- cnvrs
 64 | 
 65 | dat_freqs_cnvr$callRate <- (dat_freqs_cnvr$n0 + dat_freqs_cnvr$n1 + dat_freqs_cnvr$n2 + dat_freqs_cnvr$n3)/dat_freqs_cnvr$n
 66 | dat_freqs_cnvr$freq     <- (dat_freqs_cnvr$n0 + dat_freqs_cnvr$n1 + dat_freqs_cnvr$n3)/dat_freqs_cnvr$n
 67 | 
 68 | idxs_cnvr_filter <- which(dat_freqs_cnvr$freq == 0)
 69 | 
 70 | if (length(idxs_cnvr_filter) >= 1) {
 71 |   dat_freqs_cnvr <- dat_freqs_cnvr[-idxs_cnvr_filter, ]
 72 | }
 73 | dat_cnvr_final <- merge(dat_cnvr, dat_freqs_cnvr)
 74 | dat_cnvr_final <- dat_cnvr_final[order(dat_cnvr_final$chr, 
 75 |                                        dat_cnvr_final$arm, 
 76 |                                        dat_cnvr_final$posStart,
 77 |                                        dat_cnvr_final$posEnd), ]
 78 | 
 79 | dat_cnvr_final <- dat_cnvr_final[, c("CNVR_ID", "chr", "arm", "posStart", "posEnd", "start_snp", "end_snp", 
 80 |                                      "n", "n0", "n1", "n2", "n3", "n_nocall", "callRate", "freq")]
 81 | dat_cnvr_final <- rename(dat_cnvr_final, c("start_snp"="snpStart", "end_snp"="snpEnd"))
 82 | 
 83 | cat(nrow(dat_cnvr_final), "CNVRs remains from", nrow(n_cnvr), "CNVRs after GQ cut-off.\n")
 84 | 
 85 | write.table(dat_cnvr_final, file = file.path(path_result, "cnvr_after_GQ.txt"),
 86 |             sep = "\t", row.names = F, col.names = T, quote = F)
 87 | 
 88 | matrix_CN_final <- matrix_CN[dat_cnvr_final$CNVR_ID, ]
 89 | saveRDS(matrix_CN_final, file = file.path(path_result, "matrix_CN_after_GQ.rds"))
 90 | 
 91 | # sample information ------------------------------------------------------
 92 | 
 93 | list_samples_info <- lapply(1:ncol(matrix_CN_final), FUN = function(k) {
 94 |   v1 <- as.vector(matrix_CN_final[, k])
 95 |   data.frame(n = length(v1),
 96 |              n0 = sum(v1 == 0),
 97 |              n1 = sum(v1 == 1),
 98 |              n2 = sum(v1 == 2),
 99 |              n3 = sum(v1 == 3),
100 |              n_nocall = sum(v1 == -9))
101 | })
102 | 
103 | samples_info <- do.call(rbind, list_samples_info)
104 | samples_info <- data.frame(samples_info, stringsAsFactors = F, check.names = F)
105 | samples_info$Sample_ID <- samples
106 | 
107 | samples_info$callRate <- (samples_info$n0 + samples_info$n1 + samples_info$n2 + samples_info$n3)/samples_info$n
108 | samples_info$freq     <- (samples_info$n0 + samples_info$n1 + samples_info$n3)/samples_info$n
109 | samples_info <- samples_info[, c("Sample_ID", "callRate", "freq", "n", "n0", "n1", "n2", "n3", "n_nocall")]
110 | 
111 | write.table(samples_info, file = file.path(path_result, "sample_after_GQ.txt"),
112 |             sep = "\t", row.names = F, col.names = T, quote = F)
113 | 
114 | 


--------------------------------------------------------------------------------
/05_boundary_refinement/step.3.clean.results.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressMessages(library(optparse))
  4 | suppressMessages(library(plyr))
  5 | 
  6 | option_list <- list(
  7 |   make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA,
  8 |               help = "Path to the directory for saving results.")
  9 | )
 10 | 
 11 | opt <- parse_args(OptionParser(option_list = option_list))
 12 | 
 13 | pars <- c(opt$resultpath)
 14 | if (any(is.na(pars))) {
 15 |   stop("All parameters must be supplied. (--help for detail)")
 16 | }
 17 | 
 18 | path_result <- opt$resultpath
 19 | # combine refinement results ----------------------------------------------
 20 | path_refine <- file.path(path_result, "res_refine")
 21 | folders_chr1 <- list.files(path = path_refine, pattern = "^chr")
 22 | 
 23 | res_refinement <- data.frame()
 24 | for ( folder.chr1 in folders_chr1 ) {
 25 |   
 26 |   chr1 <- gsub("^chr", "", folder.chr1, perl = T)
 27 |   file.chr1 <- paste("CNVR_refine_chr_", chr1, "_detail.rds", sep = "")
 28 |   path.chr1.data <- file.path(path_refine, folder.chr1, "data")
 29 |   res.chr1 <- readRDS( file = file.path( path.chr1.data, file.chr1) )
 30 |   
 31 |   res_refinement <- rbind(res_refinement, res.chr1)
 32 | }
 33 | 
 34 | ## merge CNVRs with identifcal boundaries after refinement
 35 | res_refinement$identicalID <- paste(res_refinement$Chr,
 36 |                                     res_refinement$snp.start.refine,
 37 |                                     res_refinement$snp.end.refine, sep = "___")
 38 | 
 39 | res_refinement_same   <- subset(res_refinement, type.overlap.based.on.raw == "same")
 40 | res_refinement_refine <- subset(res_refinement, type.overlap.based.on.raw != "same")
 41 | 
 42 | res_refinement_refine <- subset(res_refinement_refine, 
 43 |                                 !identicalID %in% res_refinement_same$identicalID)
 44 | 
 45 | # de-dulplicate CNVR
 46 | res_refinement_refine <- res_refinement_refine[!duplicated(res_refinement_refine$identicalID), ]
 47 | cat("number of CNVRs with refined boundaries:", nrow(res_refinement_refine), "\n")
 48 | 
 49 | cnvrID_refine_same <- res_refinement_same$CNVR_ID
 50 | 
 51 | # clean -------------------------------------------------------------------
 52 | dat_cnvr_keep <- read.delim(file = file.path(path_result, "cnvr_keep.txt"), as.is = TRUE)
 53 | dat_cnvr_keep$identicalID <- paste(dat_cnvr_keep$chr,
 54 |                                    dat_cnvr_keep$start_snp,
 55 |                                    dat_cnvr_keep$end_snp, sep = "___")
 56 | 
 57 | dat_cnvr_refine <- read.delim(file = file.path(path_result, "cnvr_refine.txt"), as.is = TRUE)
 58 | dat_cnvr_refine$identicalID <- paste(dat_cnvr_refine$chr,
 59 |                                    dat_cnvr_refine$start_snp,
 60 |                                    dat_cnvr_refine$end_snp, sep = "___")
 61 | 
 62 | cnvrID_keep <- dat_cnvr_keep$CNVR_ID
 63 | cnvrID_keep_final <- union(cnvrID_keep, cnvrID_refine_same)
 64 | 
 65 | dat_cnvr_keep_after_refine <- rbind(dat_cnvr_keep, dat_cnvr_refine)
 66 | dat_cnvr_keep_after_refine <- subset(dat_cnvr_keep_after_refine, CNVR_ID %in% cnvrID_keep_final)
 67 | 
 68 | ## CNVRs with refined boundaries
 69 | res_refinement_refine_clean <- subset( res_refinement_refine, !identicalID %in% dat_cnvr_keep$identicalID )
 70 | 
 71 | ## CNVRs to be regnotyped after updating boundary information
 72 | dat_cnvr_regt <- subset(dat_cnvr_refine, CNVR_ID %in% res_refinement_refine_clean$CNVR_ID)
 73 | dat_cnvr_regt <- rename(dat_cnvr_regt, 
 74 |                         c("posStart"="posStart.round1",
 75 |                          "posEnd"="posEnd.round1",
 76 |                          "start_snp"="start_snp.round1",
 77 |                          "end_snp"="end_snp.round1",
 78 |                          "batch"="batch.round1",
 79 |                          "genotype"="genotype.round1",
 80 |                          "Freq"="Freq.round1",
 81 |                          "identicalID"="identicalID.round1"))
 82 | dat_cnvr_regt <- merge(dat_cnvr_regt, 
 83 |                        res_refinement_refine_clean[, 
 84 |                          c("CNVR_ID", "identicalID", "snp.posStart.refine", "snp.posEnd.refine", 
 85 |                          "snp.start.refine", "snp.end.refine")],
 86 |                        by = "CNVR_ID")
 87 | stopifnot( nrow(dat_cnvr_regt) == nrow(res_refinement_refine_clean) )
 88 | 
 89 | dat_cnvr_regt <- rename(dat_cnvr_regt,
 90 |                         c("snp.posStart.refine"="posStart", 
 91 |                          "snp.posEnd.refine"="posEnd", 
 92 |                          "snp.start.refine"="start_snp", 
 93 |                           "snp.end.refine"="end_snp"))
 94 | 
 95 | 
 96 | write.table(dat_cnvr_keep_after_refine, 
 97 |             file = file.path(path_result, "cnvr_kept_after_refine.txt"),
 98 |             quote = F, row.names = F, sep = "\t")
 99 | 
100 | write.table(res_refinement_refine_clean, 
101 |             file = file.path(path_result, "cnvr_refined_after_refine.txt"),
102 |             quote = F, row.names = F, sep = "\t")
103 | 
104 | write.table(dat_cnvr_regt, 
105 |             file = file.path(path_result, "cnvr_regenotype_after_refine.txt"),
106 |             quote = F, row.names = F, sep = "\t")
107 | 
108 | 


--------------------------------------------------------------------------------
/01_initial_call/run_PennCNV/step.3.check.PennCNV.jobs.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system
  4 | 
  5 | ## The script was used to run PennCNV on Minerva high performance cluster.
  6 | ## You need to modifiy it according to the system you are using if you would like to use it.
  7 | ## Please refer to original PennCNV documents (http://penncnv.openbioinformatics.org/en/latest/) for more information
  8 | 
  9 | suppressMessages({
 10 |   require( optparse, quietly = TRUE)
 11 | })
 12 | 
 13 | options(warn = 2)
 14 | 
 15 | option_list <- list(
 16 |   make_option(c("-p", "--penncnv"), action = "store", default = NA, type = "character",
 17 |               help = "path to PennCNV installation folder."),  
 18 |   make_option(c("-a", "--data"), action = "store", default = NA, type = "character",
 19 |               help = "path to tab-delimit text data files for each sample."),
 20 |   make_option(c("-d", "--wkdir"), action = "store", default = NA, type = "character",
 21 |               help = "working directory."),
 22 |   make_option(c("-f", "--pfb"), action = "store", default = NA, type = "character",
 23 |               help = "pfb file."),
 24 |   make_option(c("-g", "--gcmodel"), action = "store", default = NA, type = "character",
 25 |               help = "gcmodel file."),
 26 |   make_option(c("-m", "--hmm"), action = "store", default = NA, type = "character",
 27 |               help = "HMM model file.")
 28 | )
 29 | 
 30 | opt = parse_args(OptionParser(option_list = option_list))
 31 | 
 32 | path_penncnv <- opt$penncnv
 33 | path_data    <- opt$data
 34 | path_wkdir   <- opt$wkdir
 35 | file_pfb     <- opt$pfb
 36 | file_gcmodel <- opt$gcmodel
 37 | file_hmm     <- opt$hmm
 38 | 
 39 | if (any(is.na(c(path_data, path_wkdir, file_pfb, file_gcmodel, file_hmm)))) {
 40 |   stop("All parameters must be supplied. (--help for details)")
 41 | }
 42 | 
 43 | path_list  <- file.path(path_wkdir, "list")
 44 | path_res  <- file.path(path_wkdir, "res")  ## PennCNV results folder
 45 | 
 46 | # submit jobs functions ---------------------------------------------------
 47 | 
 48 | cmd_PennCNV <- function(file_hmm, file_pfb, file_gcmodel, 
 49 |                         filename_sample, path_list, path_res_sample) {
 50 |   
 51 |   file_list <- file.path(path_list, sub("\\.txt$", ".list", filename_sample))
 52 |   
 53 |   samplename <- gsub(pattern = "\\.txt$", replacement = "", filename_sample)
 54 |   file_log   <- file.path(path_res_sample, paste0(samplename, ".log"))
 55 |   file_rawcnv <- file.path(path_res_sample, paste0(samplename, ".rawcnv"))
 56 |   
 57 |   cmd <- paste(file.path(path_penncnv, "bin/detect_cnv.pl"), 
 58 |                "-test --confidence",
 59 |                "-hmm", file_hmm,
 60 |                "-pfb", file_pfb,
 61 |                "-gcmodel", file_gcmodel,
 62 |                "-list", file_list,
 63 |                "-log", file_log,
 64 |                "-out", file_rawcnv)
 65 |   
 66 |   cmd
 67 | } 
 68 | 
 69 | cmd_submitjob <- function(cmd.sample, samplename) {
 70 | 
 71 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 72 | ## configure based on your system
 73 |   bsub.cmd <- paste("bsub -n 2 -W 00:30 -R 'rusage[mem=5000]' -P <account>",
 74 |                     "-J", samplename,
 75 |                     "-q premium",
 76 |                     shQuote(cmd.sample))
 77 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 78 | 
 79 |   bsub.cmd
 80 | }
 81 | 
 82 | # main loop ---------------------------------------------------------------
 83 | 
 84 | sample_files <- list.files(path = path_data)
 85 | cat("number of samples:", length(sample_files), "\n")
 86 | 
 87 | n.success <- 0
 88 | n.fail <- 0
 89 | for ( i in 1:length(sample_files) ) {
 90 |   
 91 |   sample_file <- sample_files[i]
 92 |   samplename  <- gsub(pattern = "\\.txt$", replacement = "", sample_file)
 93 |   
 94 |   path_res_sample <- file.path(path_res, samplename)
 95 |   file_rawcnv <- file.path(path_res_sample, paste0(samplename, ".rawcnv"))
 96 |   
 97 |   flag.folder <- dir.exists(paths = path_res_sample)
 98 |   flag.rawcnv <- file.exists(file_rawcnv)
 99 |   
100 |   if ( flag.folder & flag.rawcnv ) {
101 |     cat("Sample_ID:", samplename, "SUCCESS\n")
102 |     n.success <- n.success + 1
103 |   } else {
104 |     
105 |     cat("Sample_ID:", samplename, "FAILED\n")
106 |     dir.create(path = path_res_sample, showWarnings = FALSE, recursive = TRUE)
107 |     
108 |     cmd.sample <- cmd_PennCNV(file_hmm = file_hmm,
109 |                               file_pfb = file_pfb,
110 |                               file_gcmodel = file_gcmodel,
111 |                               filename_sample = sample_file,
112 |                               path_list = path_list,
113 |                               path_res_sample = path_res_sample)
114 |     
115 |     cmd.job   <- cmd_submitjob(cmd.sample = cmd.sample, samplename = samplename)
116 |     
117 |     system(cmd.job)
118 |     Sys.sleep(0.1)
119 |     
120 |     n.fail <- n.fail + 1
121 |     
122 |   }
123 | }
124 | 
125 | cat("total number of samples:", length(sample_files),
126 |     "number of success:", n.success,
127 |     "number of fail:", n.fail, "\n")
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/01_initial_call/run_PennCNV/step.2.run.PennCNV.jobs.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscirpt
  2 | 
  3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system
  4 | 
  5 | ## The script was used to run PennCNV on Minerva high performance cluster.
  6 | ## You need to modifiy it according to the system you are using if you would like to use it.
  7 | ## Please refer to original PennCNV documents (http://penncnv.openbioinformatics.org/en/latest/) for more information 
  8 | 
  9 | suppressMessages({
 10 |   require( optparse, quietly = TRUE)
 11 | })
 12 | 
 13 | option_list <- list(
 14 |   make_option(c("-p", "--penncnv"), action = "store", default = NA, type = "character",
 15 |               help = "path to PennCNV installation folder."),  
 16 |   make_option(c("-a", "--data"), action = "store", default = NA, type = "character",
 17 |               help = "path to tab-delimit text data files for each sample."),
 18 |   make_option(c("-d", "--wkdir"), action = "store", default = NA, type = "character",
 19 |               help = "working directory."),
 20 |   make_option(c("-f", "--pfb"), action = "store", default = NA, type = "character",
 21 |               help = "pfb file."),
 22 |   make_option(c("-g", "--gcmodel"), action = "store", default = NA, type = "character",
 23 |               help = "gcmodel file."),
 24 |   make_option(c("-m", "--hmm"), action = "store", default = NA, type = "character",
 25 |               help = "HMM model file.")
 26 | )
 27 | 
 28 | opt = parse_args(OptionParser(option_list = option_list))
 29 | 
 30 | path_penncnv <- opt$penncnv
 31 | path_data    <- opt$data
 32 | path_wkdir   <- opt$wkdir
 33 | file_pfb     <- opt$pfb
 34 | file_gcmodel <- opt$gcmodel
 35 | file_hmm     <- opt$hmm
 36 | 
 37 | if (any(is.na(c(path_data, path_wkdir, file_pfb, file_gcmodel, file_hmm)))) {
 38 |   stop("All parameters must be supplied. (--help for details)")
 39 | }
 40 | 
 41 | # create path -------------------------------------------------------------
 42 | 
 43 | path_list  <- file.path(path_wkdir, "list")
 44 | path_res   <- file.path(path_wkdir, "res") ## PennCNV raw results folder
 45 | 
 46 | if ( !dir.exists(path_list) ) {
 47 |   dir.create(path = path_list, showWarnings = FALSE, recursive = TRUE)
 48 | }
 49 | if ( !dir.exists(path_res) ) {
 50 |   dir.create(path = path_res, showWarnings = FALSE, recursive = TRUE)
 51 | }
 52 | 
 53 | 
 54 | # generate list.txt for each sample ---------------------------------------
 55 | 
 56 | sample_files <- list.files(path = path_data)
 57 | 
 58 | cat("number of samples:", length(sample_files), "\n")
 59 | 
 60 | for ( i in 1:length(sample_files) ) {
 61 | 
 62 |   sample_file <- sample_files[i]
 63 |   sample_list <- sub("\\.txt$", ".list", sample_file)
 64 |   
 65 |   dat1 <- data.frame(file_name = file.path(path_data, sample_file), ## add whole path information
 66 |                      stringsAsFactors = FALSE)
 67 |   write.table(dat1, file = file.path(path_list, sample_list), sep = "\t",
 68 |               row.names = FALSE, col.names = FALSE, quote = FALSE)
 69 | }
 70 | 
 71 | 
 72 | # cmd_PennCNV -------------------------------------------------------------
 73 | 
 74 | cmd_PennCNV <- function(file_hmm, file_pfb, file_gcmodel,
 75 |                         filename_sample, path_list, path_res_sample) {
 76 | 
 77 |   file_list <- file.path(path_list, sub("\\.txt$", ".list", filename_sample))
 78 | 
 79 |   samplename <- gsub(pattern = "\\.txt$", replacement = "", filename_sample)
 80 |   
 81 |   file_log   <- file.path(path_res_sample, paste0(samplename, ".log"))
 82 |   file_rawcnv <- file.path(path_res_sample, paste0(samplename, ".rawcnv"))
 83 | 
 84 |   cmd <- paste(file.path(path_penncnv, "bin/detect_cnv.pl"),
 85 |                "-test --confidence",
 86 |                "-hmm", file_hmm,
 87 |                "-pfb", file_pfb,
 88 |                "-gcmodel", file_gcmodel,
 89 |                "-list", file_list,
 90 |                "-log", file_log,
 91 |                "-out", file_rawcnv)
 92 |   cmd
 93 | }
 94 | 
 95 | cmd_submitjob <- function(cmd.sample, samplename) {
 96 | 
 97 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 98 | ## configure based on your system
 99 |   bsub.cmd <- paste("bsub -n 2 -W 00:30 -R 'rusage[mem=5000]' -P <account>",
100 |                     "-J", samplename,
101 |                     "-q premium",
102 |                     shQuote(cmd.sample))
103 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
104 |   
105 |   bsub.cmd
106 | }
107 | 
108 | # main loop ---------------------------------------------------------------
109 | 
## Submit one job per sample file. seq_along() (instead of 1:length()) makes
## the loop a no-op for an empty data folder rather than submitting a job for
## a sample named "NA".
for ( i in seq_along(sample_files) ) {

  sample_file <- sample_files[i]
  samplename <- gsub(pattern = "\\.txt$", replacement = "", sample_file)

  ## every sample gets its own result folder below path_res
  path_res_sample <- file.path(path_res, samplename)
  dir.create(path = path_res_sample, showWarnings = FALSE, recursive = TRUE)

  cat("Sample_ID:", samplename, "\n")

  cmd.sample <- cmd_PennCNV(file_hmm = file_hmm,
                            file_pfb = file_pfb,
                            file_gcmodel = file_gcmodel,
                            filename_sample = sample_file,
                            path_list = path_list,
                            path_res_sample = path_res_sample)


  cmd.job   <- cmd_submitjob(cmd.sample = cmd.sample, samplename = samplename)

  system(cmd.job)
  Sys.sleep(0.1)  ## short pause between submissions

}
134 | 
135 | 
136 | 


--------------------------------------------------------------------------------
/01_initial_call/run_QuantiSNP/step.1.prepare.QuantiSNP.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system
  4 | 
  5 | ## The script was used to run QuantiSNP on Minerva high performance cluster.
  6 | ## You need to modifiy it according to the system you are using if you would like to use it.
  7 | ## Please refer to original QuantiSNP documents (https://sites.google.com/site/quantisnp/) for more information 
  8 | 
  9 | ## sample file: in tab-delimited format and has two columns: Sample_ID and Gender
 10 | ## for example
 11 | # Sample_ID	Gender
 12 | # sample_1	Female
 13 | # sample_2	Male
 14 | 
 15 | suppressPackageStartupMessages(require(optparse))
 16 | 
 17 | option_list <- list(
 18 |   make_option(c("-q", "--quantisnp"), action = "store", default = NA, type = "character",
 19 |               help = "path to QuantiSNP installation folder."),  
 20 |   make_option(c("-d", "--data"), action = "store", default = NA, type = "character",
 21 |               help = "data folder for runing QuantiSNP"),
 22 |   make_option(c("-s", "--sample"), action = "store", default = NA, type = "character",
 23 |               help = "sample file with Sample_ID and Gender information for runing QuantiSNP"),            
 24 |   make_option(c("-r", "--result"), action = "store", default = NA, type = "character",
 25 |               help = "output folder for QuantiSNP results")
 26 | )
 27 | 
 28 | opt <- parse_args(OptionParser(option_list = option_list))
 29 | if (is.na(opt$data) | is.na(opt$result)) {
 30 |   stop("All input and output arguments must be supplied.")
 31 | }
 32 | 
 33 | path_quantisnp <- opt$quantisnp
 34 | path_dat       <- opt$data
 35 | sample_file    <- opt$sample
 36 | path_output    <- opt$result
 37 | 
 38 | dat_sample <- read.delim(file = sample_file, as.is = TRUE)
 39 | 
 40 | cat("number of rows of sample table:", nrow(dat_sample), "\n") ## number of samples
 41 | 
 42 | for (i in 1:nrow(dat_sample)) {
 43 |   
 44 |   sample_name <- as.character(dat_sample$Sample_ID[i])
 45 |   gender <- tolower(as.character(dat_sample$Gender[i]))
 46 |   ## must change Female => female and Male => male
 47 |   
 48 |   ## check if folder exists
 49 |   res_files <- list.files(path = file.path(path_output, sample_name))
 50 |   idx <- grep(pattern = "cnv", res_files)
 51 |   if (length(idx) > 0) {
 52 |     cat("i:", sample_name, "\n")
 53 |     next
 54 |   }
 55 |   
 56 |   ## define program variables
 57 |   EMITERS    <- "10"        ## number of EM iterations to use during training
 58 |   LSETTING   <- "2000000"   ## characteristic CNV length parameter
 59 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   
 60 |   GCDIR      <- file.path(path_quantisnp, "data/b37/")                        ## path to GC data files (contents of gc_data.zip)
 61 |   PARAMSFILE <- file.path(path_quantisnp, "quantisnp/config/params.dat")      ## path to parameters file
 62 |   LEVELSFILE <- file.path(path_quantisnp, "quantisnp/config/levels-hd.dat")   ## path to levels file
 63 |   MCRROOT    <- file.path(path_quantisnp, "v79/")                             ## path to MCR Run-Time Libraries
 64 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  
 65 |   CHRRANGE   <- "1:23"   ## chromosomes
 66 |   CHRX       <- "23"     ## which chromosome is X?
 67 |   OUTDIR     <- file.path(path_output, sample_name)    ## output directory
 68 |   SAMPLEID   <- sample_name ## sample name
 69 |   GENDER     <- gender      ## sample gender
 70 |   INFILE     <- file.path(path_dat, paste0(sample_name, ".txt"))   ## input data file generated with finalreport_to_QuantiSNP.pl
 71 | 
 72 |   
 73 |   if (!file.exists(OUTDIR)) dir.create(OUTDIR)
 74 | 
 75 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  
 76 |   cmd <- paste(file.path(path_quantisnp, "quantisnp/linux64/run_quantisnp2.sh"),
 77 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 78 |                MCRROOT, 
 79 |                paste("--chr", CHRRANGE),
 80 |                paste("--outdir", OUTDIR), 
 81 |                paste("--sampleid", SAMPLEID),
 82 |                paste("--gender", GENDER), 
 83 |                paste("--emiters", EMITERS), 
 84 |                paste("--lsetting", LSETTING), 
 85 |                paste("--gcdir", GCDIR),
 86 |                "--plot", 
 87 |                "--genotype", 
 88 |                paste("--config", PARAMSFILE), 
 89 |                paste("--levels", LEVELSFILE), 
 90 |                paste("--input-files", INFILE), 
 91 |                paste("--chrX", CHRX), 
 92 |                "--doXcorrect")
 93 |   
 94 |   job.name <- sample_name
 95 |   log.file <- file.path(OUTDIR, paste0(sample_name, ".quantisnp.log"))
 96 |   err.file <- file.path(OUTDIR, paste0(sample_name, ".quantisnp.err"))
 97 | 
 98 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 99 | ## configure based on your system  
100 |   bsub.cmd <- paste("bsub -n 2 -W 02:00 -R 'rusage[mem=5000]' -P <account>",
101 |                     "-J", job.name,
102 |                     "-q premium",
103 |                     "-oo", log.file,
104 |                     "-eo", err.file ,
105 |                     shQuote(cmd))
106 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
107 | 
108 |   cat("i =", i, bsub.cmd, "\n")
109 |   system(bsub.cmd)
110 |   Sys.sleep(0.1)
111 |   
112 |   cat("i = ", i , sample_name, "\n")
113 | }
114 | 
115 | 


--------------------------------------------------------------------------------
/03_create_CNVR/step.1.CNV.data.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | args <- commandArgs( trailingOnly = TRUE )
  4 | 
  5 | path_output    <- args[1]
  6 | file_ipattern  <- args[2]
  7 | file_penncnv   <- args[3]
  8 | file_quantisnp <- args[4]
  9 | sample_map     <- args[5]
 10 | 
 11 | ## Sample_Map.txt can be generated along with final report from Genome Studio
 12 | ## used to check Sample_ID in the CNV results generated by individual methods
 13 | 
 14 | suppressMessages({
 15 |   require(data.table)
 16 | })
 17 | 
 18 | ## selected columns from CNV results
 19 | col_sel <- c("chr", "posStart", "posEnd", "CN", "Sample_ID", "conf", 
 20 |              "numSNP", "avgConf", "length", "CNV_type", "method")
 21 | 
 22 | sample <- read.delim(file = sample_map, as.is = TRUE)
 23 | 
 24 | # generate data from iPattern, PennCNV, QuantiSNP CNV calls ------------------------
 25 | 
 26 | ## ipattern ---------------------------------------------------------------------
 27 | 
 28 | read_icnv <- function(file_icnv, col_sel, sample) {
 29 |   ## Read iPattern calls; return autosomal Gain/Loss rows with columns col_sel.
 30 |   ## file_icnv: headerless tab-delimited call file (lines starting with "#"
 31 |   ## are skipped); sample: table whose Sample_ID column lists the original IDs.
 32 |   cat("Read in CNV calls from iPattern ...\n")
 33 |   dat <- read.table(file = file_icnv, sep = "\t", check.names = FALSE,
 34 |                     header = FALSE, comment.char = "#", stringsAsFactors = FALSE)
 35 |   names(dat) <- c("CNV_type", "chr", "posStart", "posEnd",
 36 |                   "numSNP", "on_probe.num", "clusterIdx",
 37 |                   "gain_loss_score", "cluster_score", "gain_loss_sample.num",
 38 |                   "conf", "Sample_ID", "CNV_event_ID", "CNVR_ID")
 39 |   dat$length <- dat$posEnd - dat$posStart + 1
 40 |   dat$avgConf <- dat$conf/dat$numSNP
 41 | 
 42 |   # filter chr, CNV_type
 43 |   dat <- subset(dat, chr %in% c(1:22) & CNV_type %in% c("Gain", "Loss"))
 44 |   dat$CN <- ifelse(dat$CNV_type == "Gain", 3, 1)
 45 |   dat$chr <- as.integer(dat$chr)
 46 |   dat$method <- "iPattern"
 47 |   
 48 |   ## remove a leading "X" from the reported IDs
 49 |   dat$Sample_ID <- gsub("^X", "", dat$Sample_ID, perl = TRUE)
 50 | 
 51 |   ## iPattern converts "-" in Sample_ID to "."
 52 |   ## recover the original Sample_ID; every "-" is translated (gsub, not sub)
 53 |   ## so that IDs with more than one hyphen are recovered as well
 54 |   idx <- grep("-", sample$Sample_ID, fixed = TRUE)
 55 |   samples.raw <- sample$Sample_ID[ idx ]
 56 |   samples.alt <- gsub("-", ".", samples.raw, fixed = TRUE)
 57 | 
 58 |   ## vectorised lookup; nothing happens when no Sample_ID contains "-"
 59 |   hit <- match(dat$Sample_ID, samples.alt)
 60 |   dat$Sample_ID[!is.na(hit)] <- samples.raw[hit[!is.na(hit)]]
 61 | 
 62 |   stopifnot( all(dat$Sample_ID %in% sample$Sample_ID) )
 63 |   dat[, col_sel]  ## selected columns
 64 | }
 65 | 
 66 | # merge all groups results
 67 | dat_ipattern <- read_icnv( file_icnv = file_ipattern, col_sel = col_sel, sample = sample )
 68 | 
 69 | write.table( dat_ipattern, 
 70 |              file = file.path(path_output, "cnv.ipattern.txt"),
 71 |              quote = F, row.names = F, sep = "\t")
 72 | 
 73 | 
 74 | # penncnv -----------------------------------------------------------------
 75 | 
 76 | read_pcnv <- function(file_pcnv, col_sel, sample) {
 77 |   ## Load the headerless PennCNV call table and return the autosomal calls
 78 |   ## with CN other than 2, restricted to the columns named in col_sel.
 79 |   cat("Read in CNV calls from PennCNV ...\n")
 80 |   calls <- read.table(file_pcnv, header = FALSE, sep = "\t", comment.char = "",
 81 |                       check.names = FALSE, stringsAsFactors = FALSE)
 82 |   colnames(calls) <- c("chr", "posStart", "posEnd", "CN", "Sample_ID",
 83 |                        "snpStart", "snpEnd", "conf", "numSNP")
 84 |   calls$Sample_ID <- gsub("\\.txt", "", calls$Sample_ID)  ## strip ".txt" from the IDs
 85 |   calls$length <- calls$posEnd - calls$posStart + 1
 86 |   calls$avgConf <- calls$conf / calls$numSNP
 87 |   
 88 |   calls <- calls[which(calls$chr %in% c(1:22) & calls$CN != 2), ]
 89 |   calls$CNV_type <- ifelse(calls$CN > 2, "Gain", "Loss")
 90 |   calls$method <- "PennCNV"
 91 |   calls$CN[which(calls$CN >= 3)] <- 3  ## every CN >= 3 is stored as CN = 3
 92 |   stopifnot( all(calls$Sample_ID %in% sample$Sample_ID) )
 93 |   calls[, col_sel]
 94 | }
 95 | 
 96 | dat_penncnv <- read_pcnv(file_pcnv = file_penncnv, col_sel = col_sel, sample = sample )
 97 | 
 98 | write.table( dat_penncnv, 
 99 |              file = file.path(path_output, "cnv.penncnv.txt"),
100 |              quote = F, row.names = F, sep = "\t")
101 | 
102 | 
103 | # quantisnp ---------------------------------------------------------------
104 | # read from combined CNV results from all individuals
105 | 
106 | read_qcnv <- function(file_qcnv, col_sel, sample) {
107 |   ## Load the combined QuantiSNP calls (one header row) and return the
108 |   ## autosomal calls with CN other than 2, restricted to the columns in col_sel.
109 |   cat("Read in CNV calls from QuantiSNP ...\n")
110 |   calls <- read.table(file_qcnv, header = TRUE, sep = "\t", comment.char = "",
111 |                       check.names = FALSE, stringsAsFactors = FALSE)
112 |   
113 |   ## replace the file's own header; its "Max.log BF" column becomes conf
114 |   bf_states <- paste0("Log_BF.State.", 0:6)
115 |   colnames(calls) <- c("Sample_ID", "chr", "posStart", "posEnd", "snpStart",
116 |                        "snpEnd", "length", "numSNP", "CN", "conf", bf_states)
117 |   
118 |   ## autosomes only, and drop the CN = 2 rows
119 |   keep <- which(calls$chr %in% c(1:22) & calls$CN != 2)
120 |   calls <- calls[keep, ]
121 |   calls$CNV_type <- ifelse(calls$CN > 2, "Gain", "Loss")
122 |   calls$avgConf <- calls$conf / calls$numSNP
123 |   calls$method <- "QuantiSNP"
124 |   calls$CN[which(calls$CN >= 3)] <- 3  # every CN >= 3 is stored as CN = 3
125 |   
126 |   ## all calls must come from samples listed in the sample map
127 |   stopifnot( all(calls$Sample_ID %in% sample$Sample_ID) )
128 |   calls[, col_sel]
129 | }
130 | 
131 | dat_quantisnp <- read_qcnv(file_qcnv = file_quantisnp, col_sel = col_sel, sample = sample )
132 | 
133 | write.table( dat_quantisnp, 
134 |              file = file.path(path_output, "cnv.quantisnp.txt"),
135 |              quote = F, row.names = F, sep = "\t")
136 | 
137 | 


--------------------------------------------------------------------------------
/01_initial_call/run_iPattern/README.md:
--------------------------------------------------------------------------------
 1 | ## iPattern
 2 | 
 3 | ### Installation
 4 | 
 5 | To request the iPattern package, please contact the corresponding author Dr. Stephen W. Scherer (stephen.scherer@sickkids.ca) of the [paper](https://www.ncbi.nlm.nih.gov/pubmed/20531469). For more information about iPattern, please refer to the [paper](https://www.ncbi.nlm.nih.gov/pubmed/?term=21552272).
 6 | 
 7 | After obtaining the package (e.g., ipn.0.581.tar.gz is the version we received), please follow the instructions in the iPattern tutorial enclosed in the package for installation and usage. Here we echo the installation instructions in their tutorial. 
 8 | 
 9 | #### Requirements
10 | - R (2.7.1+)
11 |   - The R "ppc" package – it can be downloaded from http://www-stat.stanford.edu/~tibs/PPC/Rdist/index.html
12 |   - The R "cluster" package (1.15.2+)
13 | - Python (2.5.5+)
14 | 
15 | #### Setup
16 | 
17 | - untar the package file with `tar -zvxf ipn.0.581.tar.gz`
18 | - setup environment:
19 |   - set up environment variable IPNBASE: `export IPNBASE='/path/to/ipn_0.581'`
20 |   - set up environment variable PYTHONPATH: `export PYTHONPATH=$PYTHONPATH:'/path/to/ipn_0.581/ipnlib'`
21 | 
22 | Note: the directory structure/name must be kept as it is. Changing the directory structure will break the iPattern pipeline, because the pipeline finds all the necessary scripts based on IPNBASE and the directory structure. When a PBS job submission system is not available, you can use the `--noqsub` option to run iPattern sequentially.
23 | 
24 | Remark:
25 | 
26 | - In version 0.581, `${IPNBASE}/ipnlib/IpnFormat.py` will process the columns `Allele1 - Forward` and `Allele2 - Forward` in the final report (see the detailed description of [data](https://github.com/HaoKeLab/ensembleCNV#data)). If only the `Allele1 - Top` and `Allele2 - Top` columns exist instead of the `Allele1 - Forward` and `Allele2 - Forward` columns in the final report, the users need to replace the corresponding strings `'Allele1 - Forward'` and `'Allele2 - Forward'` appearing in the `class IPNFormat` block of `${IPNBASE}/ipnlib/IpnFormat.py` with `'Allele1 - Top'` and `'Allele2 - Top'`.
27 | 
28 | - In version 0.581, two reference files `${IPNBASE}/ipn/known.cnvr.txt` and `${IPNBASE}/preprocess/ref_files/pq.txt` in the iPattern package are in hg18. We prepared a hg19 version [here](https://github.com/HaoKeLab/ensembleCNV/tree/master/01_initial_call/run_iPattern/ref_files_hg19) by [LiftOver](https://genome.ucsc.edu/cgi-bin/hgLiftOver). The users can substitute the two files when processing hg19 data. 
29 | 
30 | ### Analysis workflow
31 | 
32 | #### Prepare auxiliary input files
33 | 
34 | In addition to the sample-wise final report files in `${WKDIR}/01_initial_call/run_iPattern/data`, which are supposed to have been generated by `${WKDIR}/01_initial_call/prepare_IPQ_input_file/finalreport_to_iPattern.pl`, three other auxiliary input files for iPattern can be generated as follows.
35 | 
36 | ```sh
37 | PROJECT_NAME=<project_name>
38 | Rscript ${WKDIR}/01_initial_call/run_iPattern/prepare_input_files_for_iPattern.R ${WKDIR} ${PROJECT_NAME}
39 | ```
40 | 
41 | When the processing is completed, three files are supposed to be generated at  `${WKDIR}/01_initial_call/run_iPattern/data_aux`:
42 | 
43 | - `${PROJECT_NAME}_data_file.txt`: lists the absolute path to all the sample-wise final report files in `${WKDIR}/01_initial_call/run_iPattern/data`, so that iPattern knows where to find these data files.
44 | 
45 | - `${PROJECT_NAME}_gender_file.txt`: tab-delimited table including two columns (without column names in table header): Sample ID and Gender ("M" for male and "F" for female), which is generated based on `${WKDIR}/data/Samples_Table.txt` (see the detailed description of [data](https://github.com/HaoKeLab/ensembleCNV#data)).
46 | 
47 | - `${PROJECT_NAME}_bad_samples.txt`: is used to list sample IDs to be excluded from iPattern analysis. We prepared an empty file where the users can type in the sample IDs to be excluded from the analysis if there is any.
48 | 
49 | #### Run iPattern
50 | 
51 | ```sh
52 | ${IPNBASE}/preprocess/ilmn/ilmn_run.py \
53 | --data-file-list   ${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_data_file.txt \
54 | --gender-file      ${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_gender_file.txt \
55 | --bad-sample-file  ${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_bad_samples.txt \
56 | --experiment       $PROJECT_NAME \
57 | --output-directory ${WKDIR}/01_initial_call/run_iPattern/results/ \
58 | --do-log \
59 | --do-cleanup \
60 | --noqsub
61 | ```
62 | When the analysis is completed, you will find two files, which will be used by ensembleCNV, in the directory `${WKDIR}/01_initial_call/run_iPattern/results`:
63 | - `${PROJECT_NAME}_all_calls.txt`: raw CNV calls of all samples.
64 | - `${PROJECT_NAME}_sample.stats.txt`: sample-level summary statistics.
65 | 
66 | Note: 
67 | - All other parameters for `ilmn_run.py` are set by their default values.
68 | - When the sample size of the project is large, the authors of iPattern recommend that the whole dataset be split into batches with balanced sample size in order to control for the number of CNV calls per sample. The batches are analyzed by iPattern independently. In each batch (or iPattern run), a minimum of 90-96 samples (e.g. one 96-well plate of samples) and a maximum of 400 samples are recommended based on the iPattern tutorial. Creating batches can be easily implemented by splitting `${PROJECT_NAME}_data_file.txt` into batch-level data files (e.g. `${PROJECT_NAME}_batch1_data_file.txt`, `${PROJECT_NAME}_batch2_data_file.txt`, etc.) with each batch having a batch-specific project name (e.g., `${PROJECT_NAME}_batch1`, `${PROJECT_NAME}_batch2`, etc.), while `${PROJECT_NAME}_gender_file.txt` and `${PROJECT_NAME}_bad_samples.txt` remain unchanged. When the analysis for all batches is completed, the batch-wise results (e.g., `${PROJECT_NAME}_batch*_all_calls.txt` and `${PROJECT_NAME}_batch*_sample.stats.txt`) will need to be combined into the final results for the whole project (e.g., `${PROJECT_NAME}_all_calls.txt` and `${PROJECT_NAME}_sample.stats.txt`).
69 | 


--------------------------------------------------------------------------------
/01_initial_call/run_QuantiSNP/step.2.check.QuantiSNP.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system
  4 | 
  5 | ## The script was used to run QuantiSNP on Minerva high performance cluster.
  6 | ## You need to modify it according to the system you are using if you would like to use it.
  7 | ## Please refer to original QuantiSNP documents (https://sites.google.com/site/quantisnp/) for more information 
  8 | 
  9 | suppressPackageStartupMessages(require(optparse))
 10 | 
 11 | ## function ------------------------------------------------------------------
 12 | run.quantisnp <- function(path_output, path_dat, sample_name, gender) {
 13 |   ## Submit one sample to QuantiSNP through bsub; path_quantisnp is not an argument and is taken from the enclosing (script-level) environment.
 14 |   ## define program variables
 15 |   EMITERS    <- "10"        ## number of EM iterations to use during training
 16 |   LSETTING   <- "2000000"   ## characteristic CNV length parameter
 17 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 18 |   GCDIR      <- file.path(path_quantisnp, "data/b37/")                        ## path to GC data files (contents of gc_data.zip)
 19 |   PARAMSFILE <- file.path(path_quantisnp, "quantisnp/config/params.dat")      ## path to parameters file
 20 |   LEVELSFILE <- file.path(path_quantisnp, "quantisnp/config/levels-hd.dat")   ## path to levels file
 21 |   MCRROOT    <- file.path(path_quantisnp, "v79/")                             ## path to MCR Run-Time Libraries
 22 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>    
 23 |   CHRRANGE   <- "1:23"   ## chromosome
 24 |   CHRX       <- "23"     ## which chromosome is X?
 25 |   OUTDIR     <- file.path(path_output, sample_name)    ## output directory
 26 |   SAMPLEID   <- sample_name ## sample name
 27 |   GENDER     <- gender      ## sample gender
 28 |   INFILE     <- file.path(path_dat, paste0(sample_name, ".txt"))   ## input data file
 29 |   ## create the per-sample output directory if it does not exist yet
 30 |   if (!file.exists(OUTDIR)) dir.create(OUTDIR)
 31 | 
 32 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  
 33 |   cmd <- paste(file.path(path_quantisnp, "quantisnp/linux64/run_quantisnp2.sh"),
 34 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  
 35 |                MCRROOT, 
 36 |                paste("--chr", CHRRANGE),
 37 |                paste("--outdir", OUTDIR), 
 38 |                paste("--sampleid", SAMPLEID),
 39 |                paste("--gender", GENDER), 
 40 |                paste("--emiters", EMITERS), 
 41 |                paste("--lsetting", LSETTING), 
 42 |                paste("--gcdir", GCDIR),
 43 |                "--plot", 
 44 |                "--genotype", 
 45 |                paste("--config", PARAMSFILE), 
 46 |                paste("--levels", LEVELSFILE), 
 47 |                paste("--input-files", INFILE), 
 48 |                paste("--chrX", CHRX), 
 49 |                "--doXcorrect")
 50 |   ## job name and the stdout/stderr files, both written inside OUTDIR
 51 |   job.name <- sample_name
 52 |   log.file <- file.path(OUTDIR, paste0(sample_name, ".quantisnp.log"))
 53 |   err.file <- file.path(OUTDIR, paste0(sample_name, ".quantisnp.err"))
 54 | 
 55 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 56 |   bsub.cmd <- paste("bsub -n 2 -W 10:00 -R 'rusage[mem=5000]' -P <account>", 
 57 |                     "-J", job.name,
 58 |                     "-q premium",
 59 |                     "-oo", log.file,
 60 |                     "-eo", err.file ,
 61 |                     shQuote(cmd))
 62 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 63 |   ## print the full submission command, then run it through the shell
 64 |   cat(bsub.cmd, "\n")
 65 |   system(bsub.cmd)
 66 |   
 67 | }
 68 | 
 69 | ## ===============================================================================================
 70 | 
 71 | option_list <- list(
 72 |   make_option(c("-q", "--quantisnp"), action = "store", default = NA, type = "character",
 73 |               help = "path to QuantiSNP installation folder."),
 74 |   make_option(c("-d", "--data"), default = NA, type = "character", action = "store",
 75 |               help = "data folder for runing QuantiSNP."),
 76 |   make_option(c("-s", "--sample"), action = "store", default = NA, type = "character",
 77 |               help = "sample file with Sample_ID and Gender information for runing QuantiSNP"), 
 78 |   make_option(c("-r", "--result"), default = NA, type = "character", action = "store",
 79 |               help = "path to CNV results generated in the first step.")
 80 | )
 81 | ## parse the command line; every option defaults to NA when it is not given
 82 | opt <- parse_args(OptionParser(option_list = option_list))
 83 | ## --data, --sample and --result are always needed (the sample table is read right below); --quantisnp is only used when a sample is re-submitted
 84 | if (is.na(opt$data) || is.na(opt$sample) || is.na(opt$result)) {
 85 |   stop("Three input argument must be supplied.")
 86 | }
 87 | 
 88 | # get paras
 89 | path_quantisnp <- opt$quantisnp
 90 | path_data      <- opt$data
 91 | sample_file    <- opt$sample
 92 | path_res       <- opt$result
 93 | 
 94 | dat_sample <- read.delim(file = sample_file, as.is = TRUE)
 95 | ## one row per sample; the Sample_ID and Gender columns are used below
 96 | cat("number of rows of sample table:", nrow(dat_sample), "\n") ## number of samples
 97 | 
 98 | samples <- dat_sample$Sample_ID
 99 | genders <- tolower(dat_sample$Gender)
100 | ## must change Female => female and Male => male
101 | 
102 | n.success <- 0
103 | n.fail <- 0
104 | for (i in seq_along(samples)) {  ## seq_along: no iteration for an empty sample table
105 |   
106 |   sample_name <- samples[i]
107 |   gender <- genders[i]
108 |   path_sample1 <- file.path(path_res, sample_name)
109 |   
110 |   if (dir.exists(paths = path_sample1)) {
111 |     ## success = exactly one file whose name ends in ".cnv"; the dot is escaped
112 |     ## and the match anchored so names that merely contain "cnv" do not count
113 |     files <- list.files(path = path_sample1)
114 |     idx1 <- grep(pattern = "\\.cnv$", files)
115 |     if (length(idx1) == 1) {
116 |       n.success <- n.success + 1
117 |       cat("Sample_ID:", sample_name, "SUCCESS.\n")
118 |     } else {
119 |       n.fail <- n.fail + 1
120 |       cat("Sample_ID:", sample_name, "FAILED.\n")
121 |       run.quantisnp(path_output = path_res, path_dat = path_data, sample_name = sample_name, gender = gender)
122 |     }
123 | 
124 |   } else {
125 |     n.fail <- n.fail + 1  ## no result folder at all: submit the sample again
126 |     cat("Sample_ID:", sample_name, "FAILED.\n")
127 |     run.quantisnp(path_output = path_res, path_dat = path_data, sample_name = sample_name, gender = gender)
128 |   }
129 |   
130 | }
131 | 
132 | cat("total number of samples:", length(samples),
133 |     "number of success:", n.success,
134 |     "number of fail:", n.fail, "\n")
135 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/scripts/fun_plot_steps.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # add dup pairs flag
  4 | add_dup_pairs_flag <- function(dt, dup_pairs) {
  5 |   # Tag rows of dt whose Sample_ID belongs to a duplicate pair.
  6 |   # dup_flag is 0 for unpaired samples, otherwise the row index of the pair in
  7 |   # dup_pairs (columns sample1.name / sample2.name). A NULL or empty dup_pairs
  8 |   # leaves every flag at 0 instead of failing on 1:nrow(dup_pairs).
  9 |   dt$dup_flag <- rep(0, nrow(dt))
 10 |   for (i in seq_len(NROW(dup_pairs))) {
 11 |     samples <- c(dup_pairs$sample1.name[i], dup_pairs$sample2.name[i])
 12 |     dt$dup_flag[dt$Sample_ID %in% samples] <- i
 13 |   }
 14 |   
 15 |   # returns dt with the extra dup_flag column
 16 |   dt
 17 | }
 18 | 
 19 | # transform dt_LRRBAF from gatk to raw method we used LRR12/BAF12 
 20 | add_LRRBAF_ratio <- function(dt) {
 21 |   # Append the natural-log ratios of the columns suffixed 1 and 3 to the
 22 |   # columns suffixed 2, for LRR and BAF alike; existing columns are kept.
 23 |   lrr_ref <- dt$LRR2
 24 |   baf_ref <- dt$BAF2
 25 |   dt$LRR12 <- log(dt$LRR1 / lrr_ref)
 26 |   dt$BAF12 <- log(dt$BAF1 / baf_ref)
 27 |   dt$LRR32 <- log(dt$LRR3 / lrr_ref)
 28 |   dt$BAF32 <- log(dt$BAF3 / baf_ref)
 29 |   dt
 30 | }
 31 | 
 32 | # plot model
 33 | plot_model <- function(paras, dt_cnvr, title) {
 34 |   # Draw a density histogram of dt_cnvr$LRR_median and overlay three weighted
 35 |   # normal curves taken from elements 1-3 of paras$mu, paras$sigma and
 36 |   # paras$lambda (drawn as CN 1, 2 and 3). The subtitle lists the rounded
 37 |   # means and standard deviations. Returns the ggplot object.
 38 |   mu     <- paras$mu[1:3]
 39 |   sigma  <- paras$sigma[1:3]
 40 |   lambda <- paras$lambda[1:3]
 41 |   
 42 |   # 800 evenly spaced points across the observed LRR_median range
 43 |   lrr_range <- range(dt_cnvr$LRR_median)
 44 |   grid <- seq(lrr_range[1], lrr_range[2], length.out = 800)
 45 |   
 46 |   # weighted density of component k on the grid
 47 |   component_curve <- function(k) {
 48 |     data.frame(x = grid,
 49 |                d = lambda[k] * dnorm(grid, mean = mu[k], sd = sigma[k]),
 50 |                CN = k)
 51 |   }
 52 |   curves <- do.call(rbind, lapply(1:3, component_curve))
 53 |   curves$CN <- as.factor(curves$CN)
 54 |   
 55 |   fit_label <- paste("mu1:", round(mu[1], 2), "mu2:", round(mu[2], 2), "mu3:", round(mu[3], 2), "\n",
 56 |                      "sd1:", round(sigma[1], 2), "sd2:", round(sigma[2], 2), "sd3:", round(sigma[3], 2))
 57 |   
 58 |   # histogram first, fitted curves on top
 59 |   fitted_plot <- ggplot(data = dt_cnvr, aes(LRR_median, y = ..density..)) +
 60 |     geom_histogram(bins = 100, fill = NA, color = "black") + 
 61 |     geom_line(data = curves, aes(x, d, col = CN), lwd = 1.5) + 
 62 |     theme_bw(base_size = 10) +
 63 |     labs(title = title,
 64 |          subtitle = fit_label)
 65 |   
 66 |   fitted_plot
 67 | }
 68 | 
 69 | 
 70 | # plot steps
 71 | plot_steps <- function(dt_cnvr_train, dup_pairs, paras, dt_cnvr_raw, dt_LRRBAF) {
 72 |   # Build four diagnostic plots for one region, arrange them on two rows and return the grid.arrange() result.
 73 |   dt_cnvr_train <- dt_cnvr_train[order(dt_cnvr_train$CN), ]
 74 |   dt_cnvr_train$idx <- 1:nrow(dt_cnvr_train)
 75 |   # same ordering by CN and running index for the raw table
 76 |   dt_cnvr_raw <- dt_cnvr_raw[order(dt_cnvr_raw$CN), ]
 77 |   dt_cnvr_raw$idx <- 1:nrow(dt_cnvr_raw)
 78 |   # rows belonging to duplicate pairs; both tables stay empty when dup_pairs is NULL (dt_dup is not used further below)
 79 |   dt_dup     <- data.frame()
 80 |   dt_dup_raw <- data.frame()
 81 |   if (! is.null(dup_pairs) ) {
 82 |     # add flag for dup
 83 |     dt_cnvr_train <- add_dup_pairs_flag(dt = dt_cnvr_train, dup_pairs = dup_pairs)
 84 |     dt_dup <- subset(dt_cnvr_train, dup_flag != 0)
 85 |     
 86 |     dt_cnvr_raw <- add_dup_pairs_flag(dt = dt_cnvr_raw, dup_pairs = dup_pairs)
 87 |     dt_dup_raw <- subset(dt_cnvr_raw, dup_flag != 0)
 88 |   }
 89 |   # distinct numSNP value(s) of the raw table, pasted into the title of plot1
 90 |   numsnp <- unique(dt_cnvr_raw$numSNP)
 91 |   # plot1: LRR_median against idx, coloured by CN; duplicate rows are labelled with their dup_flag
 92 |   plot1 <- ggplot(data = dt_cnvr_raw, aes(idx, LRR_median, col = factor(CN))) + 
 93 |     geom_point() + 
 94 |     theme_bw(base_size = 10) +
 95 |     annotate("text", x = dt_dup_raw$idx, y = dt_dup_raw$LRR_median, label = dt_dup_raw$dup_flag) + 
 96 |     labs(title = paste("scatter plot of LRR_median with numsnp:", numsnp)) + 
 97 |     theme(legend.position = "top")
 98 |   
 99 |   # plot1
100 |   
101 |   # add gmm model paras: plot2 overlays the curves from paras on the training-table histogram (see plot_model)
102 |   plot2 <- plot_model(dt_cnvr = dt_cnvr_train, paras = paras, 
103 |                       title = "fit model for LRR_median, only contain CN = 1/2/3")
104 |   
105 |   # plot2
106 |   
107 |   # --------------------------------------------------
108 |   # add steps information
109 |   # keep only rows with CN_gatk_pred != 0
110 |   dt_LRRBAF <- subset(dt_LRRBAF, CN_gatk_pred != 0)
111 |   # NOTE(review): unlike the calls above, this one is not guarded by is.null(dup_pairs)
112 |   dt_LRRBAF <- add_dup_pairs_flag(dt = dt_LRRBAF, dup_pairs = dup_pairs)
113 |   dt_LRRBAF_new <- add_LRRBAF_ratio(dt = dt_LRRBAF)
114 |   
115 |   # step1: all rows in gray, rows with CN_gatk_pred == 1 in red, duplicate rows labelled with dup_flag
116 |   dt1_gatk <- subset(dt_LRRBAF_new, CN_gatk_pred == 1)
117 |   dt1_annotate <- subset(dt_LRRBAF_new, dup_flag != 0)
118 |   plot_step1 <- ggplot() + 
119 |     geom_point(data = dt_LRRBAF_new, aes(BAF12, LRR12), col = "gray") + 
120 |     geom_vline(xintercept = 0, lty = 2, lwd = 1) + 
121 |     geom_hline(yintercept = 0, lty = 2, lwd = 1) + 
122 |     theme_bw(base_size = 10) +
123 |     geom_point(data = dt1_gatk, aes(BAF12, LRR12), col = "red") + 
124 |     annotate(geom = "text", x = dt1_annotate$BAF12, y = dt1_annotate$LRR12, label = dt1_annotate$dup_flag) + 
125 |     ggtitle(label = "step 1 for CN = 1")
126 |   
127 |   # step2: rows with LRR12 <= 0 or BAF12 <= 0 in gray; the red points are all CN_gatk_pred == 3 rows
128 |   dt3 <- subset(dt_LRRBAF_new, LRR12 <= 0 | BAF12 <= 0)
129 |   dt3_gatk <- subset(dt_LRRBAF_new, CN_gatk_pred == 3)
130 |   dt3_annotate <- subset(dt3, dup_flag != 0)
131 |   plot_step2 <- ggplot() + 
132 |     geom_point(data = dt3, aes(BAF32, LRR32), col = "gray") + 
133 |     geom_vline(xintercept = 0, lty = 2, lwd = 1) + 
134 |     geom_hline(yintercept = 0, lty = 2, lwd = 1) + 
135 |     theme_bw(base_size = 10) +
136 |     geom_point(data = dt3_gatk, aes(BAF32, LRR32), col = "red") + 
137 |     annotate(geom = "text", x = dt3_annotate$BAF32, y = dt3_annotate$LRR32, label = dt3_annotate$dup_flag) + 
138 |     ggtitle(label = "step 2 for CN = 3")
139 |   
140 |   # arrange the four panels on two rows
141 |   ps <- gridExtra::grid.arrange(plot1, plot_step1, plot2, plot_step2, nrow = 2)
142 |   
143 |   return(ps)
144 | }
145 | 
146 | 
147 | # plot model_final
148 | plot_model_final <- function(paras, dt_cnvr, title) {
149 |   # Density histogram of dt_cnvr$LRR_median with three weighted normal curves
150 |   # on top. Unlike plot_model(), the curves use elements 2-4 of paras$mu,
151 |   # paras$sigma and paras$lambda (drawn as CN 1, 2 and 3), and the three
152 |   # weights are rescaled so that they sum to one. The subtitle lists the
153 |   # rounded means and standard deviations. Returns the ggplot object.
154 |   
155 |   # elements 2-4 of the fitted parameters
156 |   mu     <- paras$mu[2:4]
157 |   sigma  <- paras$sigma[2:4]
158 |   lambda <- paras$lambda[2:4]
159 |   
160 |   # transform lambdas: renormalise over the three plotted components
161 |   lambda_total <- lambda[1] + lambda[2] + lambda[3]
162 |   lambda <- lambda / lambda_total
163 |   
164 |   # 800 evenly spaced points across the observed LRR_median range
165 |   lrr_range <- range(dt_cnvr$LRR_median)
166 |   grid <- seq(lrr_range[1], lrr_range[2], length.out = 800)
167 |   
168 |   # weighted density of plotted component k on the grid
169 |   component_curve <- function(k) {
170 |     data.frame(x = grid,
171 |                d = lambda[k] * dnorm(grid, mean = mu[k], sd = sigma[k]),
172 |                CN = k)
173 |   }
174 |   curves <- do.call(rbind, lapply(1:3, component_curve))
175 |   curves$CN <- as.factor(curves$CN)
176 |   
177 |   # subtitle text
178 |   fit_label <- paste("mu1:", round(mu[1], 2), "mu2:", round(mu[2], 2), "mu3:", round(mu[3], 2), "\n",
179 |                      "sd1:", round(sigma[1], 2), "sd2:", round(sigma[2], 2), "sd3:", round(sigma[3], 2))
180 |   
181 |   # histogram first, fitted curves on top
182 |   fitted_plot <- ggplot(data = dt_cnvr, aes(LRR_median, y = ..density..)) +
183 |     geom_histogram(bins = 100, fill = NA, color = "black") + 
184 |     geom_line(data = curves, aes(x, d, col = CN), lwd = 1.5) + 
185 |     theme_bw(base_size = 10) +
186 |     labs(title = title,
187 |          subtitle = fit_label)
188 |   
189 |   fitted_plot
190 | }
191 | 


--------------------------------------------------------------------------------
/02_batch_effect/PCA_on_summary_stats/step.1.prepare.stats.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | args <- commandArgs( trailingOnly = TRUE )
  4 | 
  5 | path_ipattern  <- args[1]
  6 | path_penncnv   <- args[2]
  7 | path_quantisnp <- args[3]
  8 | path_output    <- args[4]
  9 | 
 10 | suppressMessages({
 11 |   require(data.table)
 12 | })
 13 | 
 14 | # ipattern ----------------------------------------------------------------
 15 | ## for number of samples larger than 500, samples may need to be splited into batches to run ipattern
 16 | read_ipattern_batch <- function(path_ipattern) {
 17 |   ## Per-sample iPattern summary: LRR SD from *_sample.stats.txt plus the number
 18 |   ## of autosomal calls from *_all_calls.txt; both files are looked up in
 19 |   ## path_ipattern. Returns the merge of the two tables on Sample_ID.
 20 |   ## Stops with an explicit message unless exactly one file matches each name.
 21 |   find_one <- function(suffix) {
 22 |     hit <- list.files(path = path_ipattern, pattern = paste0(suffix, "$"))
 23 |     if (length(hit) != 1) {
 24 |       stop("expected exactly one '*", suffix, "' file in ", path_ipattern,
 25 |            ", found ", length(hit), call. = FALSE)
 26 |     }
 27 |     file.path(path_ipattern, hit)
 28 |   }
 29 |   
 30 |   ## NumCNV: number of autosomal calls per sample
 31 |   dat <- read.table(file = find_one("_all_calls.txt"),
 32 |                     header = FALSE, sep = "\t", comment.char = "#", as.is = TRUE)
 33 |   names(dat) <- c("CNV_type", "chr", "posStart", "posEnd",
 34 |                   "numSNP", "on_probe.num", "clusterIdx",
 35 |                   "gain_loss_score", "cluster_score", "gain_loss_sample.num",
 36 |                   "conf", "Sample_ID", "CNV_event_ID", "CNVR_ID")
 37 |   dat <- subset(dat, chr %in% c(1:22))
 38 |   dat_tbl <- as.data.frame(table(dat$Sample_ID))
 39 |   names(dat_tbl) <- c("Sample_ID", "iPattern.NumCNV")
 40 |   
 41 |   ## sample.stats.txt: only the first nrow(dat_tbl) rows are read
 42 |   dat_stat <- read.table(file = find_one("_sample.stats.txt"), header = FALSE,
 43 |                          sep = "\t", nrows = nrow(dat_tbl), as.is = TRUE)
 44 |   names(dat_stat) <- c("Sample_ID", "iPattern.LRR_SD", "iPattern.base_CN")
 45 |   dat_stat <- dat_stat[, c("Sample_ID", "iPattern.LRR_SD")]
 46 |   
 47 |   ## clean sample ID: remove path information, remove suffix ".rescale"
 48 |   samples <- basename(as.character(dat_stat$Sample_ID))
 49 |   dat_stat$Sample_ID <- gsub("\\.rescale$", "", samples)
 50 |   
 51 |   res <- merge(dat_stat, dat_tbl)
 52 |   ## if Sample_ID starts with number
 53 |   res$Sample_ID <- gsub(pattern = "^X", replacement = "", res$Sample_ID, perl = TRUE)  ## check
 54 |   
 55 |   res
 56 | }
 57 | 
 58 | cat("Processing iPattern results ...\n")
 59 | dat_stats_ipattern <- read_ipattern_batch(path_ipattern = path_ipattern)
 60 | 
 61 | write.table(dat_stats_ipattern, 
 62 |             file = file.path(path_output, "ipattern.stats.txt"),
 63 |             quote = F, row.names = F, sep = "\t")
 64 | cat("Done.\n")
 65 | 
 66 | # penncnv sample-level -----------------------------------------------------
 67 | cat("Processing PennCNV results ...\n")
 68 | dat_penncnv <- read.table(file = file.path(path_penncnv, "CNV.PennCNV_qc_new.txt"),
 69 |                           sep = "\t",
 70 |                           header = TRUE,
 71 |                           check.names = FALSE,
 72 |                           stringsAsFactors = FALSE)
 73 | dat_penncnv$File <- gsub("\\.txt$", "", dat_penncnv$File, perl = TRUE) 
 74 | dat_penncnv$WF <- abs(dat_penncnv$WF)
 75 | 
 76 | fp <- c( "LRR_SD", "BAF_SD", "BAF_drift", "WF", "NumCNV" )
 77 | dat_penncnv <- dat_penncnv[, c("File", fp)]
 78 | names(dat_penncnv) <- c("Sample_ID", paste("PennCNV", fp, sep = "."))
 79 | 
 80 | dat_stats_penncnv <- dat_penncnv
 81 | 
 82 | write.table(dat_stats_penncnv, 
 83 |             file = file.path(path_output, "penncnv.stats.txt"),
 84 |             quote = F, row.names = F, sep = "\t")
 85 | cat("Done.\n")
 86 | 
 87 | # quantisnp ---------------------------------------------------------------
 88 | read_quantisnp_per_sample <- function(path_res, sample_id) {
 89 |   ## Summarise one sample folder <path_res>/<sample_id>: number of autosomal
 90 |   ## calls in <sample_id>.cnv and mean LRR/BAF SD over autosomes in <sample_id>.qc.
 91 |   sample_dir <- file.path(path_res, sample_id)
 92 |   qc_cols <- c("Sample_ID", "Chr", "OutlierRate", "LRR_SD", "BAF_SD", "Gender")
 93 |   
 94 |   ## get numCNV
 95 |   dat_cnv <- fread(input = file.path(sample_dir, paste0(sample_id, ".cnv")))
 96 |   numCNV <- sum(dat_cnv$Chromosome %in% c(1:22))
 97 |   
 98 |   ## get LRR.SD and BAF.SD
 99 |   ## Note: in the .qc file, QuantiSNP has formatting issue
100 |   ##       the column name "Gender" is written at the start of the second line,
101 |   ##       so that line is read on its own and its first field is discarded
102 |   file_qc <- file.path(sample_dir, paste0(sample_id, ".qc"))
103 |   first_row <- read.table(file = file_qc, skip = 1, nrows = 1, header = FALSE, stringsAsFactors = FALSE)
104 |   first_row <- first_row[, -1]
105 |   names(first_row) <- qc_cols
106 |   other_rows <- read.table(file = file_qc, skip = 2, header = FALSE, stringsAsFactors = FALSE)
107 |   names(other_rows) <- qc_cols
108 |   
109 |   qc <- rbind(first_row, other_rows)
110 |   qc <- qc[which(qc$Chr %in% c(1:22)), ]
111 |   
112 |   ## one output row per distinct Sample_ID found in the .qc file
113 |   res1 <- data.frame(Sample_ID = unique(qc$Sample_ID),
114 |                      QuantiSNP.NumCNV = numCNV,
115 |                      QuantiSNP.LRR_SD = mean(qc$LRR_SD, na.rm = TRUE),
116 |                      QuantiSNP.BAF_SD = mean(qc$BAF_SD, na.rm = TRUE),
117 |                      stringsAsFactors = FALSE)
118 |   res1
119 | }
120 | 
## Collect QuantiSNP summary statistics for every sample under `path_res`.
##
## path_res : directory whose entries are treated as sample IDs; each entry
##            is passed to read_quantisnp_per_sample().
##
## Returns the per-sample results stacked into one data.frame, or an empty
## data.frame when `path_res` has no entries.
read_quantisnp <- function(path_res) {

  samples <- list.files(path = path_res)

  ## iterate over the entries themselves, so an empty directory yields no
  ## iterations (1:length(samples) would run with indices 1 and 0)
  per_sample <- lapply(samples, function(sample1) {
    read_quantisnp_per_sample(path_res = path_res, sample_id = sample1)
  })

  if (length(per_sample) == 0) {
    return(data.frame())
  }

  ## bind once instead of growing the result inside the loop
  res <- do.call(rbind, per_sample)
  rownames(res) <- NULL
  res
}
135 | 
cat("Processing QuantiSNP results ...\n")
dat_stats_quantisnp <- read_quantisnp(path_res = path_quantisnp)

write.table(dat_stats_quantisnp,
            file = file.path(path_output, "quantisnp.stats.txt"),
            quote = FALSE, row.names = FALSE, sep = "\t")
## terminate the line so the next status message starts on its own line
cat("Done.\n")
143 | 
144 | 
# IPQ ---------------------------------------------------------------------

cat("Combine summary statistics from different methods ...\n")
## iPattern converts "-" in Sample_ID to "."
## recover the original Sample_ID, using the PennCNV table as the reference
samples.raw <- grep("-", dat_stats_penncnv$Sample_ID, fixed = TRUE, value = TRUE)
## replace every "-" (not only the first) so IDs with several hyphens
## are matched as well
samples.alt <- gsub("-", ".", samples.raw, fixed = TRUE)

## vectorised lookup; nothing is touched when no Sample_ID contains "-"
idx_alt <- match(dat_stats_ipattern$Sample_ID, samples.alt)
is_converted <- !is.na(idx_alt)
dat_stats_ipattern$Sample_ID[is_converted] <- samples.raw[idx_alt[is_converted]]

## merge on the shared column(s); every iPattern sample must be present in
## the PennCNV table, and every merged sample in the QuantiSNP table
res_IP <- merge(dat_stats_ipattern, dat_stats_penncnv)
stopifnot( nrow(res_IP) == nrow(dat_stats_ipattern))

res_IPQ <- merge(res_IP, dat_stats_quantisnp)
stopifnot( nrow(res_IPQ) == nrow(res_IP) )

write.table(res_IPQ,
            file = file.path(path_output, "IPQ.stats.txt"),
            quote = FALSE, row.names = FALSE, sep = "\t")
cat("Done.\n")
169 | 
170 | 
171 | 
172 | 
173 | 


--------------------------------------------------------------------------------
/example/example_create_CNVR/results/cnv.ipattern.txt:
--------------------------------------------------------------------------------
 1 | chr	posStart	posEnd	CN	Sample_ID	conf	numSNP	avgConf	length	CNV_type	method
 2 | 1	25598276	25642596	1	Sample86.1_3R05C01	19.07	8	2.38375	44321	Loss	iPattern
 3 | 1	25598276	25642596	1	Sample6.1_4R07C01	16.2	8	2.025	44321	Loss	iPattern
 4 | 1	25598276	25642596	1	Sample143.1_5R06C01	19.25	8	2.40625	44321	Loss	iPattern
 5 | 1	25598276	25642596	1	Sample62.1_7R02C01	27	8	3.375	44321	Loss	iPattern
 6 | 1	25598276	25642596	1	Sample35.1_9R01C01	16.59	8	2.07375	44321	Loss	iPattern
 7 | 1	25598276	25642596	1	Sample60.1_9R03C01	17.84	8	2.23	44321	Loss	iPattern
 8 | 1	25598276	25642596	1	Samplec9.1_38R05C01	26.89	8	3.36125	44321	Loss	iPattern
 9 | 1	25598276	25642596	1	Sample289.1_16R04C01	27.79	8	3.47375	44321	Loss	iPattern
10 | 1	25598276	25642596	1	Sample651.1_3R05C01	14.83	8	1.85375	44321	Loss	iPattern
11 | 1	25598276	25642596	1	Sample496.1_40R07C01	29.96	8	3.745	44321	Loss	iPattern
12 | 1	25598276	25642596	1	Sample130.1_18R01C01	17.4	8	2.175	44321	Loss	iPattern
13 | 1	25598276	25642596	1	Sample217.1_18R08C01	20.28	8	2.535	44321	Loss	iPattern
14 | 1	25598276	25642596	1	Sample242.1_20R01C01	22.59	8	2.82375	44321	Loss	iPattern
15 | 1	25598276	25642596	1	Sample520.1_41R08C01	33.39	8	4.17375	44321	Loss	iPattern
16 | 1	25598276	25642596	1	Sample66.1_12R06C01	31.81	8	3.97625	44321	Loss	iPattern
17 | 1	25598276	25642596	1	Sample177.1_21R05C01	25.54	8	3.1925	44321	Loss	iPattern
18 | 1	25598276	25642596	1	Sample160.1_22R02C01	20.9	8	2.6125	44321	Loss	iPattern
19 | 1	25598276	25642596	1	Samplec119.1_18R02C01	19.55	8	2.44375	44321	Loss	iPattern
20 | 1	25598276	25642596	1	Sample138.1_23R06C01	23.52	8	2.94	44321	Loss	iPattern
21 | 1	25598276	25642596	1	Sample390.1_24R07C01	17.26	8	2.1575	44321	Loss	iPattern
22 | 1	25598276	25642596	1	SampleY-37.1_11R02C01	20.91	8	2.61375	44321	Loss	iPattern
23 | 1	25598276	25642596	1	Sample212.1_25R06C01	26.63	8	3.32875	44321	Loss	iPattern
24 | 1	25598276	25642596	1	Sample686.1_9R03C01	18.67	8	2.33375	44321	Loss	iPattern
25 | 1	25598276	25642596	1	Sample668.1_14R05C01	19.34	8	2.4175	44321	Loss	iPattern
26 | 1	25598276	25642596	1	Sample256.1_28R07C01	25.32	8	3.165	44321	Loss	iPattern
27 | 1	25598276	25642596	1	Sample200.1_29R07C01	32.4	8	4.05	44321	Loss	iPattern
28 | 1	25598276	25642596	1	SampleY-70.1_4R01C01	20.15	8	2.51875	44321	Loss	iPattern
29 | 1	25598276	25642596	1	Sample235.1_30R03C01	17.06	8	2.1325	44321	Loss	iPattern
30 | 1	25598276	25642596	1	Sample283.1_30R06C01	21.45	8	2.68125	44321	Loss	iPattern
31 | 1	25598276	25642596	1	SampleY-41.1_1R05C01	25.78	8	3.2225	44321	Loss	iPattern
32 | 1	25598276	25642596	1	Sample556.1_43R03C01	26.36	8	3.295	44321	Loss	iPattern
33 | 1	25598276	25642596	1	Sample592.1_43R06C01	28.9	8	3.6125	44321	Loss	iPattern
34 | 1	25598276	25642596	1	Sample527.1_44R01C01	17.92	8	2.24	44321	Loss	iPattern
35 | 1	25598276	25642596	1	Sample550.1_46R07C01	19.96	8	2.495	44321	Loss	iPattern
36 | 1	25598276	25642596	1	Sample433.1_48R01C01	19.04	8	2.38	44321	Loss	iPattern
37 | 1	25598276	25642596	1	Sample445.1_79R03C01	23.29	8	2.91125	44321	Loss	iPattern
38 | 1	25598276	25642596	1	Sample522.1_78R01C01	19.71	8	2.46375	44321	Loss	iPattern
39 | 1	25598276	25642596	1	Sample331.1_35R05C01	25.97	8	3.24625	44321	Loss	iPattern
40 | 1	25598276	25642596	1	Sample560.1_78R02C01	22.22	8	2.7775	44321	Loss	iPattern
41 | 1	25598276	25642596	1	Sample392.1_35R02C01	12.07	8	1.50875	44321	Loss	iPattern
42 | 1	25598276	25642596	1	Sample501.1_75R01C01	11.73	8	1.46625	44321	Loss	iPattern
43 | 1	25598276	25642596	1	Sample247.1_67R01C01	22.09	8	2.76125	44321	Loss	iPattern
44 | 1	25598276	25642596	1	Sample439.1_55R05C01	16.85	8	2.10625	44321	Loss	iPattern
45 | 1	25598276	25642596	1	Sample131.1_5R05C01	17.91	8	2.23875	44321	Loss	iPattern
46 | 1	25598276	25642596	1	Sample72.1_8R04C01	18.18	8	2.2725	44321	Loss	iPattern
47 | 1	25598276	25642596	1	Sample36.1_9R08C01	18.32	8	2.29	44321	Loss	iPattern
48 | 1	25598276	25642596	1	Sample724.1_12R04C01	24.24	8	3.03	44321	Loss	iPattern
49 | 1	25598276	25642596	1	Sample748.1_26R05C01	18.17	8	2.27125	44321	Loss	iPattern
50 | 1	25598276	25642596	1	Sample516.1_37R02C01	23.24	8	2.905	44321	Loss	iPattern
51 | 1	25598276	25642596	1	Sample459.1_28R06C01	21.96	8	2.745	44321	Loss	iPattern
52 | 1	25598276	25642596	1	Sample533.1_39R08C01	32.24	8	4.03	44321	Loss	iPattern
53 | 1	25598276	25642596	1	Sample265.1_16R03C01	18.96	8	2.37	44321	Loss	iPattern
54 | 1	25598276	25642596	1	Sample654.1_23R06C01	21.66	8	2.7075	44321	Loss	iPattern
55 | 1	25598276	25642596	1	Sample737.1_9R06C01	23.52	8	2.94	44321	Loss	iPattern
56 | 1	25598276	25642596	1	Sample611.1_4R05C01	27.92	8	3.49	44321	Loss	iPattern
57 | 1	25598276	25642596	1	SampleY-16.1_17R04C01	27.15	8	3.39375	44321	Loss	iPattern
58 | 1	25598276	25642596	1	Sample569.1_41R04C01	22.89	8	2.86125	44321	Loss	iPattern
59 | 1	25598276	25642596	1	Sample751.1_10R04C01	22.17	8	2.77125	44321	Loss	iPattern
60 | 1	25598276	25642596	1	SampleY-53.1_21R08C01	13.27	8	1.65875	44321	Loss	iPattern
61 | 1	25598276	25642596	1	Sample29.1_11R03C01	25.16	8	3.145	44321	Loss	iPattern
62 | 1	25598276	25642596	1	Sample694.1_19R07C01	25.76	8	3.22	44321	Loss	iPattern
63 | 1	25598276	25642596	1	Sample13.1_12R02C01	30.73	8	3.84125	44321	Loss	iPattern
64 | 1	25598276	25642596	1	SampleY-59.1_28R08C01	22.65	8	2.83125	44321	Loss	iPattern
65 | 1	25598276	25642596	1	Sample141.1_21R02C01	21.73	8	2.71625	44321	Loss	iPattern
66 | 1	25598276	25642596	1	SampleY-47.1_4R04C01	27.89	8	3.48625	44321	Loss	iPattern
67 | 1	25598276	25642596	1	Sample697.1_24R05C01	25.43	8	3.17875	44321	Loss	iPattern
68 | 1	25598276	25642596	1	Sample400.1_9R01C01	25.01	8	3.12625	44321	Loss	iPattern
69 | 1	25598276	25642596	1	Sample255.1_28R01C01	21.14	8	2.6425	44321	Loss	iPattern
70 | 1	25598276	25642596	1	Samplec97.1_10R06C01	16.92	8	2.115	44321	Loss	iPattern
71 | 1	25598276	25642596	1	Sample296.1_30R07C01	16.7	8	2.0875	44321	Loss	iPattern
72 | 1	25598276	25642596	1	SampleY-58.1_3R08C01	14.27	8	1.78375	44321	Loss	iPattern
73 | 1	25598276	25642596	1	Sample371.1_31R02C01	19.71	8	2.46375	44321	Loss	iPattern
74 | 1	25598276	25642596	1	SampleY-68.1_27R01C01	12.81	8	1.60125	44321	Loss	iPattern
75 | 1	25598276	25642596	1	Sample580.1_43R05C01	37.41	8	4.67625	44321	Loss	iPattern
76 | 1	25598276	25642596	1	Sample15.1_71R04C01	17.87	8	2.23375	44321	Loss	iPattern
77 | 1	25598276	25642596	1	Sample110.1_62R05C01	25.08	8	3.135	44321	Loss	iPattern
78 | 1	25598276	25642596	1	Sample192.1_63R04C01	17.28	8	2.16	44321	Loss	iPattern
79 | 1	25598276	25642596	1	Sample379.1_34R03C01	22.39	8	2.79875	44321	Loss	iPattern
80 | 1	25598276	25642596	1	Sample157.1_63R08C01	20.82	8	2.6025	44321	Loss	iPattern
81 | 1	25598276	25642596	1	Samplec39.1_59R05C01	27.17	8	3.39625	44321	Loss	iPattern
82 | 1	25598276	25642596	1	Samplec14.1_59R07C01	21.63	8	2.70375	44321	Loss	iPattern
83 | 1	25598276	25642596	1	Sample252.1_33R05C01	19.32	8	2.415	44321	Loss	iPattern
84 | 1	25598276	25642596	1	Samplec44.1_65R04C01	15.49	8	1.93625	44321	Loss	iPattern
85 | 1	25598276	25642596	1	Sample285.1_67R04C01	20.95	8	2.61875	44321	Loss	iPattern
86 | 1	25598276	25669467	1	Sample21.1_6R08C01	14.7	10	1.47	71192	Loss	iPattern
87 | 1	25598276	25669467	1	Sample334.1_16R07C01	13.82	10	1.382	71192	Loss	iPattern
88 | 1	25598276	25669467	1	Sample38.1_71R08C01	13.88	10	1.388	71192	Loss	iPattern
89 | 1	25598276	25669467	1	Sample303.1_33R07C01	16.36	10	1.636	71192	Loss	iPattern
90 | 1	25598276	25669467	1	Sample422.1_52R08C01	16.44	10	1.644	71192	Loss	iPattern
91 | 1	25598276	25669467	1	Sample404.1_24R08C01	12.77	10	1.277	71192	Loss	iPattern
92 | 1	25598276	25669467	1	Sample345.1_36R03C01	17.46	10	1.746	71192	Loss	iPattern
93 | 1	25598276	25669467	1	Sample171.1_78R07C01	15.15	10	1.515	71192	Loss	iPattern
94 | 1	25598276	25669467	1	Sample424.1_53R08C01	11.77	10	1.177	71192	Loss	iPattern
95 | 1	25598276	25669467	1	Sample526.1_57R04C01	23.21	10	2.321	71192	Loss	iPattern
96 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/step.3.check.and.resubmit.jobs.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## NOTE: The scripts embraced by "##<<<... ##>>>..." need to be specified based on your system
  4 | 
  5 | suppressMessages(require(optparse))
  6 | 
  7 | option_list = list(
  8 |   make_option(c("-p", "--datapath"), action = "store", type = "character", default = NA,
  9 |               help = "Path to the directory containing necessary input data."),
 10 |   make_option(c("-o", "--resultpath"), action = "store", type = "character", default = NA,
 11 |               help = "Path to the directory for saving results."),
 12 |   make_option(c("-m", "--matrixpath"), action = "store", type = "character", default = NA,
 13 |               help = "Path to chromosome-wise LRR and BAF matrices."),
 14 |   make_option(c("-s", "--sourcefile"), action = "store", type = "character", default = NA,
 15 |               help = "Path to the scripts directory containing R scripts to be loaded into R."),
 16 |   make_option(c("-d", "--duplicates"), action = "store_true", default = FALSE,
 17 |               help = "[optional] Whether duplicate pairs information will be annotated in diagnosis plots."),
 18 |   make_option(c("-n", "--plot"), action = "store_true", default = FALSE,
 19 |               help = "[optional] Whether to generate diagnosis plots."),
 20 |   make_option(c("-r", "--script"), action = "store", type = "character", default = NA,
 21 |               help = "Path to the main script CNV.genotype.one.chr.one.batch.R."),
 22 |   make_option(c("-l", "--joblog"), action = "store", type = "character", default = NA,
 23 |               help = "Path to the directory saving job logs."),
 24 |   make_option(c("-f", "--flag"), action = "store", type = "integer", default = NA,
 25 |               help = "0: only print the running status of CNV genotyping; 1: resubmit jobs for unfinished CNV genotyping")
 26 | )
 27 | 
 28 | opt = parse_args(OptionParser(option_list = option_list))
 29 | pars = c(opt$type, opt$datapath, opt$resultpath, opt$joblog,
 30 |          opt$matrixpath, opt$sourcefile, opt$script, opt$flag)
 31 | 
 32 | if ( any(is.na(pars)) ) {
 33 |   stop("All parameters must be supplied. (--help for detail)")
 34 | }
 35 | 
 36 | flag <- as.integer( opt$flag )  ## 0 or 1
 37 | 
 38 | # resubmit unfinished jobs
 39 | file_cnvr <- "cnvr_batch.txt"  ## with batch information
 40 | dt_cnvr_raw <- read.delim(file = file.path(opt$datapath, file_cnvr), as.is = TRUE)
 41 | dt_cnvr_raw <- dt_cnvr_raw[order(dt_cnvr_raw$chr, dt_cnvr_raw$batch), ]
 42 | # add fname column
 43 | dt_cnvr_raw$fname <- paste0(dt_cnvr_raw$CNVR_ID, "_pred.rds") 
 44 | 
 45 | tbl_raw <- table(dt_cnvr_raw$chr, dt_cnvr_raw$batch)
 46 | dt_freq_raw <- as.data.frame(tbl_raw)
 47 | names(dt_freq_raw) <- c("chr", "batch", "Freq")
 48 | 
 49 | dt_freq_raw <- subset(dt_freq_raw, Freq != 0)  ## subset non-null batch
 50 | dt_freq_raw <- dt_freq_raw[order(dt_freq_raw$chr, dt_freq_raw$batch), ]
 51 | 
 52 | path_main_pred <- file.path(opt$resultpath, "pred")
 53 | path_main_failed <- file.path(opt$resultpath, "cnvrs_error")
 54 | 
 55 | # create script
 56 | script <- file.path(opt$script, "CNV.genotype.one.chr.one.batch.R")
 57 | cmd    <- paste("Rscript", script, 
 58 |                 "--datapath", opt$datapath,
 59 |                 "--resultpath", opt$resultpath,
 60 |                 "--matrixpath", opt$matrixpath,
 61 |                 "--sourcefile", opt$sourcefile)
 62 | 
 63 | if ( opt$duplicates ) cmd <- paste(cmd, "--duplicates")
 64 | if ( opt$plot ) cmd <- paste(cmd, "--plot")
 65 | path_joblog <- opt$joblog
 66 | 
 67 | # check if CNV genotyping for all CNVRs is finished ----------------------------------
 68 | check_jobs <- function(path_main, dt_cnvr_raw, flag, path_main_failed, path_joblog) {
 69 |   
 70 |   path_job_error <- file.path(path_joblog, "job", "ERROR")
 71 |   path_job_out   <- file.path(path_joblog, "job", "OUT")
 72 |   
 73 |   # remove all previous results
 74 |   system( paste("rm -rf", path_main_failed) )
 75 |   
 76 |   tbl_raw <- table(dt_cnvr_raw$chr, dt_cnvr_raw$batch)
 77 |   dt_freq_raw <- as.data.frame(tbl_raw)
 78 |   names(dt_freq_raw) <- c("chr", "batch", "Freq")
 79 |   
 80 |   dt_freq_raw <- subset(dt_freq_raw, Freq != 0)
 81 |   dt_freq_raw <- dt_freq_raw[order(dt_freq_raw$chr, dt_freq_raw$batch), ]
 82 |   
 83 |   for (i in 1:nrow(dt_freq_raw)) {
 84 |     
 85 |     chr1 <- dt_freq_raw$chr[i]
 86 |     batch1 <- dt_freq_raw$batch[i]
 87 |     freq1 <- dt_freq_raw$Freq[i]
 88 |     
 89 |     foldername1 <- paste0("chr_", chr1, "_batch_", batch1)
 90 |     path1 <- file.path(path_main, foldername1)
 91 |     
 92 |     if ( !dir.exists(paths = path1) ) { 
 93 |       cat("CHR:", chr1, "BATCH:", batch1, "The whole batch failed and jobs will be resubmitted.\n")
 94 |       
 95 |       # submit jobs
 96 |       if (flag == 1) {
 97 |         
 98 |         cmd1 = paste(cmd, "--chr", chr1, "--batch", batch1, "--type", 0)
 99 | 
100 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
101 | ## configure based on your system
102 |         bsub.cmd = paste("bsub -n 2 -W 10:00 -R 'rusage[mem=20000]' -P <account>",
103 |                          "-e", file.path(path_job_error, paste0("chr_", chr1, "_batch_", batch1, ".e")), 
104 |                          "-o", file.path(path_job_out, paste0("chr_", chr1, "_batch_", batch1, ".o")),
105 |                          "-q premium", shQuote(cmd1))
106 |         cat(bsub.cmd, "\n")
107 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
108 |         
109 |         system(bsub.cmd)
110 |       }
111 |       
112 |     } else {
113 |       ## the results folder for the current batch exists
114 |       files <- list.files(path = path1)
115 |       dt1   <- subset(dt_cnvr_raw, chr == chr1 & batch == batch1)
116 |       dt1.failed <- subset(dt1, !fname %in% files)
117 |       
118 |       if (nrow(dt1.failed) == 0) {
119 |         cat("CHR:", chr1, "BATCH:", batch1, "TOTAL:", freq1, "SUCCEED!\n")
120 |       
121 |       } else {  
122 |         cat("CHR:", chr1, "BATCH:", batch1, "TOTAL:", freq1, "FAILED:", nrow(dt1.failed), "\n")
123 |         
124 |         if ( !dir.exists(paths = path_main_failed) ) {
125 |           dir.create(path = path_main_failed, showWarnings = F, recursive = T)
126 |         }
127 |         
128 |         fname.failed <- paste0("cnvrs_error_chr_", chr1, "_batch_", batch1, ".txt")
129 |         write.table(data.frame(CNVR_ID = dt1.failed$CNVR_ID, stringsAsFactors = F),
130 |                     file = file.path(path_main_failed, paste0("cnvrs_error_chr_", chr1, "_batch_", batch1, ".txt")),
131 |                     col.names = T, row.names = F, quote = F)
132 |         
133 |         if (flag == 1) {
134 |           cmd1 = paste(cmd, "--chr", chr1, "--batch", batch1, "--type", 1)
135 |           
136 | ##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
137 | ## configure based on your system
138 |           bsub.cmd = paste("bsub -n 2 -W 10:00 -R 'rusage[mem=20000]' -P <account>",
139 |                            "-e", file.path(path_job_error, paste0("chr_", chr1, "_batch_", batch1, ".e")), 
140 |                            "-o", file.path(path_job_out, paste0("chr_", chr1, "_batch_", batch1, ".o")),
141 |                            "-q premium", shQuote(cmd1))
142 | ##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
143 |           
144 |           cat(bsub.cmd, "\n")
145 |           system(bsub.cmd)
146 |         }
147 |       }
148 |     }
149 |   }
150 | }
151 | 
# main running function --------------------------------------------
## arguments are passed in the order of the check_jobs() signature:
## path_main, dt_cnvr_raw, flag, path_main_failed, path_joblog
check_jobs(path_main_pred, dt_cnvr_raw, flag, path_main_failed, path_joblog)
158 | 
159 | 
160 | 
161 | 


--------------------------------------------------------------------------------
/01_initial_call/run_PennCNV/README.md:
--------------------------------------------------------------------------------
  1 | ## PennCNV
  2 | 
  3 | ### Installation
  4 | 
  5 | To download and install PennCNV, please follow the detailed instructions (including troubleshooting) on the [installation page](http://penncnv.openbioinformatics.org/en/latest/user-guide/install/). For more information about PennCNV, please refer to the original [PennCNV website](http://penncnv.openbioinformatics.org/en/latest/).
  6 | 
  7 | After installation, set up environment variable PENNCNV: `export PENNCNV='/path/to/penncnv'`
  8 | 
  9 | ### Analysis workflow
 10 | 
 11 | Note: 
 12 | 
 13 | - PennCNV was originally designed to analyze samples sequentially, one sample at a time. Please refer to the [PennCNV website](http://penncnv.openbioinformatics.org/en/latest/) for how to run PennCNV sequentially. Here, we provide scripts to run the analysis on multiple samples in parallel via a job submission system (one sample per job) in a cluster environment. 
 14 | 
 15 | - In the following steps (2) and (3), the scripts regarding job submission embraced by "##<<<... ##>>>..." in the scripts need to be specified by the users based on the system the users are using.
 16 | 
 17 | We run PennCNV analysis with the following 5 steps:
 18 | 
 19 | #### (1) Prepare SNP.pfb and SNP.gcmodel files
 20 | 
 21 | #### (1.1) compile pfb (population frequency of B allele) file
 22 | ```sh
 23 | perl ${PENNCNV}/bin/compile_pfb.pl \
 24 | -snpposfile ${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/SNP_pos.txt \
 25 | -listfile ${WKDIR}/01_initial_call/run_PennCNV/data_aux/list_pfb.txt \
 26 | -output ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb
 27 | ```
 28 | 
 29 | Note:
 30 | 
 31 |   - For more information about pfb file, please refer to the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/input/#pfb-population-frequency-of-b-allele-file).
 32 | 
 33 |   - `SNP_pos.txt`: generated by `finalreport_to_matrix_LRR_and_BAF.pl` in the [initial step](https://github.com/HaoKeLab/ensembleCNV#prepare-chromosome-wise-lrr-and-baf-matrices-for-cnv-genotyping).
 34 | 
 35 |   - `list_pfb.txt`: the users need to prepare a text file that contains a list of full path to signal files in `{WKDIR}/01_initial_call/run_PennCNV/data` generated by `finalreport_to_PennCNV.pl` in the [initial step](https://github.com/HaoKeLab/ensembleCNV#prepare-data-for-individual-cnv-callers), one per line. The pfb file compiled from only a few samples is not valid -- at least about 100 samples (e.g. one 96-well plate of samples) are needed. Based on our experience, if the sample size of the projects is very large, the users do not need to use signal files from all samples. Instead, a subset of 300 to 500 samples from unrelated subjects are good enough to estimate PFB (population frequency of B allele) for the project. Please put the prepared `list_pfb.txt` in the directory `${WKDIR}/01_initial_call/run_PennCNV/data_aux`.
 36 | 
 37 |   - The `SNP.pfb` will not only be used by PennCNV but also employed by ensembleCNV for [CNV genotyping](https://github.com/HaoKeLab/ensembleCNV#4-cnv-genotyping-for-each-cnvr). 
 38 | 
 39 | 
 40 | #### (1.2) compile gcmodel file for GC content adjustment
 41 | 
 42 | ```sh
 43 | perl ${PENNCNV}/bin/cal_gc_snp.pl \
 44 | ${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base_hg19.txt.sorted \
 45 | ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb \
 46 | -output ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel
 47 | ```
 48 | 
 49 | Note: 
 50 | 
 51 | - For more information about gcmodel file, please refer to the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/input/#gcmodel-file).
 52 | 
 53 | - The `gc5Base_hg19.txt.sorted` file (taking hg19 as an example) is generated from the UCSC Genome Browser annotation file (http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gc5Base.txt.gz). Despite the file name, it actually contains GC content per 5120bp. If you need a GC annotation file for another genome assembly, please download it from the corresponding directory. After downloading `gc5Base.txt.gz` (put it in `${WKDIR}/01_initial_call/run_PennCNV/data_aux`) and unzipping it, sort the file by chromosome and position:
 54 | ```sh
 55 | sort -k 2,2 -k 3,3n \
 56 | <${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base.txt \
 57 | >${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base_hg19.txt.sorted
 58 | ```
 59 | 
 60 | #### (2) Run PennCNV for each sample in parallel (through job submitting system on cluster)
 61 | 
 62 | Note: 
 63 | 
 64 | - In `step.2.run.PennCNV.jobs.R`, the scripts regarding job submission embraced by "##<<<... ##>>>..." need to be specified based on your system.
 65 | 
 66 | - For more information about CNV calling by PennCNV, please refer to the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/test/).
 67 | 
 68 | ```sh 
 69 | Rscript ${WKDIR}/01_initial_call/run_PennCNV/step.2.run.PennCNV.jobs.R \
 70 | --penncnv ${PENNCNV} \                                  ## direct to ${PENNCNV}/bin/detect_cnv.pl
 71 | --data ${WKDIR}/01_initial_call/run_PennCNV/data \      ## generated with finalreport_to_PennCNV.pl
 72 | --wkdir ${WKDIR}/01_initial_call/run_PennCNV/results \  ## output directory
 73 | --pfb ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb \
 74 | --gcmodel ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel \
 75 | --hmm ${PENNCNV}/lib/hhall.hmm
 76 | ```
 77 | 
 78 | When the analysis is completed, there will be subfolders named after sample IDs, each for one sample respectively, created in the directory `${WKDIR}/01_initial_call/run_PennCNV/results/res`. Within each sample subfolders, two files will be generated:
 79 | - `<Sample_ID>.log`: log file generated by `detect_cnv.pl`; information from it will be retrieved to generate sample-level summary statistics (see step (5) below). 
 80 | - `<Sample_ID>.rawcnv`: raw CNV calls made by `detect_cnv.pl`.
 81 | 
 82 | #### (3) Check job status and resubmit failed jobs
 83 | 
 84 | Note: In `step.3.check.PennCNV.jobs.R`, the scripts regarding job submission embraced by "##<<<... ##>>>..." need to be specified based on your system.
 85 | 
 86 | ```sh
 87 | Rscript ${WKDIR}/01_initial_call/run_PennCNV/step.3.check.PennCNV.jobs.R \
 88 | --penncnv ${PENNCNV} \                                  ## direct to ${PENNCNV}/bin/detect_cnv.pl
 89 | --data ${WKDIR}/01_initial_call/run_PennCNV/data/ \     ## generated with finalreport_to_PennCNV.pl
 90 | --wkdir ${WKDIR}/01_initial_call/run_PennCNV/results \  ## output directory
 91 | --pfb ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb \
 92 | --gcmodel ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel \
 93 | --hmm ${PENNCNV}/lib/hhall.hmm
 94 | ```
 95 | This step checks if the jobs submitted for each sample in step (2) are successfully completed and resubmits failed jobs if there is any.
 96 | 
 97 | 
 98 | #### (4) Combine PennCNV results (.rawcnv and .log files) from each sample
 99 | ```sh
100 | perl ${WKDIR}/01_initial_call/run_PennCNV/step.4.combine.PennCNV.res.pl \
101 | --in_dir ${WKDIR}/01_initial_call/run_PennCNV/results/res \
102 | --out_dir ${WKDIR}/01_initial_call/run_PennCNV/results
103 | ```
104 | This script screens `.log` and `.rawcnv` files for all samples generated in steps (2) and (3), and combines them. When this step is completed, there will be two files generated in the directory `${WKDIR}/01_initial_call/run_PennCNV/results`:
105 | 
106 | - `CNV.PennCNV.log`: combined log file of the `.log` files from all samples.
107 | 
108 | - `CNV.PennCNV.rawcnv`: combined raw CNV calls of the `.rawcnv` files from all samples.
109 | 
110 | #### (5) Merge closely adjacent CNVs and generate final results
111 | ```sh
112 | Rscript ${WKDIR}/01_initial_call/run_PennCNV/step.5.clean.PennCNV.res.R \
113 | --penncnv ${PENNCNV} \                                  ## direct to installation directory ${PENNCNV}
114 | --input ${WKDIR}/01_initial_call/run_PennCNV/results \
115 | --pfb ${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb
116 | ```
117 | 
118 | This script is a wrapper to run three perl scripts in PennCNV package:
119 | 
120 | - `${PENNCNV}/bin/clean_cnv.pl`: merge adjacent CNVs which are close to each other. Please refer to the [page](http://penncnv.openbioinformatics.org/en/latest/user-guide/annotation/#merging-adjacent-cnv-calls) for details.
121 | 
122 | - `${PENNCNV}/bin/convert_cnv.pl`: convert the CNV calls in `.rawcnv` format to the tab-delimited table.
123 | 
124 | - `${PENNCNV}/bin/filter_cnv.pl`: extract sample-level summary statistics from log file.
125 | 
126 | When the analysis is completed, you will find two files, which will be used by ensembleCNV, in the directory `${WKDIR}/01_initial_call/run_PennCNV/results`:
127 | 
128 | - `CNV.PennCNV_new.txt`: CNV calls of all samples.
129 | - `CNV.PennCNV_qc_new.txt`: sample-level summary statistics.
130 | 
131 | 


--------------------------------------------------------------------------------
/06_performance_assessment/step.1.performance.assessment.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressMessages(library(optparse))
  4 | suppressMessages(library(ggplot2))
  5 | suppressMessages(library(cowplot))
  6 | 
  7 | option_list <- list(
  8 |   make_option(c("-d", "--duplicates"), action = "store", default = NA, type = "character",
  9 |               help = "Path to duplicate pairs information."),
 10 |   make_option(c("-n", "--matrixCN"), action = "store", default = NA,type = "character",
 11 |               help = "Path to matrix of copy number (CN)"),
 12 |   make_option(c("-g", "--matrixGQ"), action = "store", default = NA,type = "character",
 13 |               help = "Path to matrix of genotyping quality (GQ) score."),
 14 |   make_option(c("-o", "--resultpath"), action = "store", default = NA,type = "character",
 15 |               help = "Path to directory for saving assessment results.")
 16 | )
 17 | 
 18 | opt <- parse_args(OptionParser(option_list = option_list))
 19 | pars <- c(opt$duplicates, opt$matrixCN, opt$matrixGQ, opt$resultpath)
 20 | 
 21 | if (any(is.na(pars))) {
 22 |   stop("All required parameters must be supplied. (--help for detail)")
 23 | }
 24 | 
 25 | file_duplicates <- opt$duplicates
 26 | file_matrixcn   <- opt$matrixCN
 27 | file_matrixgq   <- opt$matrixGQ
 28 | path_result     <- opt$resultpath
 29 | 
 30 | dup_pairs <- read.delim(file = file_duplicates, as.is = TRUE)
 31 | matrix_CN <- readRDS(file = file_matrixcn)
 32 | matrix_gq <- readRDS(file = file_matrixgq)
 33 | 
 34 | # functions ---------------------------------------------------------------
 35 | 
 36 | generate_results <- function(mat, dup_pairs, gq1) {
 37 |   
 38 |   # clean cnvr with all CN = 2 or missing (denoted as -9)
 39 |   mat <- mat
 40 |   n_cnvr_raw <- nrow(mat)
 41 |   n_sample <- ncol(mat)
 42 |   
 43 |   # filter CNVR
 44 |   freqs <- unlist(lapply(1:n_cnvr_raw, FUN = function(k) {
 45 |     v1 <- as.vector(mat[k, ])
 46 |     freq1 <- sum(v1 %in% c(0, 1, 3))
 47 |     freq1
 48 |   }))
 49 |   
 50 |   idxs_del <- which(freqs == 0)
 51 |   mat1 <- mat
 52 |   if (length(idxs_del) >= 1) {
 53 |     mat1 <- mat[-idxs_del, ]
 54 |   }
 55 |   n_cnvr <- nrow(mat1)
 56 |   cat("After cleaning CNVRs with no CNV calls,", n_cnvr, "CNVRs remains from", n_cnvr_raw, "CNVRs.\n")
 57 |   mat_clean <- mat1  ## after cleaning nocall CNVR_ID
 58 |   
 59 |   ## sample level callRate
 60 |   freq_sample = unlist(lapply(1:n_sample, FUN = function(k) {
 61 |     v1 = as.vector( mat_clean[, k] )
 62 |     sum(v1 %in% c(0, 1, 2, 3))
 63 |   }))
 64 |   callRate_sample = freq_sample/n_cnvr
 65 |   ## CNVR level callRate
 66 |   freq_cnvr = unlist(lapply(1:n_cnvr, FUN = function(k) {
 67 |     v1 = as.vector( mat_clean[k, ] )
 68 |     sum(v1 %in% c(0, 1, 2, 3))
 69 |   }))
 70 |   callRate_cnvr = freq_cnvr/n_sample
 71 |   
 72 |   dat_callRate_cnvr = data.frame(callRate_cnvr = callRate_cnvr, 
 73 |                                  cutoff_gq = gq1, 
 74 |                                  stringsAsFactors = FALSE)
 75 |   dat_callRate_sample = data.frame(callRate_sample = callRate_sample,
 76 |                                    cutoff_gq = gq1, 
 77 |                                    stringsAsFactors = FALSE)
 78 |   
 79 |   ## consistency rate
 80 |   consistency_rates <- c()
 81 |   for (i in 1:nrow(dup_pairs)) {
 82 |     
 83 |     sample1 <- dup_pairs$sample1.name[i]
 84 |     sample2 <- dup_pairs$sample2.name[i]
 85 |     
 86 |     cns1 <- as.vector(mat1[, sample1])
 87 |     cns2 <- as.vector(mat1[, sample2])   # copy number of sample2
 88 |     
 89 |     # filter nocall cnvr
 90 |     idxs <- union(which(cns1 == -9), which(cns2 == -9))
 91 |     if(length(idxs) >= 1) {
 92 |       cns1 <- cns1[-idxs]
 93 |       cns2 <- cns2[-idxs]
 94 |     }
 95 |     
 96 |     idxs_overlap <- which(cns1 != 2 & cns2 != 2 & cns1 == cns2)
 97 |     idxs_union   <- union(which(cns1 != 2), which(cns2 != 2))
 98 |     
 99 |     rate1 <- length(idxs_overlap)/length(idxs_union)
100 |     consistency_rates  <- c(consistency_rates, rate1)
101 |   }
102 |   
103 |   res_consistency <- data.frame(consistency_rate = consistency_rates,
104 |                                 sample1.name = dup_pairs$sample1.name,
105 |                                 sample2.name = dup_pairs$sample2.name,
106 |                                 n_cnvr = n_cnvr,
107 |                                 cutoff_gq = gq1,
108 |                                 stringsAsFactors = FALSE)
109 |   
110 |   res_ncnvr = data.frame(cutoff_gq = gq1, n_cnvr = n_cnvr, 
111 |                          stringsAsFactors = FALSE)
112 |   ## return list results
113 |   return(list(
114 |     res_consistency = res_consistency,
115 |     res_callRate_cnvr = dat_callRate_cnvr,
116 |     res_callRate_sample = dat_callRate_sample,
117 |     res_ncnvr = res_ncnvr
118 |   ))
119 | }
120 | 
121 | # Summarise re-genotyping performance across a set of GQ-score cutoffs.
122 | #
123 | # mat_CN:     copy-number matrix; entries whose GQ score is below the current
124 | #             cutoff are set to -9 before generate_results() is called.
125 | # mat_gq:     GQ scores, indexed element-wise like mat_CN (assumed to have
126 | #             the same dimensions -- TODO confirm against the caller).
127 | # cutoffs_gq: numeric vector of GQ thresholds; also used as factor levels.
128 | # dup_pairs:  duplicate-pair table, forwarded unchanged to generate_results().
129 | #
130 | # Returns a list with res_consistency, res_callRate_cnvr, res_callRate_sample
131 | # and res_ncnvr, each stacked over all cutoffs with a `cutoff_gq` factor column.
132 | summary_regenotype <- function(mat_CN, mat_gq, cutoffs_gq, dup_pairs) {
133 | 
134 |   # Mask a fresh copy of mat_CN for every cutoff, so the result for one
135 |   # threshold never depends on the thresholds processed before it (the
136 |   # cutoffs no longer have to be supplied in ascending order).
137 |   res_all <- lapply(seq_along(cutoffs_gq), function(i) {
138 |     gq1 <- cutoffs_gq[i]
139 |     cat("gq_score:", gq1, "\n")
140 |     mat_masked <- mat_CN
141 |     mat_masked[which(mat_gq < gq1)] <- -9
142 |     generate_results(mat = mat_masked, dup_pairs = dup_pairs, gq1 = gq1)
143 |   })
144 | 
145 |   # Bind one named component over all cutoffs in a single rbind (instead of
146 |   # growing a data frame per iteration) and make cutoff_gq a factor.
147 |   stack_component <- function(name) {
148 |     parts <- lapply(res_all, function(res_gq1) res_gq1[[name]])
149 |     out <- do.call(rbind, c(list(data.frame()), parts))
150 |     out$cutoff_gq <- factor(out$cutoff_gq, levels = cutoffs_gq)
151 |     out
152 |   }
153 | 
154 |   list(
155 |     res_consistency = stack_component("res_consistency"),
156 |     res_callRate_cnvr = stack_component("res_callRate_cnvr"),
157 |     res_callRate_sample = stack_component("res_callRate_sample"),
158 |     res_ncnvr = stack_component("res_ncnvr")
159 |   )
160 | }
161 | 
162 | # main --------------------------------------------------------------------
163 | 
164 | # GQ-score thresholds to evaluate; their order is also the x-axis order,
165 | # because summary_regenotype() uses them as the factor levels of cutoff_gq.
166 | cutoffs_gq <- c(0, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80)
167 | 
168 | res <- summary_regenotype(mat_CN = matrix_CN,
169 |                           mat_gq = matrix_gq,
170 |                           cutoffs_gq = cutoffs_gq,
171 |                           dup_pairs = dup_pairs)
172 | 
173 | res_consistency     <- res$res_consistency
174 | res_ncnvr           <- res$res_ncnvr
175 | res_callRate_sample <- res$res_callRate_sample
176 | res_callRate_cnvr   <- res$res_callRate_cnvr
177 | 
178 | # Persist the full summary list next to the figure written below.
179 | saveRDS(res, file = file.path(path_result, "performance_assessment.rds"))
180 | 
181 | # start plot --------------------------------------------------------------
182 | 
183 | # Theme shared by all four panels, defined once so the text sizes stay in sync.
184 | theme_panel <- theme_bw() +
185 |   theme(panel.background = element_blank(),
186 |         plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
187 |         axis.title.x = element_text(size = 18),
188 |         axis.title.y = element_text(size = 18),
189 |         axis.text = element_text(size = 15),
190 |         strip.text = element_text(size = 15))
191 | 
192 | # Dashed grey reference line at 0.9, drawn on the three rate panels.
193 | ref_line <- geom_hline(yintercept = 0.9, lty = 2, lwd = 1, col = "grey60")
194 | 
195 | # A: per-pair consistency rate at each GQ threshold.
196 | p1 <- ggplot(data = res_consistency, aes(cutoff_gq, consistency_rate)) +
197 |   geom_boxplot() +
198 |   ref_line +
199 |   theme_panel +
200 |   xlab("GQ score threshold") +
201 |   ylab("Concordance rate") +
202 |   ggtitle("Concordance rate")  # title typo ("Concrodance") fixed
203 | 
204 | # B: number of CNVRs reported at each GQ threshold.
205 | p2 <- ggplot(data = res_ncnvr, aes(cutoff_gq, n_cnvr)) +
206 |   geom_col() +  # same as geom_bar(stat = "identity")
207 |   theme_panel +
208 |   xlab("GQ score threshold") +
209 |   ylab("Number of CNVRs") +
210 |   ggtitle("Number of CNVRs")
211 | 
212 | # C: distribution of per-sample call rates.
213 | p3 <- ggplot(data = res_callRate_sample, aes(cutoff_gq, callRate_sample)) +
214 |   geom_boxplot() +
215 |   ref_line +
216 |   theme_panel +
217 |   xlab("GQ score threshold") +
218 |   ylab("Sample-wise call rate") +
219 |   ggtitle("Sample-wise call rate")
220 | 
221 | # D: distribution of per-CNVR call rates.
222 | p4 <- ggplot(data = res_callRate_cnvr, aes(cutoff_gq, callRate_cnvr)) +
223 |   geom_boxplot() +
224 |   ref_line +
225 |   theme_panel +
226 |   xlab("GQ score threshold") +
227 |   ylab("CNVR-wise call rate") +
228 |   ggtitle("CNVR-wise call rate")
229 | 
230 | # Arrange the panels in a 2 x 2 grid labelled A-D and write a 12 x 12 inch PNG.
231 | png(filename = file.path(path_result, "performance_assessment.png"),
232 |     width = 12, height = 12, units = "in", res = 512)
233 | 
234 | p <- plot_grid(p1, p2, p3, p4,
235 |                nrow = 2, ncol = 2,
236 |                labels = LETTERS[1:4],
237 |                label_size = 22,
238 |                vjust = 1.2, align = "hv")
239 | print(p)
240 | 
241 | dev.off()


--------------------------------------------------------------------------------
/example/example_create_CNVR/data/iPattern_all_calls.txt:
--------------------------------------------------------------------------------
 1 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.07	Sample86.1_3R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
 2 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	16.2	Sample6.1_4R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
 3 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.25	Sample143.1_5R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
 4 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	27	Sample62.1_7R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
 5 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	16.59	Sample35.1_9R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
 6 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	17.84	Sample60.1_9R03C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
 7 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	26.89	Samplec9.1_38R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
 8 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	27.79	Sample289.1_16R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
 9 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	14.83	Sample651.1_3R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
10 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	29.96	Sample496.1_40R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
11 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	17.4	Sample130.1_18R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
12 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	20.28	Sample217.1_18R08C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
13 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	22.59	Sample242.1_20R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
14 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	33.39	Sample520.1_41R08C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
15 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	31.81	Sample66.1_12R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
16 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.54	Sample177.1_21R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
17 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	20.9	Sample160.1_22R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
18 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.55	Samplec119.1_18R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
19 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	23.52	Sample138.1_23R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
20 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	17.26	Sample390.1_24R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
21 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	20.91	SampleY-37.1_11R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
22 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	26.63	Sample212.1_25R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
23 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	18.67	Sample686.1_9R03C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
24 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.34	Sample668.1_14R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
25 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.32	Sample256.1_28R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
26 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	32.4	Sample200.1_29R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
27 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	20.15	SampleY-70.1_4R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
28 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	17.06	Sample235.1_30R03C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
29 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	21.45	Sample283.1_30R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
30 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.78	SampleY-41.1_1R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
31 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	26.36	Sample556.1_43R03C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
32 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	28.9	Sample592.1_43R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
33 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	17.92	Sample527.1_44R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
34 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.96	Sample550.1_46R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
35 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.04	Sample433.1_48R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
36 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	23.29	Sample445.1_79R03C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
37 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.71	Sample522.1_78R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
38 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.97	Sample331.1_35R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
39 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	22.22	Sample560.1_78R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
40 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	12.07	Sample392.1_35R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
41 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	11.73	Sample501.1_75R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
42 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	22.09	Sample247.1_67R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
43 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	16.85	Sample439.1_55R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
44 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	17.91	Sample131.1_5R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
45 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	18.18	Sample72.1_8R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
46 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	18.32	Sample36.1_9R08C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
47 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	24.24	Sample724.1_12R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
48 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	18.17	Sample748.1_26R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
49 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	23.24	Sample516.1_37R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
50 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	21.96	Sample459.1_28R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
51 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	32.24	Sample533.1_39R08C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
52 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	18.96	Sample265.1_16R03C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
53 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	21.66	Sample654.1_23R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
54 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	23.52	Sample737.1_9R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
55 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	27.92	Sample611.1_4R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
56 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	27.15	SampleY-16.1_17R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
57 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	22.89	Sample569.1_41R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
58 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	22.17	Sample751.1_10R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
59 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	13.27	SampleY-53.1_21R08C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
60 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.16	Sample29.1_11R03C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
61 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.76	Sample694.1_19R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
62 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	30.73	Sample13.1_12R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
63 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	22.65	SampleY-59.1_28R08C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
64 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	21.73	Sample141.1_21R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
65 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	27.89	SampleY-47.1_4R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
66 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.43	Sample697.1_24R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
67 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.01	Sample400.1_9R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
68 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	21.14	Sample255.1_28R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
69 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	16.92	Samplec97.1_10R06C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
70 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	16.7	Sample296.1_30R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
71 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	14.27	SampleY-58.1_3R08C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
72 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.71	Sample371.1_31R02C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
73 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	12.81	SampleY-68.1_27R01C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
74 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	37.41	Sample580.1_43R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
75 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	17.87	Sample15.1_71R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
76 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	25.08	Sample110.1_62R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
77 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	17.28	Sample192.1_63R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
78 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	22.39	Sample379.1_34R03C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
79 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	20.82	Sample157.1_63R08C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
80 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	27.17	Samplec39.1_59R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
81 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	21.63	Samplec14.1_59R07C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
82 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	19.32	Sample252.1_33R05C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
83 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	15.49	Samplec44.1_65R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
84 | Loss	1	25598276	25642596	8	8	1_1	500	500	106	20.95	Sample285.1_67R04C01	cnve.1.25598276.25642596.1	cnvr.1.25572993.25674785
85 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	14.7	Sample21.1_6R08C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
86 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	13.82	Sample334.1_16R07C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
87 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	13.88	Sample38.1_71R08C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
88 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	16.36	Sample303.1_33R07C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
89 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	16.44	Sample422.1_52R08C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
90 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	12.77	Sample404.1_24R08C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
91 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	17.46	Sample345.1_36R03C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
92 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	15.15	Sample171.1_78R07C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
93 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	11.77	Sample424.1_53R08C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
94 | Loss	1	25598276	25669467	10	9	1_1	483.35	483.35	26	23.21	Sample526.1_57R04C01	cnve.1.25598276.25669467.2	cnvr.1.25572993.25674785
95 | 


--------------------------------------------------------------------------------
/example/example_create_CNVR/results/cnv.quantisnp.txt:
--------------------------------------------------------------------------------
  1 | chr	posStart	posEnd	CN	Sample_ID	conf	numSNP	avgConf	length	CNV_type	method
  2 | 1	25598276	25638253	3	Sample100.1_3R06C01	4.05203	7	0.578861428571429	39978	Gain	QuantiSNP
  3 | 1	25598276	25642596	3	Sample108.1_63R05C01	9.94526	8	1.2431575	44321	Gain	QuantiSNP
  4 | 1	25598276	25642596	3	Sample119.1_63R02C01	2.18403	8	0.27300375	44321	Gain	QuantiSNP
  5 | 1	25598276	25642596	3	Sample121.1_63R06C01	8.60815	8	1.07601875	44321	Gain	QuantiSNP
  6 | 1	25598276	25642596	3	Sample124.1_78R08C01	16.4803	8	2.0600375	44321	Gain	QuantiSNP
  7 | 1	25627470	25642596	3	Sample125.1_22R06C01	4.86867	7	0.695524285714286	15127	Gain	QuantiSNP
  8 | 1	25598276	25638253	3	Sample126.1_23R05C01	9.03692	7	1.29098857142857	39978	Gain	QuantiSNP
  9 | 1	25598276	25629950	1	Sample127.1_70R03C01	18.9998	5	3.79996	31675	Loss	QuantiSNP
 10 | 1	25598276	25629950	1	Sample139.1_70R04C01	12.9404	5	2.58808	31675	Loss	QuantiSNP
 11 | 1	25598276	25642596	3	Sample142.1_70R05C01	9.2058	8	1.150725	44321	Gain	QuantiSNP
 12 | 1	25598276	25638253	3	Sample161.1_23R01C01	4.33492	7	0.619274285714286	39978	Gain	QuantiSNP
 13 | 1	25598276	25642596	3	Sample166.1_18R03C01	17.3779	8	2.1722375	44321	Gain	QuantiSNP
 14 | 1	25627470	25638253	3	Sample167.1_44R03C01	11.4517	6	1.90861666666667	10784	Gain	QuantiSNP
 15 | 1	25629943	25642596	3	Sample169.1_62R01C01	13.5597	5	2.71194	12654	Gain	QuantiSNP
 16 | 1	25598276	25642596	3	Sample172.1_62R08C01	8.46983	8	1.05872875	44321	Gain	QuantiSNP
 17 | 1	25598276	25642596	3	Sample18.1_4R08C01	1.8337	8	0.2292125	44321	Gain	QuantiSNP
 18 | 1	25598276	25642596	3	Sample195.1_62R07C01	10.9461	8	1.3682625	44321	Gain	QuantiSNP
 19 | 1	25598276	25642596	3	Sample205.1_70R07C01	7.52285	8	0.94035625	44321	Gain	QuantiSNP
 20 | 1	25598276	25642596	3	Sample209.1_30R02C01	0.543627	8	0.067953375	44321	Gain	QuantiSNP
 21 | 1	25598276	25638253	3	Sample219.1_28R04C01	7.28972	7	1.04138857142857	39978	Gain	QuantiSNP
 22 | 1	25598276	25642596	3	Sample227.1_68R01C01	6.54151	8	0.81768875	44321	Gain	QuantiSNP
 23 | 1	25598276	25642596	3	Sample236.1_67R03C01	3.70644	8	0.463305	44321	Gain	QuantiSNP
 24 | 1	25598276	25638253	3	Sample237.1_25R08C01	2.80132	7	0.400188571428571	39978	Gain	QuantiSNP
 25 | 1	25598276	25642596	1	Sample243.1_20R08C01	16.0652	8	2.00815	44321	Loss	QuantiSNP
 26 | 1	25598276	25642596	3	Sample261.1_32R02C01	7.99707	8	0.99963375	44321	Gain	QuantiSNP
 27 | 1	25598276	25642596	3	Sample262.1_32R08C01	11.5614	8	1.445175	44321	Gain	QuantiSNP
 28 | 1	25598276	25629950	1	Sample266.1_20R03C01	14.7599	5	2.95198	31675	Loss	QuantiSNP
 29 | 1	25598276	25629950	1	Sample273.1_32R03C01	13.9294	5	2.78588	31675	Loss	QuantiSNP
 30 | 1	25598276	25629950	1	Sample275.1_33R04C01	13.5034	5	2.70068	31675	Loss	QuantiSNP
 31 | 1	25598276	25638253	1	Sample278.1_20R04C01	18.6362	7	2.66231428571429	39978	Loss	QuantiSNP
 32 | 1	25598276	25638253	3	Sample281.1_27R05C01	3.04208	7	0.434582857142857	39978	Gain	QuantiSNP
 33 | 1	25598276	25642596	3	Sample282.1_27R08C01	0.613086	8	0.07663575	44321	Gain	QuantiSNP
 34 | 1	25598276	25642596	3	Sample291.1_20R05C01	6.36673	8	0.79584125	44321	Gain	QuantiSNP
 35 | 1	25598276	25642596	3	Sample294.1_72R06C01	6.31589	8	0.78948625	44321	Gain	QuantiSNP
 36 | 1	25598276	25642596	3	Sample297.1_25R05C01	6.15803	8	0.76975375	44321	Gain	QuantiSNP
 37 | 1	25598276	25638253	3	Sample308.1_31R03C01	0.207963	7	0.029709	39978	Gain	QuantiSNP
 38 | 1	25598276	25642596	1	Sample309.1_66R07C01	20.5669	8	2.5708625	44321	Loss	QuantiSNP
 39 | 1	25629943	25642596	3	Sample314.1_80R08C01	12.3155	5	2.4631	12654	Gain	QuantiSNP
 40 | 1	25598276	25629950	1	Sample317.1_35R04C01	12.1177	5	2.42354	31675	Loss	QuantiSNP
 41 | 1	25598276	25638253	3	Sample320.1_16R06C01	4.8707	7	0.695814285714286	39978	Gain	QuantiSNP
 42 | 1	25598276	25642596	3	Sample323.1_69R02C01	4.10757	8	0.51344625	44321	Gain	QuantiSNP
 43 | 1	25598276	25638253	3	Sample328.1_13R02C01	10.3853	6	1.73088333333333	39978	Gain	QuantiSNP
 44 | 1	25598276	25638253	1	Sample332.1_73R01C01	19.6671	7	2.80958571428571	39978	Loss	QuantiSNP
 45 | 1	25598276	25642596	1	Sample349.1_19R03C01	20.8153	8	2.6019125	44321	Loss	QuantiSNP
 46 | 1	25598276	25638253	3	Sample356.1_35R06C01	7.99647	7	1.14235285714286	39978	Gain	QuantiSNP
 47 | 1	25598276	25642596	3	Sample357.1_36R04C01	13.6046	8	1.700575	44321	Gain	QuantiSNP
 48 | 1	25598276	25642596	1	Sample358.1_52R03C01	19.4456	8	2.4307	44321	Loss	QuantiSNP
 49 | 1	25627470	25638253	3	Sample360.1_31R06C01	4.52212	6	0.753686666666667	10784	Gain	QuantiSNP
 50 | 1	25598276	25629950	3	Sample363.1_17R05C01	8.85061	5	1.770122	31675	Gain	QuantiSNP
 51 | 1	25598276	25642596	3	Sample369.1_36R05C01	4.06593	8	0.50824125	44321	Gain	QuantiSNP
 52 | 1	25598276	25642596	3	Sample375.1_69R05C01	6.37298	8	0.7966225	44321	Gain	QuantiSNP
 53 | 1	25598276	25638253	3	Sample376.1_17R06C01	2.94036	7	0.420051428571429	39978	Gain	QuantiSNP
 54 | 1	25598276	25642596	3	Sample378.1_24R06C01	13.8549	8	1.7318625	44321	Gain	QuantiSNP
 55 | 1	25598276	25642596	3	Sample388.1_17R07C01	8.63671	8	1.07958875	44321	Gain	QuantiSNP
 56 | 1	25598276	25642596	3	Sample402.1_69R08C01	8.97555	8	1.12194375	44321	Gain	QuantiSNP
 57 | 1	25598276	25642596	3	Sample402.1_74R03C01	13.1224	8	1.6403	44321	Gain	QuantiSNP
 58 | 1	25627470	25642596	3	Sample407.1_36R02C01	5.65946	7	0.808494285714286	15127	Gain	QuantiSNP
 59 | 1	25598276	25642596	3	Sample417.1_77R05C01	19.0808	8	2.3851	44321	Gain	QuantiSNP
 60 | 1	25598276	25629950	1	Sample431.1_79R01C01	16.467	5	3.2934	31675	Loss	QuantiSNP
 61 | 1	25598276	25642596	3	Sample438.1_73R08C01	4.94542	8	0.6181775	44321	Gain	QuantiSNP
 62 | 1	25598276	25642596	3	Sample448.1_53R01C01	7.05554	8	0.8819425	44321	Gain	QuantiSNP
 63 | 1	25598276	25642596	3	Sample463.1_50R08C01	1.82352	8	0.22794	44321	Gain	QuantiSNP
 64 | 1	25598276	25642596	3	Sample464.1_55R06C01	22.5451	8	2.8181375	44321	Gain	QuantiSNP
 65 | 1	25598276	25638253	3	Sample47.1_9R02C01	4.78743	7	0.683918571428571	39978	Gain	QuantiSNP
 66 | 1	25598276	25638253	1	Sample472.1_48R03C01	21.8442	7	3.1206	39978	Loss	QuantiSNP
 67 | 1	25598276	25642596	3	Sample475.1_50R04C01	8.12018	8	1.0150225	44321	Gain	QuantiSNP
 68 | 1	25598276	25642596	3	Sample476.1_55R01C01	20.4309	8	2.5538625	44321	Gain	QuantiSNP
 69 | 1	25598276	25642596	3	Sample477.1_75R03C01	9.45939	8	1.18242375	44321	Gain	QuantiSNP
 70 | 1	25598276	25642596	3	Sample485.1_79R06C01	11.2973	8	1.4121625	44321	Gain	QuantiSNP
 71 | 1	25598276	25629950	3	Sample491.1_54R04C01	15.2998	5	3.05996	31675	Gain	QuantiSNP
 72 | 1	25627470	25642596	3	Sample492.1_49R01C01	0.663154	7	0.0947362857142857	15127	Gain	QuantiSNP
 73 | 1	25598276	25642596	3	Sample504.1_77R02C01	5.11496	8	0.63937	44321	Gain	QuantiSNP
 74 | 1	25598276	25642596	3	Sample505.1_77R04C01	16.027	8	2.003375	44321	Gain	QuantiSNP
 75 | 1	25598276	25642596	3	Sample506.1_77R07C01	8.47885	8	1.05985625	44321	Gain	QuantiSNP
 76 | 1	25598276	25638253	3	Sample510.1_47R04C01	8.68003	7	1.24000428571429	39978	Gain	QuantiSNP
 77 | 1	25598276	25642596	3	Sample529.1_76R05C01	8.63588	8	1.079485	44321	Gain	QuantiSNP
 78 | 1	25598276	25642596	3	Sample538.1_76R02C01	17.5925	8	2.1990625	44321	Gain	QuantiSNP
 79 | 1	25598276	25638253	3	Sample540.1_44R04C01	7.68229	7	1.09747	39978	Gain	QuantiSNP
 80 | 1	25598276	25629950	3	Sample542.1_42R03C01	20.0362	5	4.00724	31675	Gain	QuantiSNP
 81 | 1	25627470	25638253	3	Sample554.1_37R05C01	7.91008	6	1.31834666666667	10784	Gain	QuantiSNP
 82 | 1	25598276	25638253	3	Sample564.1_57R05C01	10.6977	7	1.52824285714286	39978	Gain	QuantiSNP
 83 | 1	25627470	25638253	3	Sample566.1_37R06C01	1.98509	6	0.330848333333333	10784	Gain	QuantiSNP
 84 | 1	25598276	25642596	3	Sample575.1_57R01C01	5.66285	8	0.70785625	44321	Gain	QuantiSNP
 85 | 1	25598276	25642596	3	Sample58.1_3R03C01	7.08843	8	0.88605375	44321	Gain	QuantiSNP
 86 | 1	25598276	25642596	3	Sample583.1_76R08C01	1.28627	8	0.16078375	44321	Gain	QuantiSNP
 87 | 1	25598276	25642596	3	Sample584.1_78R03C01	7.37858	8	0.9223225	44321	Gain	QuantiSNP
 88 | 1	25627470	25638253	3	Sample587.1_57R02C01	11.0545	6	1.84241666666667	10784	Gain	QuantiSNP
 89 | 1	25598276	25638253	3	Sample588.1_57R06C01	7.55628	7	1.07946857142857	39978	Gain	QuantiSNP
 90 | 1	25627470	25642596	3	Sample625.1_4R08C01	0.0820793	6	0.0136798833333333	15127	Gain	QuantiSNP
 91 | 1	25598276	25642596	1	Sample626.1_11R04C01	15.2668	7	2.18097142857143	44321	Loss	QuantiSNP
 92 | 1	25598276	25638253	1	Sample660.1_9R02C01	14.3275	6	2.38791666666667	39978	Loss	QuantiSNP
 93 | 1	25598276	25638253	3	Sample7.1_8R07C01	14.0671	7	2.00958571428571	39978	Gain	QuantiSNP
 94 | 1	25598276	25638253	3	Sample704.1_13R08C01	10.2507	6	1.70845	39978	Gain	QuantiSNP
 95 | 1	25598276	25642596	3	Sample709.1_7R05C01	0.342095	7	0.0488707142857143	44321	Gain	QuantiSNP
 96 | 1	25598276	25629950	3	Sample721.1_24R06C01	4.30825	5	0.86165	31675	Gain	QuantiSNP
 97 | 1	25598276	25642596	1	Sample739.1_13R07C01	15.0622	7	2.15174285714286	44321	Loss	QuantiSNP
 98 | 1	25598276	25638253	3	Sample74.1_6R04C01	1.76736	7	0.25248	39978	Gain	QuantiSNP
 99 | 1	25598276	25638253	3	Sample745.1_7R02C01	0.780356	6	0.130059333333333	39978	Gain	QuantiSNP
100 | 1	25627470	25642596	3	Sample746.1_29R04C01	4.41411	6	0.735685	15127	Gain	QuantiSNP
101 | 1	25598276	25638253	3	Sample75.1_9R04C01	14.5811	7	2.08301428571429	39978	Gain	QuantiSNP
102 | 1	25598276	25638253	3	Sample83.1_4R05C01	4.46067	7	0.637238571428571	39978	Gain	QuantiSNP
103 | 1	25598276	25629950	3	Sample94.1_12R08C01	0.698268	5	0.1396536	31675	Gain	QuantiSNP
104 | 1	25598276	25642596	3	Sample98.1_4R06C01	9.87336	8	1.23417	44321	Gain	QuantiSNP
105 | 1	25598276	25629950	3	SampleY-26.1_15R03C01	4.79464	5	0.958928	31675	Gain	QuantiSNP
106 | 1	25598276	25642596	1	SampleY-34.1_17R06C01	28.4924	7	4.07034285714286	44321	Loss	QuantiSNP
107 | 1	25627470	25642596	3	SampleY-46.1_3R07C01	1.20922	6	0.201536666666667	15127	Gain	QuantiSNP
108 | 1	25598276	25642596	3	SampleY-55.1_6R03C01	5.26557	7	0.752224285714286	44321	Gain	QuantiSNP
109 | 1	25598276	25642596	3	SampleY-60.1_23R04C01	1.83716	7	0.262451428571429	44321	Gain	QuantiSNP
110 | 1	25598276	25642596	3	SampleY-66.1_13R04C01	6.33878	7	0.90554	44321	Gain	QuantiSNP
111 | 1	25598276	25638253	1	Samplec103.1_7R04C01	10.4957	6	1.74928333333333	39978	Loss	QuantiSNP
112 | 1	25627470	25638253	3	Samplec111.1_20R06C01	10.3684	5	2.07368	10784	Gain	QuantiSNP
113 | 1	25627470	25642596	3	Samplec112.1_20R07C01	5.45517	6	0.909195	15127	Gain	QuantiSNP
114 | 1	25598276	25638253	1	Samplec113.1_20R08C01	13.8458	6	2.30763333333333	39978	Loss	QuantiSNP
115 | 1	25598276	25629950	3	Samplec115.1_12R07C01	0.489012	5	0.0978024	31675	Gain	QuantiSNP
116 | 1	25598276	25642596	1	Samplec116.1_12R08C01	13.2119	7	1.88741428571429	44321	Loss	QuantiSNP
117 | 1	25598276	25638253	3	Samplec124.1_8R05C01	0.783226	6	0.130537666666667	39978	Gain	QuantiSNP
118 | 1	25598276	25638253	1	Samplec134.1_30R03C01	18.9541	6	3.15901666666667	39978	Loss	QuantiSNP
119 | 1	25598276	25642596	1	Samplec16.1_26R06C01	28.4213	7	4.06018571428571	44321	Loss	QuantiSNP
120 | 1	25598276	25629950	1	Samplec17.1_60R08C01	14.591	5	2.9182	31675	Loss	QuantiSNP
121 | 1	25598276	25642596	3	Samplec20.1_65R07C01	0.319406	8	0.03992575	44321	Gain	QuantiSNP
122 | 1	25598276	25642596	3	Samplec22.1_61R03C01	9.54339	8	1.19292375	44321	Gain	QuantiSNP
123 | 1	25598276	25629950	3	Samplec27.1_78R06C01	0.0851005	5	0.0170201	31675	Gain	QuantiSNP
124 | 1	25627470	25638253	3	Samplec31.1_58R06C01	0.593463	6	0.0989105	10784	Gain	QuantiSNP
125 | 1	25598276	25642596	3	Samplec35.1_61R04C01	1.80077	8	0.22509625	44321	Gain	QuantiSNP
126 | 1	25598276	25642596	3	Samplec55.1_53R03C01	0.173868	8	0.0217335	44321	Gain	QuantiSNP
127 | 1	25598276	25638253	3	Samplec57.1_64R04C01	10.1527	7	1.45038571428571	39978	Gain	QuantiSNP
128 | 1	25598276	25642596	3	Samplec58.1_61R01C01	12.8555	8	1.6069375	44321	Gain	QuantiSNP
129 | 1	25598276	25638253	3	Samplec59.1_61R06C01	19.3747	7	2.76781428571429	39978	Gain	QuantiSNP
130 | 1	25598276	25642596	1	Samplec6.1_7R08C01	14.5154	7	2.07362857142857	44321	Loss	QuantiSNP
131 | 1	25598276	25629950	1	Samplec65.1_60R07C01	12.5598	5	2.51196	31675	Loss	QuantiSNP
132 | 1	25598276	25642596	3	Samplec72.1_65R06C01	0.813839	8	0.101729875	44321	Gain	QuantiSNP
133 | 1	25598276	25629950	3	Samplec76.1_12R05C01	9.24304	5	1.848608	31675	Gain	QuantiSNP
134 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/CNV.genotype.one.chr.one.batch.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript --vanilla
  2 | 
  3 | # load packages
  4 | # Each dependency is attached in the listed order (plyr after dplyr) with
  5 | # start-up messages silenced. require() returns FALSE with a warning instead
  6 | # of stopping when a package is not installed, so a missing package does not
  7 | # abort the script at this point.
  8 | pkgs_needed <- c(
  9 |   "optparse", "dplyr", "mixtools", "ggplot2", "cowplot", "plyr",
 10 |   "modeest", "mclust", "gridExtra", "pheatmap", "RColorBrewer"
 11 | )
 12 | suppressMessages({
 13 |   for (pkg in pkgs_needed) {
 14 |     require(pkg, character.only = TRUE)
 15 |   }
 16 | })
 17 | 
 18 | # Command-line interface. The seven valued options are stored as character
 19 | # strings that default to NA; the two trailing switches are FALSE unless the
 20 | # flag is present on the command line.
 21 | make_value_option <- function(flags, help) {
 22 |   make_option(flags, action = "store", type = "character", default = NA, help = help)
 23 | }
 24 | make_switch_option <- function(flags, help) {
 25 |   make_option(flags, action = "store_true", default = FALSE, help = help)
 26 | }
 27 | option_list <- list(
 28 |   make_value_option(c("-c", "--chr"), "Specify the chromosome on which the list of CNVRs to be genotyped is located."),
 29 |   make_value_option(c("-b", "--batch"), "Specify the batch to which the list of CNVRs to be genotyped belongs."),
 30 |   make_value_option(c("-t", "--type"), "Job submission type (0 - initial submission, 1 - resubmission of failed jobs)"),
 31 |   make_value_option(c("-p", "--datapath"), "Path to the directory containing necessary input data."),
 32 |   make_value_option(c("-o", "--resultpath"), "Path to the directory for saving results."),
 33 |   make_value_option(c("-m", "--matrixpath"), "Path to chromosome-wise LRR and BAF matrices."),
 34 |   make_value_option(c("-s", "--sourcefile"), "Path to the scripts directory containing R scripts to be loaded into R."),
 35 |   make_switch_option(c("-d", "--duplicates"), "[optional] Whether duplicate pairs information will be annotated in diagnosis plots."),
 36 |   make_switch_option(c("-n", "--plot"), "[optional] Whether to generate diagnosis plots.")
 37 | )
 38 | 
 39 | opt = parse_args(OptionParser(option_list = option_list))
 40 | pars = c(opt$chr, opt$batch, opt$type, opt$datapath, opt$resultpath, opt$matrixpath, opt$sourcefile)
 41 | 
 42 | if ( any(is.na(pars)) ) {
 43 |   stop("All three parameters must be supplied. (--help for detail)")
 44 | }
 45 | 
 46 | chr1   <- as.integer( opt$chr )
 47 | batch1 <- as.integer( opt$batch )
 48 | type1  <- as.integer( opt$type )
 49 | 
 50 | path_data   <- opt$datapath
 51 | path_result <- opt$resultpath
 52 | path_matrix <- opt$matrixpath
 53 | path_sourcefile <- opt$sourcefile
 54 | flag_png_plot <- opt$plot
 55 | flag_duplicates <- opt$duplicates
 56 | 
 57 | if ( type1 != 1 & type1 != 0) {
 58 |   stop("Job submission type must be 0 or 1. (--help for detail)")
 59 | }
 60 | ## print out parameters
 61 | cat("Processing chr:", chr1, "batch:", batch1, "type:", type1, "\n")
 62 | 
 63 | # source all the functions used in the pipeline
 64 | source(file = file.path(path_sourcefile, 'fun_BAF.R'))
 65 | source(file = file.path(path_sourcefile, 'fun_gatk.R'))
 66 | source(file = file.path(path_sourcefile, 'fun_LRR.R'))
 67 | source(file = file.path(path_sourcefile, 'fun_models.R')) 
 68 | source(file = file.path(path_sourcefile, 'fun_plot_steps.R'))
 69 | source(file = file.path(path_sourcefile, 'fun_plot_diagnosis.R'))
 70 | source(file = file.path(path_sourcefile, 'fun_plot_heatmap.R'))
 71 | source(file = file.path(path_sourcefile, 'fun_pipeline_main.R'))
 72 | 
 73 | ## use parameters 
 74 | ## PennCNV ( sample LRR mean and SD )
 75 | samples_LRR <- read.delim(file = file.path(path_data, "samples_QC.txt"), as.is = TRUE)
 76 | samples_LRR$Sample_ID <- sub("\\.txt$", "", samples_LRR$File)
 77 | 
 78 | ## dup pairs with column_name ( sample1.name sample2.name )
 79 | dup_pairs <- NULL # init 
 80 | if ( flag_duplicates ) {
 81 |   dup_pairs <- read.delim(file = file.path(path_data, "duplicate_pairs.txt"), as.is = TRUE)
 82 | }
 83 | 
 84 | ## paras_LRR ------------------------------------------------------
 85 | paras_LRR <- list(LRR_mean = list(CN_1 = -0.4156184, CN_3 = 0.1734862),
 86 |                   LRR_sd   = list(CN_1 = 0.2502591, CN_3 = 0.2249798))  ## sd for one SNP
 87 | ## These parameters can be updated after the intial round of CNVgenotyping
 88 | ## by selecting the CNVRs with well fitted GMM.
 89 | 
 90 | # main part for runing on cluster -----------------------------------------
 91 | 
 92 | cat("read in CNV ...\n")  
 93 | dt_cnvs <- read.delim(file = file.path(path_data, "cnv_clean.txt"), as.is = TRUE)
 94 | 
 95 | # PennCNV PFB information
 96 | cat("read in PFB ...\n")
 97 | dt_PFB <- read.table(file = file.path(path_data, "SNP.pfb"), sep = "\t",
 98 |                      header = TRUE, as.is = TRUE, check.names = FALSE,
 99 |                      comment.char = "")
100 | dt_PFB <- dt_PFB[, c("Name", "PFB", "Position")] # add Position information here
101 | 
102 | # read in matrix dat of LRR and BAF
103 | cat("read in BAF matrix ...\n")
104 | file_BAF <- paste0("matrix_chr_", chr1, "_BAF.rds")
105 | dt_matrix_BAF <- readRDS(file = file.path( path_matrix, "BAF", file_BAF))
106 | dt_matrix_BAF <- as.matrix(dt_matrix_BAF)
107 | 
108 | cat("read in LRR matrix ...\n")
109 | file_LRR <- paste0("matrix_chr_", chr1, "_LRR.rds")
110 | dt_matrix_LRR <- readRDS(file = file.path( path_matrix, "LRR", file_LRR))
111 | dt_matrix_LRR <- as.matrix(dt_matrix_LRR)
112 | 
113 | samples   <- rownames(dt_matrix_LRR)
114 | snps      <- colnames(dt_matrix_LRR)
115 | n_snps    <- length(snps)
116 | n_samples <- length(samples)
117 | 
# helper for output directories ----------------------------------------
# Join `path_main` and `str_subpath`, make sure that directory exists
# (missing parent directories are created as well) and return the joined path.
create_path <- function(path_main, str_subpath) {

  target_dir <- file.path(path_main, str_subpath)

  # nothing to create when the directory is already there
  if ( dir.exists(paths = target_dir) ) {
    return( target_dir )
  }

  dir.create(path = target_dir, showWarnings = FALSE, recursive = TRUE)
  target_dir
}
129 | 
# Output directories -----------------------------------------------------------
# Everything is written below the result directory:
#   log/, pred/chr_<chr>_batch_<batch>/, pars/, cnvrs_error/, stats/
#   and, when --plot is set, png/ with its heatmap/ sub-folder.
path_main <- path_result
path_log     <- create_path(path_main = path_main, str_subpath = "log")
path_pred    <- create_path(path_main = path_main, str_subpath = "pred")
path_pars    <- create_path(path_main = path_main, str_subpath = "pars")

# path_png is handed to pipeline_main() for every CNVR, so it is always
# defined; it stays NULL when no plots are requested.
path_png <- NULL
if ( flag_png_plot ) {
  path_png     <- create_path(path_main = path_main, str_subpath = "png")
  path_heatmap <- create_path(path_main = path_png, str_subpath = "heatmap")
}

path_cnvrs_error <- create_path(path_main = path_main, str_subpath = "cnvrs_error")

# predictions get one sub-folder per chromosome and batch
folder.name <- paste0("chr_", chr1, "_batch_", batch1)
path_pred <- create_path(path_main = path_pred, str_subpath = folder.name)

# CNVRs to genotype ---------------------------------------------------------------
file_cnvr <- "cnvr_batch.txt"  ## with batch information
dt_cnvrs  <- read.delim(file = file.path(path_data, file_cnvr), as.is = TRUE)

dt_cnvrs1 <- data.frame()
if (type1 == 0) {

  # initial submission: all CNVRs of this chromosome and batch
  dt_cnvrs1 <- subset(dt_cnvrs, chr == chr1 & batch == batch1)

} else if (type1 == 1) {

  # resubmission: only the CNVRs listed as failed by a previous run
  ## this path can be specified by users
  file_cnvrs_error <- file.path(path_cnvrs_error,
                                paste0("cnvrs_error_chr_", chr1, "_batch_", batch1, ".txt"))
  if ( !file.exists(file_cnvrs_error) ) {
    stop("No list of failed CNVRs found for resubmission: ", file_cnvrs_error)
  }

  cnvrs_error <- read.table(file = file_cnvrs_error, sep = "\t", header = TRUE,
                            check.names = FALSE, stringsAsFactors = FALSE)

  dt_cnvrs1 <- subset(dt_cnvrs, CNVR_ID %in% cnvrs_error$CNVR_ID)
}
cnvrs <- unique( dt_cnvrs1$CNVR_ID )

## per-CNVR input data (one <CNVR_ID>_stat.rds each) is saved here
path_cnvr_stat <- create_path(path_main = path_result, str_subpath = "stats")
177 | 
# Genotype every selected CNVR ---------------------------------------------------
# For each row of dt_cnvrs1:
#   1. locate the CNVR's boundary SNPs in the LRR matrix columns,
#   2. optionally draw an LRR heatmap of the CNVR plus up to 20 SNPs on each side,
#   3. build and save the long per-sample / per-SNP table (<CNVR_ID>_stat.rds),
#   4. run pipeline_main() and save its predictions (<CNVR_ID>_pred.rds).
# CNVRs for which pipeline_main() signals an error or a warning are collected
# in cnvrs_error and written out at the end so they can be resubmitted.
res_pars_all <- data.frame()
cnvrs_error  <- c()
# --------------------------------------------------------------------
# seq_len(): no iteration at all when no CNVR was selected (1:0 would run twice)
for (i in seq_len(nrow(dt_cnvrs1))) {

  cnvr1 <- dt_cnvrs1$CNVR_ID[i]

  cat("cnvr1:", cnvr1, i, "in", nrow(dt_cnvrs1), "\n")

  snp_start <- dt_cnvrs1$start_snp[i]
  snp_end   <- dt_cnvrs1$end_snp[i]

  ## snps have been sorted by their positions on chromosome
  ## when preparing chromosome-wise LRR and BAF matrices
  idx_start <- which(snps == snp_start)
  idx_end   <- which(snps == snp_end)

  ## each boundary SNP must be found exactly once and start must precede end;
  ## `||` short-circuits, so a missing SNP reaches stop() instead of breaking `if`
  if (length(idx_start) != 1L || length(idx_end) != 1L || idx_start >= idx_end) {
    stop("CNVR boundaries are not consistency with SNP information.")
  }

  snps_name <- snps[idx_start:idx_end] # all snps in cnvr1

  # heatmap: the CNVR plus up to 20 snps on both sides ---------------------
  if ( flag_png_plot ) {
    ## idx_outer_start <- dt_cnvrs1$outer.start[i]
    ## idx_outer_end   <- dt_cnvrs1$outer.end[i]
    idx_outer_start <- idx_start
    idx_outer_end   <- idx_end
    # plotting window, clipped to the columns of the matrix
    idx_start_new <- max(1L, idx_outer_start - 20L)
    idx_end_new   <- min(n_snps, idx_outer_end + 20L)

    dt_lrr_heatmap <- dt_matrix_LRR[, idx_start_new:idx_end_new]

    snps_name_heatmap <- snps[idx_start_new:idx_end_new]
    snps_name_all     <- snps[idx_outer_start:idx_outer_end]

    # snp_flag: 0 = added flanking snp, 2 = snp inside the CNVR,
    #           1 = snp between outer and inner boundaries (none at present,
    #               because the outer boundaries equal the CNVR boundaries)
    snps_add  <- setdiff(snps_name_heatmap, snps_name_all)
    snps_flag <- ifelse(snps_name_heatmap %in% snps_add, 0,
                        ifelse(snps_name_heatmap %in% snps_name, 2, 1))
    dt_snps_flag <- data.frame(snp_name = snps_name_heatmap,
                               snp_flag = snps_flag,
                               stringsAsFactors = FALSE)

    filename_heatmap <- paste0("heatmap_", cnvr1, ".png")
    png(filename = file.path(path_png, "heatmap", filename_heatmap),
        width = 12, height = 12, units = "in", res = 512)
    # close the device even when plotting fails
    tryCatch(
      plot_heatmap(dt_lrr_heatmap = dt_lrr_heatmap, dt_snps_flag = dt_snps_flag),
      finally = dev.off()
    )
  }

  # long table with one row per sample and snp ------------------------------
  dt_baf <- dt_matrix_BAF[, idx_start:idx_end]
  dt_lrr <- dt_matrix_LRR[, idx_start:idx_end]

  numsnp <- idx_end - idx_start + 1
  samples_new <- rownames(dt_baf) ## need change in dt_cnvr_stat
  stopifnot( all(samples_new == samples) )

  # as.vector() unrolls the matrices column by column (snp by snp), hence
  # samples are repeated as a block and each snp name once per sample
  dt_cnvr_stat <- data.frame(CNVR_ID = cnvr1,
                             Chr = chr1,
                             BAF = as.vector(dt_baf),
                             LRR = as.vector(dt_lrr),
                             Sample_ID = rep(samples_new, numsnp),
                             Name = rep(snps_name, each = length(samples_new)),
                             numSNP = numsnp,
                             stringsAsFactors = FALSE)

  dt_PFB1 <- subset(dt_PFB, Name %in% snps_name)
  dt_cnvr_stat <- merge(dt_cnvr_stat, dt_PFB1, all.x = TRUE)

  # initial calls for this CNVR; samples without a call get CN = 2, alg = "other"
  dt_samples_cn <- data.frame(Sample_ID = samples_new, stringsAsFactors = FALSE)

  dt_cnv <- subset(dt_cnvs, CNVR_ID == cnvr1)
  dt_cnv <- dt_cnv[, c("Sample_ID", "CN", "alg")]
  dt_samples_cn <- merge(dt_samples_cn, dt_cnv, all.x = TRUE)
  dt_samples_cn$CN[ which(is.na(dt_samples_cn$CN)) ]   <- 2
  dt_samples_cn$alg[ which(is.na(dt_samples_cn$alg)) ] <- "other"

  dt_cnvr_stat <- merge(dt_cnvr_stat, dt_samples_cn, all.x = TRUE)

  ## save CNVR-stat data
  saveRDS(dt_cnvr_stat, file = file.path(path_cnvr_stat, paste0(cnvr1, "_stat.rds")))

  # run the genotyping; an error or a warning marks the CNVR as failed.
  # The condition message is logged so the failure can be diagnosed.
  res_pipeline_cnvr1 <- tryCatch({
    pipeline_main(dt_cnvrs = dt_cnvr_stat,
                  paras_LRR = paras_LRR,
                  dup_pairs = dup_pairs,
                  samples_LRR = samples_LRR,
                  path_png = path_png,
                  n.sample = n_samples,
                  flag_png_plot = flag_png_plot)
  }, error = function(e) {
    cat("cnvr1:", cnvr1, "failed with error:", conditionMessage(e), "\n")
    NULL
  }, warning = function(w) {
    cat("cnvr1:", cnvr1, "failed with warning:", conditionMessage(w), "\n")
    NULL
  })

  if ( is.null(res_pipeline_cnvr1) ) {
    # the script keeps no device of its own open here, so anything still
    # open was left behind by the aborted call
    graphics.off()
    cnvrs_error <- c(cnvrs_error, cnvr1)
    next
  }

  res_gatk_pred_final <- res_pipeline_cnvr1$res_gatk_pred_final
  res_pars <- res_pipeline_cnvr1$res_pars

  cat( names(res_pars_all), "\n")
  cat( names(res_gatk_pred_final), "\n")

  # predictions are saved per CNVR, parameters are accumulated
  filename_cnvr1 <- paste0(cnvr1, "_pred.rds")
  saveRDS(res_gatk_pred_final, file = file.path(path_pred, filename_cnvr1))

  res_pars_all <- rbind(res_pars_all, res_pars)
}

filename_pars <- paste0("CNVR_pars_chr_", chr1, "_batch_", batch1, ".rds")
saveRDS(res_pars_all, file = file.path(path_pars, filename_pars)) # pars file

# list of failed CNVRs, read again by a resubmission (--type 1)
if ( length(cnvrs_error) >= 1) {
  write.table(data.frame(CNVR_ID = cnvrs_error, stringsAsFactors = FALSE),
              file = file.path(path_cnvrs_error, paste0("cnvrs_error_chr_", chr1, "_batch_", batch1, ".txt")),
              col.names = TRUE, row.names = FALSE, quote = FALSE)
}
307 | 
308 | 
309 | 
310 | 


--------------------------------------------------------------------------------
/04_CNV_genotype/scripts/fun_pipeline_main.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # add GQ_score as parameter
  3 | 
  4 | # transform model to data.frame
  5 | trans_model <- function(model) {
  6 |   
  7 |   mus = model$mu
  8 |   sigmas = model$sigma
  9 |   lambdas = model$lambda
 10 |   
 11 |   res <- data.frame(mu0 = mus[1], mu1 = mus[2], mu2 = mus[3], mu3 = mus[4],
 12 |                     sigma0 = sigmas[1], sigma1 = sigmas[2], sigma2 = sigmas[3], sigma3 = sigmas[4],
 13 |                     lambda0 = lambdas[1], lambda1 = lambdas[2], lambda2 = lambdas[3], lambda3 = lambdas[4])
 14 |   
 15 | }
 16 | 
 17 | # main pipeline function
 18 | pipeline_main <- function(dt_cnvrs, paras_LRR, dup_pairs, samples_LRR,
 19 |                           plot_steps = TRUE, path_png, GQ_score = 0,
 20 |                           n.sample, flag_png_plot) {
 21 |   
 22 |   if (flag_png_plot) {
 23 |     path_png_diag    <- create_path(path_main = path_png, str_subpath = "diag")
 24 |     path_png_steps   <- create_path(path_main = path_png, str_subpath = "steps")
 25 |     path_png_summary <- create_path(path_main = path_png, str_subpath = "summary")
 26 |   }
 27 |   
 28 |   cnvr_id <- unique(dt_cnvrs$CNVR_ID)
 29 |   numsnp <- unique(dt_cnvrs$numSNP) # numsnp 
 30 |   dt_cnvr <- process_cnvr_LRR(dt_cnvrs = dt_cnvrs, samples_LRR = samples_LRR)
 31 | 
 32 |   n_sample = nrow(dt_cnvr)
 33 |   stopifnot(n_sample == n.sample) ## change here for other dataset
 34 |   # set CN = 0 cutoff = -0.8
 35 |   dt_cnvr0 <- subset(dt_cnvr, LRR_median <= -0.8)
 36 |   n0 <- nrow(dt_cnvr0)  # set cutoff of n0 is 5
 37 |   
 38 |   dt_cnvr_train <- subset(dt_cnvr, LRR_median > - 0.8 & CN != 0) # all confirmed CN = 1/2/3
 39 |   
 40 |   # if dt_cnvr_train
 41 |   if (nrow(dt_cnvr_train) == 0) {
 42 |     
 43 |     res_pars <- NULL
 44 |     
 45 |     res_gatk_pred_final <- data.frame(Sample_ID = dt_cnvr$Sample_ID, CN = dt_cnvr$CN, 
 46 |                                       CNVR_ID = cnvr_id, CN_gatk_pred = 0,
 47 |                                       value_GQ = 100, stringsAsFactors = FALSE)
 48 |     
 49 |     
 50 |     res <- list(res_gatk_pred_final = res_gatk_pred_final,
 51 |                 res_pars = res_pars)  ## paras for each CNVR_ID
 52 |     
 53 |   } else {
 54 |     
 55 |     res_paras <- train_model_zz(dt_cnvr = dt_cnvr_train, paras_LRR = paras_LRR)
 56 |     
 57 |     paras_all <- res_paras$paras_all  # final paras from gmm model
 58 |     paras_model <- res_paras$paras_model  # step paras for plot diagnosis
 59 |     
 60 |     ## all predict result
 61 |     mu1 <- paras_all$mus[1]
 62 |     sigma1 <- paras_all$sigmas[1]
 63 |     mu2 <- paras_all$mus[2]
 64 |     sigma2 <- paras_all$sigmas[2]
 65 |     mu3 <- paras_all$mus[3]
 66 |     sigma3 <- paras_all$sigmas[3]
 67 |     
 68 |     
 69 |     cat("parameter_gmm:\n")
 70 |     cat(paras_all$mus, "\n")
 71 |     cat(paras_all$sigmas, "\n")
 72 |     cat(paras_all$lambdas, "\n")
 73 |     # save diagnosis png
 74 |     ## plot diagnosis 
 75 |     if ( flag_png_plot ) {
 76 |       file_diagnosis <- paste0("diag_", cnvr_id, ".png")
 77 |       png(filename = file.path(path_png_diag, file_diagnosis), width = 12, height = 12, units = "in", res = 512)
 78 |       plot_gmm_diagnosis(dt_cnvr = dt_cnvr_train, paras_model = paras_model)
 79 |       dev.off()
 80 |     }
 81 | 
 82 |     # set CN = 0 
 83 |     mu0 <- -3
 84 |     # sigma0 <- sigma1*10  ## MUST BE CHANGED
 85 |     sigma0 <- 0.8*0.8
 86 |     if (n0 != 0) {
 87 |       if (n0  >= 5) {
 88 |         mu0 <- median(dt_cnvr0$LRR_median)
 89 |         sigma0 <- sd(dt_cnvr0$LRR_median)
 90 |       } 
 91 |     }
 92 |     
 93 |     model1 <- list()
 94 |     model1$mu <- c(mu0, paras_all$mus)
 95 |     model1$sigma <- c(sigma0, paras_all$sigmas) ## 2212 samples number
 96 |     model1$lambda <- c(n0/n_sample, ((n_sample - n0)/n_sample)*paras_all$lambdas)
 97 |     
 98 |     # add cutoff of pbf to select SNP
 99 |     dt_cnvrs_BAF <- subset(dt_cnvrs, PFB <= 0.99 & PFB >= 0.01)
100 |     
101 |     cat("nrow BAF data:", nrow(dt_cnvrs_BAF), "\n")
102 |     flag_BAF <- ifelse(nrow(dt_cnvrs_BAF) == 0, 0, 1)  
103 |     
104 |     numsnp.used <- 0
105 |     numsnp.raw  <- unique(dt_cnvrs$numSNP)
106 |     # calculate BAF
107 |     if (flag_BAF == 1) {
108 |       dt_BAF1 <- calculate_BAF_gatk_whole(dt_cnvrs = dt_cnvrs_BAF)
109 |       numsnp.used <- length(unique(dt_cnvrs_BAF$Name)) ##
110 |       cat("numsnp.used:", numsnp.used, "\n")
111 |     } 
112 |     
113 |     # calculate LRR
114 |     dt_LRR1 <- output_LRR_gatk(dt_cnvr = dt_cnvr, model = model1)
115 |     
116 |     # add to deal with 0 value in each column -----------------
117 |     idx0.inf <- which(dt_LRR1$LRR0 == 0)
118 |     if (length(idx0.inf) >= 1) {
119 |       dt_LRR1$LRR0[idx0.inf] <- 1e-10
120 |     }
121 |     
122 |     idx1.inf <- which(dt_LRR1$LRR1 == 0)
123 |     if (length(idx1.inf) >= 1) {
124 |       dt_LRR1$LRR1[idx1.inf] <- 1e-10
125 |     }
126 |     
127 |     idx2.inf <- which(dt_LRR1$LRR2 == 0)
128 |     if (length(idx2.inf) >= 1) {
129 |       dt_LRR1$LRR2[idx2.inf] <- 1e-10
130 |     }
131 |     
132 |     idx3.inf <- which(dt_LRR1$LRR3 == 0)
133 |     if (length(idx3.inf) >= 1) {
134 |       dt_LRR1$LRR3[idx3.inf] <- 1e-10
135 |     }
136 |     
137 |     # ==========================================
138 |     
139 |     dt_LRRBAF1 <- data.frame()
140 |     res_gatk_pred1 <- data.frame()
141 |     mean1_GQ <- 0
142 |     if (flag_BAF == 1) {
143 |       dt_LRRBAF1 <- merge(dt_LRR1, dt_BAF1)
144 |       res_gatk_pred1 <- output_gatk_result(dt_LRRBAF = dt_LRRBAF1)  # dt_LRRBAF = dt_LRR1
145 |       mean1_GQ <- mean(res_gatk_pred1$value_GQ)
146 |       cat("mean1_GQ:", mean1_GQ, "nrow gatk_pred1:", nrow(res_gatk_pred1),"\n")
147 |     } else {
148 |       dt_LRRBAF1 <- dt_LRR1
149 |       res_gatk_pred1 <- output_gatk_result_LRR(dt_LRRBAF = dt_LRRBAF1)
150 |       mean1_GQ <- mean(res_gatk_pred1$value_GQ)
151 |     }
152 |     
153 |     
154 |     ## save steps_1_ png
155 |     if (flag_BAF == 1 & flag_png_plot) {
156 |       file_steps <- paste0("steps_1_", cnvr_id, ".png")
157 |       png(filename = file.path(path_png_steps, file_steps), width = 12, height = 12, units = "in", res = 512)
158 |       plot_steps(dt_cnvr_train = dt_cnvr_train, dup_pairs = dup_pairs, dt_cnvr_raw = dt_cnvr, 
159 |                  paras = paras_all, dt_LRRBAF = res_gatk_pred1)  ## here
160 |       dev.off()
161 |     }
162 |     
163 |     # if number of CN = 1 <= 2* CN= 0
164 |     # re_model
165 |     n0_new <- sum(res_gatk_pred1$CN_gatk_pred == 0)
166 |     n1_new <- sum(res_gatk_pred1$CN_gatk_pred == 1)
167 |     # n0_new
168 |     # n1_new
169 |     
170 |     # hardy weinberg test
171 |     res_gatk_pred_final <- NULL
172 |     model_final <- NULL
173 |     if (n1_new >= n0_new | (paras_all$lambdas[2] >= 0.9)) {
174 |       res_gatk_pred_final <- res_gatk_pred1
175 |       model_final <- model1  ## final model
176 |     } else {
177 |       mu1 <- paras_all$mus[2]
178 |       sigma1 <- paras_all$sigmas[2]
179 |       mu2 <- paras_all$mus[3]
180 |       sigma2 <- paras_all$sigmas[3]
181 |       mu3 <- paras_all$mus[3] + paras_LRR$LRR_mean$CN_3
182 |       sigma3 <- paras_LRR$LRR_sd$CN_3/sqrt(numsnp)
183 |       
184 |       if (n0 == 0) {
185 |         model2 <- normalmixEM(x = dt_cnvr$LRR_median, k = 3, 
186 |                               mean.constr = c(mu1, mu2, mu3),
187 |                               sd.constr = c(sigma1, sigma2, sigma3))
188 |         # add CN = 0 parameters
189 |         model2$mu <- c(mu0, model2$mu)
190 |         model2$sigma <- c(sigma0, model2$sigma)
191 |         model2$lambda <- c(0, model2$lambda)
192 |       } else {
193 |         model2 <- normalmixEM(x = dt_cnvr$LRR_median, k = 4,
194 |                               mean.constr = c(mu0, mu1, mu2, mu3),
195 |                               sd.constr = c(sigma0, sigma1, sigma2, sigma3))
196 |       }
197 |       
198 |       # calculate LRR
199 |       dt_LRR2 <- output_LRR_gatk(dt_cnvr = dt_cnvr, model = model2)
200 |       
201 |       idx0.inf <- which(dt_LRR2$LRR0 == 0)
202 |       if (length(idx0.inf) >= 1) {
203 |         dt_LRR2$LRR0[idx0.inf] <- 1e-10
204 |       }
205 |       
206 |       idx1.inf <- which(dt_LRR2$LRR1 == 0)
207 |       if (length(idx1.inf) >= 1) {
208 |         dt_LRR2$LRR1[idx1.inf] <- 1e-10
209 |       }
210 |       
211 |       idx2.inf <- which(dt_LRR2$LRR2 == 0)
212 |       if (length(idx2.inf) >= 1) {
213 |         dt_LRR2$LRR2[idx2.inf] <- 1e-10
214 |       }
215 |       
216 |       idx3.inf <- which(dt_LRR2$LRR3 == 0)
217 |       if (length(idx3.inf) >= 1) {
218 |         dt_LRR2$LRR3[idx3.inf] <- 1e-10
219 |       }
220 |       
221 |       res_gatk_pred2 <- data.frame()
222 |       mean2_GQ <- 0
223 |       cat("calculate mean2_GQ.\n")
224 |       # calculate BAF
225 |       if (flag_BAF == 1) {
226 |         dt_BAF2 <- dt_BAF1  ## save as dt_BAF1
227 |         dt_LRRBAF2 <- merge(dt_LRR2, dt_BAF2)
228 |         res_gatk_pred2 <- output_gatk_result(dt_LRRBAF = dt_LRRBAF2)
229 |         mean2_GQ <- mean(res_gatk_pred2$value_GQ)
230 |       } else {
231 |         dt_LRRBAF2 <- dt_LRR2
232 |         res_gatk_pred2 <- output_gatk_result_LRR(dt_LRRBAF = dt_LRRBAF2)
233 |         mean2_GQ <- mean(res_gatk_pred2$value_GQ)
234 |       }
235 |       
236 |       cat(mean1_GQ, mean2_GQ, "\n")
237 |       
238 |       res_gatk_pred_final <- res_gatk_pred2
239 |       model_final <- model2  ## model final
240 |     }
241 |     
242 |     ## save steps_2 png
243 |     if (flag_BAF == 1 & flag_png_plot) {
244 |       file_steps <- paste0("steps_2_", cnvr_id, ".png")
245 |       png(filename = file.path(path_png_steps, file_steps), width = 12, height = 12, units = "in", res = 512)
246 |       plot_steps(dt_cnvr_train = dt_cnvr_train, dup_pairs = dup_pairs, dt_cnvr_raw = dt_cnvr, 
247 |                  paras = paras_all, dt_LRRBAF = res_gatk_pred_final)  ## here
248 |       dev.off()
249 |     }
250 |     
251 |     
252 |     # add GQ_score cutoff here
253 |     idxs_nocall <- which(res_gatk_pred_final$value_GQ <= GQ_score)
254 |     call_rate <- 1 - length(idxs_nocall)/nrow(res_gatk_pred_final)
255 |     if (length(idxs_nocall) >= 1) {
256 |       res_gatk_pred_final$CN_gatk_pred[idxs_nocall] <- 4
257 |     }
258 |     if ( flag_png_plot ) {
259 |       
260 |       # plot for final
261 |       # for new input must ordered as follow
262 |       # dt_pfb <- dt_cnvrs[order(dt_cnvrs$Sample_ID, dt_cnvrs$Position), ]
263 |       dt_pfb <- dt_cnvrs[1:numsnp, ]
264 |       dt_pfb$MAF <- pmin(dt_pfb$PFB, 1 - dt_pfb$PFB) ##
265 |       dt_pfb <- dt_pfb[, c("Name", "MAF")]
266 |       
267 |       plot_MAF <- ggplot(data = dt_pfb, aes(Name, MAF)) + 
268 |         geom_col() + 
269 |         ggtitle(label = paste("snps MAF in Position order", "numSNP:", numsnp)) + 
270 |         labs(x = "SNP Name") + 
271 |         theme_bw(base_size = 9) +
272 |         theme(axis.text.x = element_text(angle = 45, hjust = 1))
273 |       
274 |       # final 
275 |       
276 |       cat("model_final_parameter:\n")
277 |       cat(model_final$mu, "\n")
278 |       cat(model_final$sigma, "\n")
279 |       cat(model_final$lambda, "\n")
280 |       plot_final <- plot_model_final(paras = model_final, dt_cnvr = dt_cnvr_train,
281 |                                      title = paste("final model for", cnvr_id, "numSNP:", numsnp))
282 |       
283 |       
284 |       # scatter plot LRR_median
285 |       dt_pred <- res_gatk_pred_final[, c("Sample_ID", "CN_gatk_pred")]
286 |       dt_cnvr_scatter <- merge(dt_cnvr, dt_pred)
287 |       dt_cnvr_scatter <- dt_cnvr_scatter[order(dt_cnvr_scatter$CN), ]
288 |       dt_cnvr_scatter$idx <- 1:nrow(dt_cnvr_scatter)
289 |       myColors <- brewer.pal(4, "Set1")
290 |       plot_raw <- ggplot() + 
291 |         geom_point(data = subset(dt_cnvr_scatter, CN == 0), aes(idx, LRR_median), col = "black") +
292 |         geom_point(data = subset(dt_cnvr_scatter, CN == 1), aes(idx, LRR_median), col = "red") + 
293 |         geom_point(data = subset(dt_cnvr_scatter, CN == 2), aes(idx, LRR_median), col = "green") +
294 |         geom_point(data = subset(dt_cnvr_scatter, CN == 3), aes(idx, LRR_median), col = "blue") +
295 |         theme_bw(base_size = 10) + 
296 |         ggtitle(label = "CNV call from IPQ") 
297 |       
298 |       # plot_raw
299 |       # add gray color point here
300 |       plot_gatk <- ggplot() + 
301 |         geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 0), aes(idx, LRR_median), col = "black") +
302 |         geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 1), aes(idx, LRR_median), col = "red") + 
303 |         geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 2), aes(idx, LRR_median), col = "green") +
304 |         geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 3), aes(idx, LRR_median), col = "blue") +
305 |         geom_point(data = subset(dt_cnvr_scatter, CN_gatk_pred == 4), aes(idx, LRR_median), col = "gray") +
306 |         theme_bw(base_size = 10) +
307 |         ggtitle(label = "CNV call from gatk similary method",
308 |                 subtitle = paste("GQ score:", GQ_score, "call rate:", round(call_rate, 3),
309 |                                  "numsnp.raw:", numsnp.raw, "numsnp.used:", numsnp.used))
310 |       
311 |       # plot_gatk
312 |       filefinal <- paste0("summary_", cnvr_id, ".png")
313 |       png(filename = file.path(path_png_summary, filefinal),
314 |           width = 12, height = 12, units = "in", res = 512)
315 |       grid.arrange(plot_MAF, plot_final, plot_raw, plot_gatk, nrow = 2)
316 |       dev.off()
317 |       
318 |     }
319 |     
320 |     res_pars <- trans_model(model = model_final)
321 |     res_pars$CNVR_ID = cnvr_id
322 |     res_pars$numSNP  = numsnp
323 |     
324 |     # res_gatk_pred_final  ## final result
325 |     res <- list(res_gatk_pred_final = res_gatk_pred_final,
326 |                 res_pars = res_pars)  ## paras for each CNVR_ID
327 |     
328 |   }
329 | 
330 | }
331 |  
332 | 
333 | 
334 | 


--------------------------------------------------------------------------------