├── 2bRAD-M_workflow.png
├── .gitignore
├── tools
│   ├── 2bRAD-M-20201225-conda.yml
│   ├── Download_2bRADTagDB_NCBI.pl
│   └── Download_2bRADTagDB_GTDB.pl
├── manual
├── LICENSE
├── scripts
│   ├── FindGenome_ByQualitative.pl
│   ├── MergeProfilesFromMultipleSamples.pl
│   ├── CalculateRelativeAbundance_Combined2bEnzymes.pl
│   ├── CalculateRelativeAbundance_Single2bEnzyme.pl
│   ├── CreateQuanDatabase_2bRAD.pl
│   ├── CreateQualDatabase_2bRAD.pl
│   └── 2bRADExtraction.pl
├── README.md
└── bin
    └── 2bRADM_Pipline.pl

/2bRAD-M_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shihuang047/2bRAD-M/HEAD/2bRAD-M_workflow.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rhistory
2 | *.RData
3 | .ipynb_checkpoints/
4 | # LibreOffice lock files
5 | .~lock*
6 | # Apple-OS-styled files
7 | *.DS_Store
8 | *.nc
9 | 
--------------------------------------------------------------------------------
/tools/2bRAD-M-20201225-conda.yml:
--------------------------------------------------------------------------------
1 | channels:
2 |   - conda-forge
3 |   - bioconda
4 |   - defaults
5 | dependencies:
6 |   - perl=5.26.2
7 |   - perl-parallel-forkmanager=2.02
8 |   - pear=0.9.6
9 | 
--------------------------------------------------------------------------------
/manual:
--------------------------------------------------------------------------------
1 | 1. Install the software
2 | conda env create -n 2bRAD-M-20201225 --file tools/2bRAD-M-20201225-conda.yml
3 | conda activate 2bRAD-M-20201225
4 | 
5 | 2. Download the 2b-RAD reference databases and example data
6 | For the NCBI database: perl tools/Download_2bRADTagDB_NCBI.pl your_database_path (default: ./2B-RAD-M-ref_db_NCBI/)
7 | For the GTDB database: perl tools/Download_2bRADTagDB_GTDB.pl your_database_path (default: ./2B-RAD-M-ref_db_GTDB/)
8 | 
9 | 3. Test the pipeline with the sample data
10 | 1) simulate_50:
11 | perl bin/2bRADM_Pipline.pl -t 3 -l your_database_path/list_simulation -d your_database_path -o outdir -gsc 60 -qc no
12 | 2) MSA1002_R1:
13 | perl bin/2bRADM_Pipline.pl -t 3 -l your_database_path/list_mock -d your_database_path -o outdir
14 | 
15 | Note:
16 | You need to activate the conda environment each time you use the pipeline (conda activate 2bRAD-M-20201225).
17 | 
18 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The 2bRAD-M software is licensed under the MIT license.
2 | 
3 | Copyright (c) 2021 QIBEBT, CAS
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/Download_2bRADTagDB_NCBI.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #Author:zhangrongchao, zhangrongchaoxx@163.com 3 | use strict; 4 | use warnings; 5 | use File::Basename qw(dirname basename); 6 | use Cwd 'abs_path'; 7 | 8 | $ARGV[0] ||="2B-RAD-M-ref_db_NCBI"; 9 | 10 | #if($#ARGV!=0){ 11 | # print STDERR "perl $0 outdir\n"; 12 | # exit 1; 13 | #} 14 | 15 | my $outdir=$ARGV[0];#下载目录 16 | 17 | $outdir=abs_path($outdir); 18 | &CheckDir("$outdir"); 19 | 20 | 21 | my @a=('abfh_classify','MSA1002','simulate_50');#分类表,实际数据,模拟数据 22 | #my @b=('BcgI.species');#需要下载的库文件 23 | my @b=('BcgI.species','CjePI.species');#需要下载的库文件 24 | 25 | my %hash_path=( 26 | 'abfh_classify'=>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25889157/abfh_classify_with_speciename.txt.gz',], 27 | 28 | 'MSA1002' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25623566/MSA1002_R1.fq.gz',], 29 | 30 | # 'simulate_50' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25621832/simulate_50.fa.gz',], 31 | 'simulate_50' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25915428/simulate_50.BcgI.fq.gz',], 32 | 33 | 'BcgI.species' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25889544/BcgI.species.fa.gz0', 34 | 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25889658/BcgI.species.fa.gz1',], 35 | 36 | 'CjePI.species'=>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25891653/CjePI.species.fa.gz0', 37 | 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25890987/CjePI.species.fa.gz1', 38 | 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25890996/CjePI.species.fa.gz2', 39 | 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25891002/CjePI.species.fa.gz3',], 40 | ); 41 | 42 | my %hash_md5=( 43 | 'abfh_classify'=>['25f3a20babb56fd9f2a61eeddb82151a',], 44 | 45 | 'MSA1002' =>['bc2b189213975f6d6c0833a4ba726239',], 46 | 47 | # 'simulate_50' =>['9defe990462d3fef8eb69a2c359d72da',], 48 | 'simulate_50' =>['04cafca5b5c23c48774e9d515dde42a8',], 49 | 50 | 'BcgI.species' =>['b36cc8e85fb68f1b3cc5301c49cafe98', 51 | '071b711730ce87e6c1f85f29319a5979',], 52 | 53 | 'CjePI.species'=>['a32c1998d0d800fe336d9f03756b8409', 54 | '1eb528474f89a6550f69c160d0885dd8', 55 | 'b803ea6b0e2bca1c6381b2a15a76876d', 56 | 'a3d5f018fb3410b507759f2eabee4d04',] 57 | ); 58 | 59 | #合并后文件md5 60 | my %complete_md5=( 61 | 'BcgI.species' =>'75171aabcb754e827e5824ae755d06af', 62 | 'CjePI.species'=>'bcfdef3722dfc763e09fd185f580198d', 63 | ); 64 | 65 | #download abfh_classify && MSA1002 && simulate_50 66 | for my $i(@a){ 67 | my $name=(split /\//,$hash_path{$i}[0])[-1]; 68 | my $file_md5;#下载的文件的MD5值 69 | while(1){ 70 | if(-e "$outdir/$name"){ 71 | chomp($file_md5=`md5sum $outdir/$name`); 72 | $file_md5=(split /\s+/,$file_md5)[0]; 73 | } 74 | if(-e "$outdir/$name" && $file_md5 eq $hash_md5{$i}[0]){ 75 | print STDOUT "File $name has been downloaded.\n"; 76 | last; 77 | }else{ 78 | `wget -t 0 -O $outdir/$name $hash_path{$i}[0]`; 79 | } 80 | } 81 | } 82 | #example list 83 | open OU,">$outdir/list_mock" or die "cannot open $outdir/list_mock\n"; 84 | print OU "MSA1002\t$outdir/MSA1002_R1.fq.gz\n"; 
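# The two list files written here (list_mock and list_simulation) are the sample lists that
# the manual passes to bin/2bRADM_Pipline.pl via -l. Each non-comment line holds two
# tab-separated columns, sample_name<TAB>path_to_reads (fa or fq, optionally gzipped), as
# also noted in the usage text of the scripts further down. The print statement above, for
# example, yields a line of the form (the directory prefix depends on where the database
# was downloaded):
#   MSA1002<TAB>your_database_path/MSA1002_R1.fq.gz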
85 | close OU; 86 | 87 | open OU,">$outdir/list_simulation" or die "cannot open $outdir/list_simulation\n"; 88 | #print OU "simulate_50\t$outdir/simulate_50.fa.gz\n"; 89 | print OU "simulate_50\t$outdir/simulate_50.BcgI.fq.gz\n"; 90 | close OU; 91 | 92 | 93 | 94 | #下载数据库文件 95 | for my $i(@b){ 96 | my $cat=""; 97 | while(1){ 98 | my $md5; 99 | if(-e "$outdir/$i.fa.gz"){#存在完成文件 100 | chomp($md5=`md5sum $outdir/$i.fa.gz`); 101 | $md5=(split /\s+/,$md5)[0]; 102 | } 103 | if(-e "$outdir/$i.fa.gz" && $md5 eq $complete_md5{$i}){ 104 | print STDOUT "File $i.fa.gz hash been downloaded.\n"; 105 | `rm -rf $cat`; 106 | last; 107 | }else{ 108 | for my $j(0..$#{$hash_path{$i}}){#循环每个文件 109 | my $name=(split /\//,$hash_path{$i}[$j])[-1]; 110 | my $file_md5;#下载的文件的MD5值 111 | while(1){ 112 | if(-e "$outdir/$name"){ 113 | chomp($file_md5=`md5sum $outdir/$name`); 114 | $file_md5=(split /\s+/,$file_md5)[0]; 115 | } 116 | if(-e "$outdir/$name" && $file_md5 eq $hash_md5{$i}[$j]){ 117 | print STDOUT "File $name has been downloaded.\n"; 118 | $cat .=" $outdir/$name"; 119 | last; 120 | }else{ 121 | `wget -t 0 -O $outdir/$name $hash_path{$i}[$j]`; 122 | } 123 | } 124 | } 125 | `cat $cat > $outdir/$i.fa.gz`; 126 | } 127 | } 128 | } 129 | 130 | print STDOUT "Congratulations! All databases have been downloaded.\n"; 131 | 132 | sub CheckDir{ 133 | my $file = shift; 134 | unless( -d $file ){ 135 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 136 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 137 | } 138 | return 1; 139 | } 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /scripts/FindGenome_ByQualitative.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | use Getopt::Long; 5 | use FindBin qw($Bin); 6 | use File::Basename qw(dirname basename); 7 | 8 | my $author="Zhangrongchao"; 9 | my $time="20201222"; 10 | 11 | my $g_score_threshold ||=5;#对定性的合并结果,进行分类筛选 gscore阈值 12 | my $GCF_threshold ||=1;#鉴定到某个基因组几个标签以上,该基因组才会被纳入定量建库 13 | 14 | 15 | my($list,$database,$outdir,$qual_dir,$help); 16 | GetOptions( 17 | "l:s" => \$list, #待处理样品列表 18 | "d:s" => \$database, #数据库目录 19 | "o:s" => \$outdir, #输出目录 20 | "qualdir:s" => \$qual_dir, #所有样品定性结果总目录 21 | 22 | "gscore:i" => \$g_score_threshold,#筛选分类 23 | "gcf:i" => \$GCF_threshold,#筛选分类中的基因组 24 | 25 | "h|help:s" => \$help, 26 | ); 27 | 28 | sub usage{#帮助 29 | print STDERR "\e[;33;1m 30 | DESCRIPTION 31 | 2b微生物根据定性结果筛选定量基因组生成建库所需list 32 | USAGE 33 | perl $0 34 | PARAMETERS 35 | -l sample list (the line which begins with # will be ignored) 36 | eg: sample(......) 37 | -d database path 38 | -o outdir (if not exists,it will be created) 39 | -qualdir dir of qualitative 40 | OPTIONS 41 | -gscore G score threshold of classify in qualitative analysis, it decides quantitative database. [$g_score_threshold, it means >$g_score_threshold] 42 | -gcf detected tag threshold of GCF in qualitative analysis, it decides quantitative database. 
[$GCF_threshold, it means >$GCF_threshold] 43 | -h|help print help 44 | AUTHOR: $author $time\e[0m\n"; 45 | } 46 | 47 | 48 | if(defined($help)){ 49 | &usage; 50 | exit 0; 51 | } 52 | 53 | #参数检测 54 | unless($list && $database && $outdir && $qual_dir){ 55 | &usage; 56 | print STDERR "para -l -d -o or -qualdir error.\n"; 57 | exit 1; 58 | } 59 | 60 | #数据库文件检测 61 | unless(-e "$database/abfh_classify_with_speciename.txt.gz"){ 62 | print STDERR "incomplete database, $database/abfh_classify_with_speciename.txt.gz does not exists.\n"; 63 | exit 1; 64 | } 65 | 66 | #记录数据库中gcf转化为全部信息 67 | my %gcf2classify_path; 68 | open DB,"gzip -dc $database/abfh_classify_with_speciename.txt.gz|" or die "cannot open $database/abfh_classify_with_speciename.txt.gz\n"; 69 | while(){ 70 | next if(/^#/ || /^$/); 71 | chomp; 72 | my @tmp=split /\t/; 73 | $gcf2classify_path{$tmp[0]}=$_; 74 | } 75 | close DB; 76 | 77 | 78 | &CheckDir("$outdir"); 79 | 80 | open IN,"$list" or die "cannot open $list\n"; 81 | while(){ 82 | next if(/^#/ || /^$/);#跳过注释行和空行 83 | chomp; 84 | my $sample_name=(split /\t/)[0]; 85 | #样品定量列表 86 | ##单样品重建库list准备 87 | ##记录通过G_score阈值的分类 88 | my (%hs_pass_Gscore_class,@enzyme_use); 89 | if(-e "$qual_dir/$sample_name/$sample_name.combine.xls"){ 90 | open XI,"$qual_dir/$sample_name/$sample_name.combine.xls" or die "cannot open $qual_dir/$sample_name/$sample_name.combine.xls\n"; 91 | }else{ 92 | print STDERR "!!!$sample_name does not have $qual_dir/$sample_name/$sample_name.combine.xls, can't do quantitative analysis\n"; 93 | next; 94 | } 95 | while(){ 96 | chomp; 97 | my @tmp=split /\t/; 98 | next if(/^#Kingdom/i);#跳过表头 99 | if(/^#/){#记录 合并使用的酶 组合 100 | my @a=split /\s+/,$tmp[0]; 101 | for my $enzyme(@a){ 102 | $enzyme=~s/^#//; 103 | next if($enzyme eq "combine");#跳过 combine字段 104 | push @enzyme_use,$enzyme; 105 | unless(-e "$database/$enzyme.species.fa.gz"){ 106 | print STDERR "incomplete database, $database/abfh_classify_with_speciename.txt.gz does not exists.\n"; 107 | exit 1; 108 | } 109 | } 110 | next; 111 | } 112 | my $class=join("\t",@tmp[0..$#tmp-8]);#获取分类信息 113 | $hs_pass_Gscore_class{$class}++ if($tmp[-1]>$g_score_threshold);#通过gscore阈值的分类 114 | } 115 | close XI; 116 | &CheckDir("$outdir/$sample_name");#建立每个样品的文件夹 117 | open OU,"|sort|uniq > $outdir/$sample_name/sdb.list" or die "cannot open $outdir/$sample_name/sdb.list\n"; #输出选出的基因组列表,并排序去重 118 | for my $enzyme(@enzyme_use){ 119 | if(-e "$qual_dir/$sample_name/$sample_name.$enzyme.GCF_detected.xls"){ 120 | open QU,"$qual_dir/$sample_name/$sample_name.$enzyme.GCF_detected.xls" or die "cannot open $qual_dir/$sample_name/$sample_name.$enzyme.GCF_detected.xls\n"; 121 | }else{ 122 | print STDERR "warning: $sample_name does not have $qual_dir/$sample_name/$sample_name.$enzyme.GCF_detected.xls\n"; 123 | next; 124 | } 125 | while(){ 126 | chomp; 127 | my @tmp=split /\t/; 128 | my $class=join("\t",@tmp[0..$#tmp-4]); 129 | if(exists $hs_pass_Gscore_class{$class} && $tmp[-2]>$GCF_threshold){ 130 | print OU "$gcf2classify_path{$tmp[-4]}\n"; 131 | # my @all_class=split /\t/,$gcf2classify_path{$tmp[-4]}; 132 | # print OU join("\t",@all_class[0..8]),"\t$database/genome_ref/$all_class[-1]\n"; 133 | } 134 | } 135 | close QU; 136 | } 137 | close OU; 138 | 139 | } 140 | close IN; 141 | 142 | 143 | sub CheckDir{# create the directory 144 | my $file = shift; 145 | unless( -d $file ){ 146 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 147 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 148 | } 149 | return 1; 150 | } 151 
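# Summary of the selection rule implemented above: a taxon passes the qualitative screen
# when its G score in the sample's *.combine.xls exceeds -gscore, and a genome (GCF) of a
# passing taxon is written to sdb.list for the quantitative (re-profiling) database when
# its detected-tag count in *.GCF_detected.xls exceeds -gcf. A minimal stand-alone sketch
# of that predicate, using hypothetical names (the real thresholds and values come from
# the files parsed above):
sub _keep_genome_sketch {
    my ($taxon_g_score, $gcf_detected_tag_num, $gscore_cutoff, $gcf_cutoff) = @_;
    return ($taxon_g_score > $gscore_cutoff) && ($gcf_detected_tag_num > $gcf_cutoff);
}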
| 152 | 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /scripts/MergeProfilesFromMultipleSamples.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | use Getopt::Long; 5 | use Cwd 'abs_path'; 6 | 7 | my ($list,$outdir,$prefix,$mock,$control,$help); 8 | GetOptions( 9 | "l:s" => \$list, 10 | "o:s" => \$outdir, 11 | "p:s" => \$prefix, 12 | 13 | "m:s" => \$mock, 14 | "c:s" => \$control, 15 | 16 | "h|help:s" => \$help, 17 | ); 18 | sub usage{ 19 | print STDERR "\e[;33;1m 20 | DESCRIPTION 21 | Merge the abundance profiles from mulitple samples. If negative control samples were provided, 22 | this script can also filter all taxa in negative control samples (i.e., potential contaminations) for each biological sample. 23 | USAGE 24 | perl $0 25 | 26 | PARAMETERS 27 | -l A list file indicating the sample_id and corresponding output files from the last step. 28 | e.g., sample_idqualitative/sample_id/sample_id.combine.xls 29 | -o The output directory 30 | -p The output prefix 31 | OPTIONS 32 | -m The mock-community sample name(s) (separated by commas). 33 | -c The sample name(s) (separated by commas) of negative control that can be used for filtering potential contaminations. 34 | 35 | -h|help help 36 | AUTHOR: ZRC 2020.09.14 37 | \e[0m\n"; 38 | } 39 | 40 | if(defined($help)){ 41 | &usage; 42 | exit 0; 43 | } 44 | 45 | unless($list && $outdir && $prefix){ 46 | &usage; 47 | print STDERR "para -l -o or -p error.\n"; 48 | exit 1; 49 | } 50 | 51 | &CheckDir("$outdir"); 52 | #记录mock样品 53 | my (%hash_mock,%hash_control); 54 | if(defined($mock)){ 55 | my @mock=split /,/,$mock; 56 | for(@mock){ 57 | $hash_mock{$_}++; 58 | } 59 | } 60 | #记录control样品 61 | if(defined($control)){ 62 | my @control=split /,/,$control; 63 | for(@control){ 64 | $hash_control{$_}++; 65 | } 66 | } 67 | 68 | #读取定性/定量计算结果文件 69 | my (%hash_specie,%hash_all,@sample_sort,$classify_col,$head); 70 | #循环样品 71 | open LI,"$list" or die "cannot open $list\n"; 72 | while(
  • ){ 73 | next if (/^#/ || /^$/);#去除注释行和空行 74 | chomp; 75 | my ($sample,$path)=split /\t/; 76 | $path=abs_path($path); 77 | unless(-e $path){ 78 | print STDERR "warning: $sample $path not exist, cannot be calculate Abundance.\n"; 79 | next; 80 | } 81 | push @sample_sort,$sample;#记录样品顺序 82 | open IN,"$path" or die "cannot open $path\n"; 83 | while(){ 84 | chomp; 85 | my @tmp=split /\t/; 86 | if(/^#Kingdom/i){ 87 | for my $i(0..$#tmp){#确定分类列 88 | if($tmp[$i] eq "Theoretical_Tag_Num"){ 89 | $classify_col=$i-1; 90 | $head=join("\t",@tmp[0..$classify_col]); 91 | last; 92 | } 93 | } 94 | } 95 | next if(/^#/);#跳过注释行 96 | my $id=join("\t",@tmp[0..$classify_col]); 97 | $hash_specie{$id}{$sample}=$tmp[-4];#记录Sequenced_Reads_Num/Theoretical_Tag_Num值 98 | $hash_all{$sample}+=$tmp[-4];#记录总数 99 | } 100 | close IN; 101 | } 102 | close LI; 103 | 104 | #输出所有样品丰度计算结果 105 | open OU,">$outdir/$prefix.all.xls" or die "cannot open $outdir/$prefix.all.xls\n"; 106 | print OU "$head\t",join("\t",@sample_sort),"\n";#表头 107 | for my $id(sort {$a cmp $b} keys %hash_specie){#循环检测到的物种 108 | my $judge=0; 109 | my $print=$id; 110 | for my $sample(@sample_sort){#循环样品 111 | if(exists $hash_specie{$id}{$sample}){ 112 | my $percent=$hash_specie{$id}{$sample}/$hash_all{$sample}; 113 | if($percent==0){ 114 | $print .="\t0"; 115 | }else{ 116 | $print .="\t$percent"; 117 | $judge++; 118 | } 119 | }else{ 120 | $print .="\t0"; 121 | } 122 | } 123 | print OU "$print\n" if ($judge!=0); 124 | } 125 | close OU; 126 | 127 | #输出 删除mock和阴性对照样品,以及阴性对照检测出来的菌 的结果 128 | open OU,">$outdir/$prefix.filtered.xls" or die "cannot open $outdir/$prefix.filtered.xls\n"; 129 | #表头处理 130 | print OU "$head"; 131 | for(@sample_sort){ 132 | next if(exists $hash_mock{$_} || exists $hash_control{$_});#过滤掉mock和阴性对照样品 133 | print OU "\t$_"; 134 | } 135 | print OU "\n"; 136 | for my $id(sort {$a cmp $b} keys %hash_specie){#循环检测到的物种 137 | my $judge=0;#整行判断 138 | my $judge_control=0;#阴性对照判断 139 | my $print=$id; 140 | for my $sample(@sample_sort){ 141 | next if(exists $hash_mock{$sample});#过滤掉mock样品 142 | next if(exists $hash_control{$sample});#过滤掉阴性对照样品 143 | # if(exists $hash_control{$sample}){#阴性对照样品处理 144 | # $judge_control++ if(exists $hash_specie{$id}{$sample} && $hash_specie{$id}{$sample}!=0); 145 | # next; 146 | # } 147 | if(exists $hash_specie{$id}{$sample}){ 148 | my $percent=0; 149 | $percent=$hash_specie{$id}{$sample}/$hash_all{$sample} if($hash_all{$sample}!=0); 150 | if($percent==0){ 151 | $print .="\t0"; 152 | }else{ 153 | $print .="\t$percent"; 154 | $judge++; 155 | } 156 | }else{ 157 | $print .="\t0"; 158 | } 159 | } 160 | print OU "$print\n" if ($judge!=0 && $judge_control==0); 161 | } 162 | 163 | sub CheckDir{#创建目录 164 | my $file = shift; 165 | unless( -d $file ){ 166 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 167 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 168 | } 169 | return 1; 170 | } 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /tools/Download_2bRADTagDB_GTDB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #Author:zhangrongchao, zhangrongchaoxx@163.com 3 | use strict; 4 | use warnings; 5 | use File::Basename qw(dirname basename); 6 | use Cwd 'abs_path'; 7 | 8 | $ARGV[0] ||="2B-RAD-M-ref_db_GTDB"; 9 | 10 | #if($#ARGV!=0){ 11 | # 
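# Note: like tools/Download_2bRADTagDB_NCBI.pl above, this GTDB-based downloader fetches
# each tag database in several MD5-verified chunks (BcgI.species.fa.gz0, .gz1, ...),
# re-downloads any chunk whose checksum does not match, concatenates the chunks into
# <enzyme>.species.fa.gz, and finally checks the MD5 of the merged file; only the figshare
# URLs and checksums differ from the NCBI version.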
print STDERR "perl $0 outdir\n"; 12 | # exit 1; 13 | #} 14 | 15 | my $outdir=$ARGV[0];#下载目录 16 | 17 | $outdir=abs_path($outdir); 18 | &CheckDir("$outdir"); 19 | 20 | 21 | my @a=('abfh_classify','MSA1002','simulate_50');#分类表,实际数据,模拟数据 22 | #my @b=('BcgI.species');#需要下载的库文件 23 | my @b=('BcgI.species','CjePI.species');#需要下载的库文件 24 | 25 | my %hash_path=( 26 | 'abfh_classify'=>['https://figshare.com/ndownloader/files/31653170/abfh_classify_with_speciename.txt.gz',], 27 | 28 | 'MSA1002' =>['https://figshare.com/ndownloader/files/25623566/MSA1002_R1.fq.gz',], 29 | 30 | # 'simulate_50' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25621832/simulate_50.fa.gz',], 31 | 'simulate_50' =>['https://figshare.com/ndownloader/files/25915428/simulate_50.BcgI.fq.gz',], 32 | 33 | 'BcgI.species' =>['https://figshare.com/ndownloader/files/31653911/BcgI.species.fa.gz0', 34 | 'https://figshare.com/ndownloader/files/31659299/BcgI.species.fa.gz1', 35 | 'https://figshare.com/ndownloader/files/31653614/BcgI.species.fa.gz2',], 36 | 37 | 'CjePI.species'=>['https://figshare.com/ndownloader/files/31660241/CjePI.species.fa.gz0', 38 | 'https://figshare.com/ndownloader/files/31660358/CjePI.species.fa.gz1', 39 | 'https://figshare.com/ndownloader/files/31662320/CjePI.species.fa.gz2', 40 | 'https://figshare.com/ndownloader/files/31662794/CjePI.species.fa.gz3', 41 | 'https://figshare.com/ndownloader/files/31659818/CjePI.species.fa.gz4',], 42 | ); 43 | 44 | my %hash_md5=( 45 | 'abfh_classify'=>['c2faa9ae97b704b3d0705709cf22ecb4',], 46 | 47 | 'MSA1002' =>['bc2b189213975f6d6c0833a4ba726239',], 48 | 49 | # 'simulate_50' =>['9defe990462d3fef8eb69a2c359d72da',], 50 | 'simulate_50' =>['04cafca5b5c23c48774e9d515dde42a8',], 51 | 52 | 'BcgI.species' =>['a1b70d0de71093a0bb9bedbadab641b0', 53 | '383fd8c85a23aee4a48d48aa41845f17', 54 | 'd19a5ce115fac8708fb0919f619ddf19',], 55 | 56 | 'CjePI.species'=>['8b1c62c80bdf3b05182f2fe47d0f0751', 57 | '4662c85ef0e12a749d8b9284302e2a18', 58 | 'ed3d3a27df05b7c0eb97140f78f54a75', 59 | '063b3c362f41889037b3bb15d8a0617f', 60 | '021a06a6e926b4ba91acba0c398877d7',] 61 | ); 62 | 63 | #合并后文件md5 64 | my %complete_md5=( 65 | 'BcgI.species' =>'eea6b5ec34b00a749d45199a91fd3e34', 66 | 'CjePI.species'=>'3d9913da22ac340357d4e708a7506de8', 67 | ); 68 | 69 | #download abfh_classify && MSA1002 && simulate_50 70 | for my $i(@a){ 71 | my @tmp=split /\//,$hash_path{$i}[0]; 72 | my $url=join("/",@tmp[0..$#tmp-1]); 73 | my $name=$tmp[-1]; 74 | my $file_md5;#下载的文件的MD5值 75 | while(1){ 76 | if(-e "$outdir/$name"){ 77 | chomp($file_md5=`md5sum $outdir/$name`); 78 | $file_md5=(split /\s+/,$file_md5)[0]; 79 | } 80 | if(-e "$outdir/$name" && $file_md5 eq $hash_md5{$i}[0]){ 81 | print STDOUT "File $name has been downloaded.\n"; 82 | last; 83 | }else{ 84 | `wget -t 0 -O $outdir/$name $url`; 85 | } 86 | } 87 | } 88 | #example list 89 | open OU,">$outdir/list_mock" or die "cannot open $outdir/list_mock\n"; 90 | print OU "MSA1002\t$outdir/MSA1002_R1.fq.gz\n"; 91 | close OU; 92 | 93 | open OU,">$outdir/list_simulation" or die "cannot open $outdir/list_simulation\n"; 94 | #print OU "simulate_50\t$outdir/simulate_50.fa.gz\n"; 95 | print OU "simulate_50\t$outdir/simulate_50.BcgI.fq.gz\n"; 96 | close OU; 97 | 98 | 99 | 100 | #下载数据库文件 101 | for my $i(@b){ 102 | my $cat=""; 103 | while(1){ 104 | my $md5; 105 | if(-e "$outdir/$i.fa.gz"){#存在完成文件 106 | chomp($md5=`md5sum $outdir/$i.fa.gz`); 107 | $md5=(split /\s+/,$md5)[0]; 108 | } 109 | if(-e "$outdir/$i.fa.gz" && $md5 eq $complete_md5{$i}){ 110 | print STDOUT "File $i.fa.gz hash been 
downloaded.\n"; 111 | `rm -rf $cat`; 112 | last; 113 | }else{ 114 | for my $j(0..$#{$hash_path{$i}}){#循环每个文件 115 | my @tmp=split /\//,$hash_path{$i}[$j]; 116 | my $url=join("/",@tmp[0..$#tmp-1]); 117 | my $name=$tmp[-1]; 118 | my $file_md5;#下载的文件的MD5值 119 | while(1){ 120 | if(-e "$outdir/$name"){ 121 | chomp($file_md5=`md5sum $outdir/$name`); 122 | $file_md5=(split /\s+/,$file_md5)[0]; 123 | } 124 | if(-e "$outdir/$name" && $file_md5 eq $hash_md5{$i}[$j]){ 125 | print STDOUT "File $name has been downloaded.\n"; 126 | $cat .=" $outdir/$name"; 127 | last; 128 | }else{ 129 | `wget -t 0 -O $outdir/$name $url`; 130 | } 131 | } 132 | } 133 | `cat $cat > $outdir/$i.fa.gz`; 134 | } 135 | } 136 | } 137 | 138 | print STDOUT "Congratulations! All databases have been downloaded.\n"; 139 | 140 | sub CheckDir{ 141 | my $file = shift; 142 | unless( -d $file ){ 143 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 144 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 145 | } 146 | return 1; 147 | } 148 | 149 | 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /scripts/CalculateRelativeAbundance_Combined2bEnzymes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | use Getopt::Long; 5 | use FindBin qw($Bin); 6 | use File::Basename qw(dirname basename); 7 | 8 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 9 | my $time="2020.06.03"; 10 | 11 | #默认值 12 | my $mark ||="combine"; 13 | my $g_score_threshold ||=0; 14 | 15 | #Standard output for clearing cache 16 | select STDOUT;$|=1; 17 | 18 | my ($list,$site,$outdir,$help); 19 | GetOptions( 20 | "l:s" => \$list, 21 | "s:s" => \$site, 22 | "io:s" => \$outdir, 23 | 24 | "m:s" => \$mark, 25 | "g:i" => \$g_score_threshold, 26 | "h|help:s" => \$help, 27 | ); 28 | 29 | 30 | sub usage{# helper information 31 | print STDERR "\e[;33;1m 32 | DESCRIPTION 33 | It computes the relative abundance of taxa identified from each of 2b-RAD samples using a precalcuated taxa-specific 2b-RAD reference database by one or multiple type 2b restriction enzymes. 34 | USAGE 35 | perl $0 36 | PARAMETERS 37 | -l The path of the input filepath list (the line which begins with # will be ignored) 38 | eg: sample_name... 39 | -s One or multiple type 2b restriction enzymes (sites). The selected sites should be separated by comma. 40 | [1]CspCI [9]BplI 41 | [2]AloI [10]FalI 42 | [3]BsaXI [11]Bsp24I 43 | [4]BaeI [12]HaeIV 44 | [5]BcgI [13]CjePI 45 | [6]CjeI [14]Hin4I 46 | [7]PpiI [15]AlfI 47 | [8]PsrI [16]BslFI 48 | [17]All_Detected_Enzyme 49 | -io The input and output directory 50 | OPTIONS 51 | -m Whether the taxa idenfication or abundance estimation should take into account for the 2b-RAD taxa-specific markers from more than one restriction sites [combine] 52 | -g The G-score threshold [$g_score_threshold, it means >=$g_score_threshold] To control the false-positive in the species identification, G score was derived for each species identified within a sample, which is a harmonious mean of read coverage of 2b-RAD markers belongs to a species and number of all possible 2b-RAD markers of this species. Therecommended/default threshold is $g_score_threshold. 53 | -h|help print this help. 
54 | AUTHOR: $author $time\e[0m\n"; 55 | } 56 | 57 | if(defined($help)){ 58 | &usage; 59 | exit 0; 60 | } 61 | 62 | my %hs_site2enzyme=(#the codes for all restriction enzymes 63 | '1' => 'CspCI', '2' => 'AloI', 64 | '3' => 'BsaXI', '4' => 'BaeI', 65 | '5' => 'BcgI', '6' => 'CjeI', 66 | '7' => 'PpiI', '8' => 'PsrI', 67 | '9' => 'BplI', '10' => 'FalI', 68 | '11' => 'Bsp24I', '12' => 'HaeIV', 69 | '13' => 'CjePI', '14' => 'Hin4I', 70 | '15' => 'AlfI', '16' => 'BslFI', 71 | ); 72 | 73 | unless($list && $site && $outdir){ 74 | &usage; 75 | exit 1; 76 | } 77 | 78 | # check the availability of the taxa-specific 2b-RAD reference genome database 79 | if($site=~/17/){ 80 | $site="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"; 81 | } 82 | my @site=split /,/,$site; 83 | for $site(@site){ 84 | unless(exists $hs_site2enzyme{$site}){#检测酶切位点是否存在 85 | &usage; 86 | print STDERR "Parameter -s is wrong\n"; 87 | exit 1; 88 | } 89 | } 90 | =pod 91 | #注释文件检测并读取 92 | my %hs_anno; 93 | unless(-e "$database/species_ID_annotation.txt"){ 94 | &usage; 95 | print STDERR "cannot find $database/species_ID_annotation.txt\n"; 96 | exit; 97 | }else{ 98 | open AN,"$database/species_ID_annotation.txt" or die "cannot open $database/species_ID_annotation.txt\n"; 99 | while(){ 100 | next if(/^#/ || /^$/);#去除注释行和空行 101 | chomp; 102 | my @tmp=split /\t/; 103 | $hs_anno{$tmp[2]}="$tmp[1]\t$tmp[3]"; 104 | } 105 | close AN; 106 | } 107 | =cut 108 | 109 | #合并处理 110 | open LI,"$list" or die "cannot open $list\n"; 111 | while(
  • ){#循环样品 112 | next if(/^#/ || /^$/);#去除注释行和空行 113 | chomp; 114 | my (%hs_sample,$head); 115 | my @use_site;#用到的酶 结果 116 | my $sample_name=(split /\t/)[0]; 117 | my $cnt=0; 118 | for $site(@site){# iterate all enzymes 119 | if(-e "$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls"){ 120 | push @use_site,$hs_site2enzyme{$site}; 121 | }else{ 122 | print STDERR "warning: $sample_name cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls\n"; 123 | next;#跳过没有鉴定的酶 文件 124 | } 125 | $cnt++; 126 | open IN,"$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls" or die "cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls"; 127 | while(){ 128 | chomp; 129 | my $line=$_; 130 | next if($line=~/^#/); 131 | if($line=~/^Kingdom/i){ 132 | $head=$line; 133 | next; 134 | } 135 | my @tmp=split /\t/; 136 | my $class=join("\t",@tmp[0..$#tmp-8]); 137 | $hs_sample{$class}{-8}+=$tmp[-8];#理论标签数 138 | $hs_sample{$class}{-7}+=$tmp[-7];#测序得到的标签数 139 | $hs_sample{$class}{-5}+=$tmp[-5];#测序得到的reads数 140 | $hs_sample{$class}{-2}+=$tmp[-2];#测到的深度大于1的标签数 141 | } 142 | close IN; 143 | } 144 | next if($cnt==0);#如果所有的酶都没有结果,那么不继续输出 145 | open OU,">$outdir/$sample_name/$sample_name.$mark.xls" or die "cannot open $outdir/$sample_name/$sample_name.$mark.xls\n"; 146 | print OU "#@use_site combine\n"; 147 | # print OU "#Kingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecie\tTheoretical_Tag_Num\tSequenced_Tag_Num\tPercent\tSequenced_Reads_Num\tSequenced_Reads_Num/Theoretical_Tag_Num\tSequenced_Reads_Num/Sequenced_Tag_Num\tG_Score\ttaxid\tunique_name\n"; 148 | print OU "#$head\n"; 149 | for my $class(keys %hs_sample){ 150 | my @tmp=split /\t/,$class; 151 | my $Theoretical_Tag_Num=$hs_sample{$class}{-8}; 152 | my $Sequenced_Tag_Num=$hs_sample{$class}{-7}; 153 | my $Sequenced_Tag_Num2Theoretical_Tag_Num=sprintf "%.8f",$Sequenced_Tag_Num/$Theoretical_Tag_Num*100; 154 | my $Sequenced_Reads_Num=$hs_sample{$class}{-5}; 155 | my $Sequenced_Reads_Num2Theoretical_Tag_Num=sprintf "%.8f",$Sequenced_Reads_Num/$Theoretical_Tag_Num; 156 | my $Sequenced_Reads_Num2Sequenced_Tag_Num=sprintf "%.8f",$Sequenced_Reads_Num/$Sequenced_Tag_Num; 157 | my $Sequenced_Tag_Num_2=$hs_sample{$class}{-2}; 158 | my $G_Score=sprintf "%.8f",sqrt($Sequenced_Tag_Num*$Sequenced_Reads_Num); 159 | next if ($G_Score<$g_score_threshold);#过滤gscore阈值 160 | print OU "$class\t$Theoretical_Tag_Num\t$Sequenced_Tag_Num\t$Sequenced_Tag_Num2Theoretical_Tag_Num%\t"; 161 | print OU "$Sequenced_Reads_Num\t$Sequenced_Reads_Num2Theoretical_Tag_Num\t$Sequenced_Reads_Num2Sequenced_Tag_Num\t"; 162 | # print OU "$G_Score\t$hs_anno{$tmp[6]}\n"; 163 | print OU "$Sequenced_Tag_Num_2\t$G_Score\n"; 164 | } 165 | close OU; 166 | undef %hs_sample; 167 | } 168 | close IN; 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /scripts/CalculateRelativeAbundance_Single2bEnzyme.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | use Getopt::Long; 5 | use FindBin qw($Bin); 6 | use File::Basename qw(dirname basename); 7 | use Cwd 'abs_path'; 8 | 9 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 10 | my $time="2020.12.21"; 11 | 12 | #set default parameters 13 | my $g_score_threshold ||=0; 14 | my $verbose ||="yes"; 15 | 16 | select STDOUT;$|=1;#Standard output for clearing cache 17 | 18 | my ($list,$database,$site,$outdir,$level); 19 | GetOptions( 20 | "l:s" => \$list, 21 | 
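# What the "combine" step above does: for every taxon it sums Theoretical_Tag_Num,
# Sequenced_Tag_Num, Sequenced_Reads_Num and the depth>1 tag count across the per-enzyme
# result tables ($sample.$enzyme.xls), then recomputes the ratio columns and the G score
# from those sums and writes $sample.combine.xls. Downstream,
# MergeProfilesFromMultipleSamples.pl turns the Sequenced_Reads_Num/Theoretical_Tag_Num
# column into a relative abundance by dividing each taxon's value by the per-sample sum of
# that column.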
"d:s" => \$database, 22 | "t:s" => \$level, 23 | "s:s" => \$site, 24 | "o:s" => \$outdir, 25 | 26 | "g:i" => \$g_score_threshold, 27 | "v:s" => \$verbose, 28 | ); 29 | 30 | 31 | sub usage{# help information 32 | print STDERR "\e[;33;1m 33 | DESCRIPTION 34 | It computes the relative abundance of taxa identified from each of 2b-RAD samples using a precalcuated taxa-specific 2b-RAD reference database by a single type 2b restriction enzyme. 35 | USAGE 36 | perl $0 37 | Required: 38 | -l The path of the input filepath list (the line that begin with # will be ignored) e.g: sample_namedata_path(fa|fq)(.gz). 39 | -d The database filepath. 40 | -t The taxonomy level of the taxa-specific 2b-RAD database used. It should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. 41 | -s One of the type 2b restriction enzymes (sites). 42 | [1]CspCI [9]BplI 43 | [2]AloI [10]FalI 44 | [3]BsaXI [11]Bsp24I 45 | [4]BaeI [12]HaeIV 46 | [5]BcgI [13]CjePI 47 | [6]CjeI [14]Hin4I 48 | [7]PpiI [15]AlfI 49 | [8]PsrI [16]BslFI 50 | -o The output directory (automatically create if it does not exist) 51 | Optional: 52 | -g The threshold of G score [$g_score_threshold, it means >=$g_score_threshold]. To control the false-positive in the species identification, G score was derived for each speciesidentified within a sample, which is a harmonious mean of read coverage of 2b-RAD markers belongs to a species and number of all possible 2b-RAD markers of this species. Therecommended/default threshold is $g_score_threshold. 53 | -v This specify if more detailed information will be shown [$verbose] (yes or no) 54 | AUTHOR: $author $time\e[0m\n"; 55 | } 56 | 57 | 58 | my %hs_site2enzyme=(# the codes for all restriction enzymes 59 | '1' => 'CspCI', '2' => 'AloI', 60 | '3' => 'BsaXI', '4' => 'BaeI', 61 | '5' => 'BcgI', '6' => 'CjeI', 62 | '7' => 'PpiI', '8' => 'PsrI', 63 | '9' => 'BplI', '10' => 'FalI', 64 | '11' => 'Bsp24I', '12' => 'HaeIV', 65 | '13' => 'CjePI', '14' => 'Hin4I', 66 | '15' => 'AlfI', '16' => 'BslFI', 67 | ); 68 | 69 | my %hs_type_database=( 70 | 'kingdom' => '1', 71 | 'phylum' => '2', 72 | 'class' => '3', 73 | 'order' => '4', 74 | 'family' => '5', 75 | 'genus' => '6', 76 | 'species' => '7', 77 | 'strain' => '8', 78 | ); 79 | 80 | my @HEAD=( 81 | 'Kingdom', 82 | 'Phylum', 83 | 'Class', 84 | 'Order', 85 | 'Family', 86 | 'Genus', 87 | 'Species', 88 | 'Strain', 89 | ); 90 | 91 | unless($list && $database && $level && $site && $outdir){ 92 | &usage; 93 | exit 1; 94 | } 95 | 96 | #转换绝对路径 97 | $list=abs_path($list); 98 | $database=abs_path($database); 99 | $outdir=abs_path($outdir); 100 | 101 | 102 | # parameter checking 103 | unless($verbose eq "yes" || $verbose eq "no"){ 104 | &usage; 105 | print STDERR "Parameter -v is wrong\n"; 106 | exit 1; 107 | } 108 | # check the taxonomic level of a 2b-RAD reference genome database 109 | unless($level eq "kingdom" || $level eq "phylum" || $level eq "class" || $level eq "order" || $level eq "family" || $level eq "genus" || $level eq "species" || $level eq "strain"){ 110 | &usage; 111 | print STDERR "Parameter -t is wrong. 
Cannot get $level\n"; 112 | exit 1; 113 | } 114 | # check the parameter -s and -d 115 | unless(exists $hs_site2enzyme{$site}){ 116 | &usage; 117 | print STDERR "Parameter -s $site is wrong\n"; 118 | exit 1; 119 | } 120 | #检查库文件 121 | unless(-e "$database/$hs_site2enzyme{$site}.$level.fa.gz" && -e "$database/abfh_classify_with_speciename.txt.gz"){ 122 | &usage; 123 | print STDERR "Incomplete database, please check the parameter(-d).\n"; 124 | exit 1; 125 | } 126 | 127 | print STDOUT "COMMAND: perl $0 -l $list -d $database -t $level -s $site -o $outdir -g $g_score_threshold -v $verbose\n"; 128 | 129 | &CheckDir($outdir); 130 | 131 | my $head=join("\t",@HEAD[0..$hs_type_database{$level}-1]); 132 | # load the database 133 | print STDOUT "### Loading the database, $database/$hs_site2enzyme{$site}.$level.fa.gz, ",`date`; 134 | my (%hs_tag2GCF,%hs_GCF2class,%hs_tag_theory_num); 135 | my $all_genome_num; 136 | open LI,"gzip -dc $database/abfh_classify_with_speciename.txt.gz|" or die "cannot open $database/abfh_classify_with_speciename.txt.gz\n"; 137 | while(
  • ){ 138 | next if(/^#/ || /^$/);#去掉注释行和空行 139 | chomp; 140 | my @tmp=split /\t/; 141 | my $class=join("\t",@tmp[1..$hs_type_database{$level}]);# all taxonomic levels 142 | $hs_GCF2class{$tmp[0]}=$class;# record the corresponding taxonomy for each GCF 143 | $all_genome_num++; 144 | } 145 | close LI; 146 | 147 | my (%hash_gcf_rank,%complete); 148 | $/=">"; 149 | open IN,"gzip -dc $database/$hs_site2enzyme{$site}.$level.fa.gz|" or die "cannot open $database/$hs_site2enzyme{$site}.$level.fa.gz\n"; 150 | ; 151 | while(){ 152 | chomp; 153 | my($id,$tag)=split /\n/; 154 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 155 | my @tmp=split /\|/,$id; 156 | $hash_gcf_rank{$tmp[0]}++; 157 | next if($tmp[5]!=1);#跳过非unique标签 158 | my $class=$hs_GCF2class{$tmp[0]};# all taxonomic levels 159 | push @{$hs_tag2GCF{$tag}},$tmp[0];# record GCF for each 2b tag 160 | $hs_tag_theory_num{$class}{$tmp[0]}{$tag}++;# compute the number of 2b tags of each GCF under a given taxon and record the # of all 2b tags from the same taxa 161 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 162 | if((keys %hash_gcf_rank)/$all_genome_num*100>=$i){ 163 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 164 | $complete{$i}++; 165 | last; 166 | } 167 | } 168 | } 169 | close IN; 170 | $/="\n"; 171 | print STDOUT "###Loading database completed, ",`date`; 172 | 173 | # process each sample in the list file 174 | open LI,"$list" or die "cannot open $list\n"; 175 | while(
  • ){ 176 | next if(/^#/ || /^$/); # 去除注释行和空行 177 | chomp; 178 | my ($sample_name,$sample_data)=split /\t/; 179 | $sample_data=abs_path($sample_data);#转为绝对路径 180 | print STDOUT "###($sample_name) Sample identification started, ",`date`; 181 | my (%hs_tag_num,%hs_detected_GCF_tag); 182 | # load a single sample 183 | if($sample_data=~/\.gz$/){ 184 | open IN,"gzip -dc $sample_data|" or die "cannot open $sample_data\n"; 185 | }else{ 186 | open IN,"$sample_data" or die "cannot open $sample_data\n"; 187 | } 188 | while(){ 189 | my $line=$_; 190 | if($line=~/^@/){#fastq 191 | $line .= . . ; 192 | }elsif($line=~/^>/){#fasta 193 | $line .=; 194 | } 195 | my $tag=(split /\n/,$line)[1]; 196 | if(exists $hs_tag2GCF{$tag}){ 197 | my $class=$hs_GCF2class{$hs_tag2GCF{$tag}[0]}; 198 | $hs_tag_num{$class}{$tag}++;#实际样品标签深度 199 | for my $i(0..$#{$hs_tag2GCF{$tag}}){ 200 | $hs_detected_GCF_tag{$class}{$hs_tag2GCF{$tag}[$i]}{$tag}=$hs_tag_theory_num{$class}{$hs_tag2GCF{$tag}[$i]}{$tag}; 201 | } 202 | }else{#反向互补 203 | $tag=~tr/ATCG/TAGC/; 204 | $tag=reverse($tag); 205 | if(exists $hs_tag2GCF{$tag}){ 206 | my $class=$hs_GCF2class{$hs_tag2GCF{$tag}[0]}; 207 | $hs_tag_num{$class}{$tag}++;#实际样品标签深度 208 | for my $i(0..$#{$hs_tag2GCF{$tag}}){ 209 | $hs_detected_GCF_tag{$class}{$hs_tag2GCF{$tag}[$i]}{$tag}=$hs_tag_theory_num{$class}{$hs_tag2GCF{$tag}[$i]}{$tag}; 210 | } 211 | } 212 | } 213 | } 214 | close IN; 215 | 216 | if((keys %hs_tag_num)==0){# go to the next sample if no 2b-RAD tag was detected in a sample 217 | print STDERR "!!!($sample_name) Warning: $hs_site2enzyme{$site} $level the number of 2b-RAD tags for this sample is zero\n"; 218 | print STDOUT "###($sample_name) Sample idenfication completed, ",`date`; 219 | next; 220 | } 221 | &CheckDir("$outdir/$sample_name");# create a filepath for each sample 222 | # compute the number of therotical and actual 2b-RAD tags for each GCF in each sample 223 | open DE,">$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.GCF_detected.xls" or die "cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.GCF_detected.xls\n"; 224 | for my $class(sort {$a cmp $b} keys %hs_detected_GCF_tag){ 225 | for my $GCF(sort {$a cmp $b} keys %{$hs_detected_GCF_tag{$class}}){ 226 | my $GCF_all_theory_num; 227 | my $detected_tag_num; 228 | $GCF_all_theory_num=keys %{$hs_tag_theory_num{$class}{$GCF}};# theoratical, the number of taxa-specific 2b-RAD tags from each GCF 229 | $detected_tag_num=keys %{$hs_detected_GCF_tag{$class}{$GCF}};# in a real sample, the number of taxa-specific 2b-RAD tags from each GCF 230 | my $percent=sprintf "%.4f",$detected_tag_num/$GCF_all_theory_num;# the percentage of detected 2b-RAD tags from a GCF specific to a taxon 231 | print DE "$class\t$GCF\t$GCF_all_theory_num\t$detected_tag_num\t$percent\n"; 232 | } 233 | } 234 | close DE; 235 | # Output 236 | &CheckDir("$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}") if($verbose eq "yes"); 237 | open OU,">$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls" or die "cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls\n"; 238 | print OU "$head\tTheoretical_Tag_Num\tSequenced_Tag_Num\tPercent\t"; 239 | print OU "Sequenced_Reads_Num\tSequenced_Reads_Num/Theoretical_Tag_Num\tSequenced_Reads_Num/Sequenced_Tag_Num\tSequenced_Tag_Num(depth>1)\t"; 240 | print OU "G_Score\n"; 241 | for my $class(keys %hs_tag_num){ 242 | if($verbose eq "yes"){ # output the detailed information on sequencing coverage of each 2b-RAD tag from a given genome detected in the real sample 243 
| my $output_name=(split /\t/,$class)[-1]; 244 | open DETAIL,">$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}/$output_name.xls" or die "cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}/$output_name.xls\n"; 245 | } 246 | my ($Theoretical_Tag_Num,$Sequenced_Tag_Num,$Sequenced_Tag_Num_2,$Sequenced_Tag_Num2Theoretical_Tag_Num); 247 | my ($Sequenced_Reads_Num,$Sequenced_Reads_Num2Theoretical_Tag_Num,$Sequenced_Reads_Num2Sequenced_Tag_Num); 248 | my ($G_Score); 249 | $Sequenced_Tag_Num=$Sequenced_Reads_Num=$Sequenced_Tag_Num_2=0; 250 | for my $tag(keys %{$hs_tag_num{$class}}){# iterate each 2b-RAD tag 251 | $Sequenced_Tag_Num++; 252 | $Sequenced_Tag_Num_2++ if($hs_tag_num{$class}{$tag}>1);# compute the number of 2b-RAD tags that have the sequencing coverage >1 253 | $Sequenced_Reads_Num+=$hs_tag_num{$class}{$tag}; # the number of reads detected 254 | if($verbose eq "yes"){ 255 | print DETAIL "$tag\t$hs_tag_num{$class}{$tag}\n"; 256 | } 257 | } 258 | if($verbose eq "yes"){ 259 | close DETAIL; 260 | } 261 | # average number of theoretical 2b-RAD tags for each taxon 262 | my $species_all_theory_num; 263 | for my $GCF(keys %{$hs_tag_theory_num{$class}}){ 264 | for my $tag(keys %{$hs_tag_theory_num{$class}{$GCF}}){ 265 | $species_all_theory_num+=$hs_tag_theory_num{$class}{$GCF}{$tag}; 266 | } 267 | } 268 | $Theoretical_Tag_Num=$species_all_theory_num/(keys %{$hs_tag_theory_num{$class}});# average number of theoretical 2b-RAD tags for each taxon 269 | # statistical summmary 270 | $Sequenced_Tag_Num2Theoretical_Tag_Num=sprintf "%.8f",$Sequenced_Tag_Num/$Theoretical_Tag_Num*100;# 测到的标签占理论的百分比 271 | $Sequenced_Reads_Num2Theoretical_Tag_Num=sprintf "%.8f",$Sequenced_Reads_Num/$Theoretical_Tag_Num;# 测到的标签深度/理论标签数 272 | $Sequenced_Reads_Num2Sequenced_Tag_Num=sprintf "%.8f",$Sequenced_Reads_Num/$Sequenced_Tag_Num;# 测到的标签平均深度 273 | $G_Score=sprintf "%.8f",sqrt($Sequenced_Tag_Num*$Sequenced_Reads_Num);#compute the g_score for each taxon 274 | next if ($G_Score<$g_score_threshold);# filter taxa that have g_score < $g_score_threshold 275 | print OU "$class\t$Theoretical_Tag_Num\t$Sequenced_Tag_Num\t$Sequenced_Tag_Num2Theoretical_Tag_Num%\t"; 276 | print OU "$Sequenced_Reads_Num\t$Sequenced_Reads_Num2Theoretical_Tag_Num\t$Sequenced_Reads_Num2Sequenced_Tag_Num\t$Sequenced_Tag_Num_2\t"; 277 | print OU "$G_Score\n"; 278 | } 279 | close OU; 280 | undef %hs_tag_num; 281 | undef %hs_detected_GCF_tag; 282 | print STDOUT "###($sample_name) Sample identification completed, ",`date`; 283 | } 284 | close LI; 285 | 286 | # clean cache 287 | print STDOUT "### Cleaning the cached objects started, ",`date`; 288 | undef %hs_tag_theory_num; 289 | undef %hs_GCF2class; 290 | undef %hs_tag2GCF; 291 | print STDOUT "###Cleaning the cached objects completed, ",`date`; 292 | 293 | sub CheckDir{ 294 | my $file = shift; 295 | unless( -d $file ){ 296 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 297 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 298 | } 299 | return 1; 300 | } 301 | 302 | 303 | -------------------------------------------------------------------------------- /scripts/CreateQuanDatabase_2bRAD.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Authors: Zheng Sun, Rongchao Zhang, Shi Huang 3 | use warnings; 4 | use strict; 5 | use Getopt::Long; 6 | use FindBin qw($Bin); 7 | use File::Basename qw(dirname basename); 8 | use Cwd 'abs_path'; 9 | 10 | my $author="Zheng Sun, Rongchao Zhang, 
Shi Huang"; 11 | my $time="2020.12.16"; 12 | 13 | 14 | select STDOUT;$|=1;# cache cleaning 15 | 16 | my $remove_redundant ||="no";# 基因组内部是否去冗余 yes or no, default value is "no" 17 | 18 | my($list,$site,$type,$outdir,$enzyme_file,$help); 19 | GetOptions( 20 | "l:s" => \$list, 21 | "s:i" => \$site, 22 | "t:s" => \$type, 23 | "o:s" => \$outdir, 24 | 25 | "e:s" => \$enzyme_file,#酶切结果文件,或库文件 26 | "r:s" => \$remove_redundant, #基因组内部是否去冗余 27 | "h|help:s" => \$help, 28 | ); 29 | 30 | sub usage{# help 31 | print STDERR "\e[;33;1m 32 | DESCRIPTION 33 | It constructs the taxa-specific 2b-RAD reference genome database from a whole-genome reference database. 34 | USAGE 35 | perl $0 36 | PARAMETERS 37 | -l genome classification list (the line which begins with # will be ignored) 38 | eg:GCFidkingdomphylumclassorderfamilygenusspeciesstrain(genome_path) 39 | -e enzyme file or database file 40 | -s 2b restriction enzymes (sites). 41 | [1]CspCI [9]BplI 42 | [2]AloI [10]FalI 43 | [3]BsaXI [11]Bsp24I 44 | [4]BaeI [12]HaeIV 45 | [5]BcgI [13]CjePI 46 | [6]CjeI [14]Hin4I 47 | [7]PpiI [15]AlfI 48 | [8]PsrI [16]BslFI 49 | -t The database level. One or more taxonomy level of the 2b-RAD reference database can be specified: kingdom,phylum,class,order,family,genus,species,strain. Use 'all' for any levels. (comma separated). 50 | -o outdir (if not exists,it will be created) 51 | OPTION 52 | -r whether to delete redundant tags within the genome (yes or no) [default: $remove_redundant] 53 | -h|help print this help 54 | Author: $author 55 | Last update: $time\e[0m\n"; 56 | } 57 | 58 | if(defined($help)){ 59 | &usage; 60 | exit 0; 61 | } 62 | 63 | unless($list && $enzyme_file && $site && $type && $outdir){ 64 | &usage; 65 | print STDERR "para -l -e -s -t or -o error.\n"; 66 | exit 1; 67 | } 68 | 69 | #转化为绝对路径 70 | $list=abs_path($list); 71 | $outdir=abs_path($outdir); 72 | $enzyme_file=abs_path($enzyme_file); 73 | 74 | #check the parameter -r: using default value "no" 75 | unless($remove_redundant eq "yes" || $remove_redundant eq "no"){ 76 | &usage; 77 | print STDERR "-r parameter error: $remove_redundant\n"; 78 | exit 1; 79 | } 80 | 81 | 82 | #所有分类水平 83 | my %hs_type_database=( 84 | 'kingdom' => '1', 85 | 'phylum' => '2', 86 | 'class' => '3', 87 | 'order' => '4', 88 | 'family' => '5', 89 | 'genus' => '6', 90 | 'species' => '7', 91 | 'strain' => '8', 92 | ); 93 | # check the parameter -t: specify the taxonomic level of 2b-RAD genome database 94 | my %hs_type; 95 | if($type eq "all"){ 96 | %hs_type=%hs_type_database; 97 | }else{ 98 | my @tmp=split /,/,$type; 99 | for my $i(@tmp){ 100 | if(exists $hs_type_database{$i}){ 101 | $hs_type{$i}=$hs_type_database{$i}; 102 | }else{ 103 | &usage; 104 | print STDERR "-t parameter error: cannot find '$i'\n"; 105 | exit 1; 106 | } 107 | } 108 | } 109 | 110 | # check the parameter -s: 111 | my (@site,$enzyme); 112 | if( 1 == $site ){#CspCI 113 | @site = ( 114 | '[AGCT]{11}CAA[AGCT]{5}GTGG[AGCT]{10}', 115 | '[AGCT]{10}CCAC[AGCT]{5}TTG[AGCT]{11}', 116 | ); 117 | $enzyme="CspCI"; 118 | }elsif( 2 == $site ){#AloI 119 | @site = ( 120 | '[AGCT]{7}GAAC[AGCT]{6}TCC[AGCT]{7}', 121 | '[AGCT]{7}GGA[AGCT]{6}GTTC[AGCT]{7}', 122 | ); 123 | $enzyme="AloI"; 124 | }elsif( 3 == $site ){#BsaXI 125 | @site = ( 126 | '[AGCT]{9}AC[AGCT]{5}CTCC[AGCT]{7}', 127 | '[AGCT]{7}GGAG[AGCT]{5}GT[AGCT]{9}', 128 | ); 129 | $enzyme="BsaXI"; 130 | }elsif( 4 == $site ){#BaeI 131 | @site = ( 132 | '[AGCT]{10}AC[AGCT]{4}GTA[CT]C[AGCT]{7}', 133 | '[AGCT]{7}G[AG]TAC[AGCT]{4}GT[AGCT]{10}', 134 | ); 135 | $enzyme="BaeI"; 136 | 
}elsif( 5 == $site ){#BcgI 137 | @site = ( 138 | '[AGCT]{10}CGA[AGCT]{6}TGC[AGCT]{10}', 139 | '[AGCT]{10}GCA[AGCT]{6}TCG[AGCT]{10}', 140 | ); 141 | $enzyme="BcgI"; 142 | }elsif( 6 == $site ){#CjeI 143 | @site = ( 144 | '[AGCT]{8}CCA[AGCT]{6}GT[AGCT]{9}', 145 | '[AGCT]{9}AC[AGCT]{6}TGG[AGCT]{8}', 146 | ); 147 | $enzyme="CjeI"; 148 | }elsif( 7 == $site ){#PpiI 149 | @site = ( 150 | '[AGCT]{7}GAAC[AGCT]{5}CTC[AGCT]{8}', 151 | '[AGCT]{8}GAG[AGCT]{5}GTTC[AGCT]{7}', 152 | ); 153 | $enzyme="PpiI"; 154 | }elsif( 8 == $site ){#PsrI 155 | @site = ( 156 | '[AGCT]{7}GAAC[AGCT]{6}TAC[AGCT]{7}', 157 | '[AGCT]{7}GTA[AGCT]{6}GTTC[AGCT]{7}', 158 | ); 159 | $enzyme="PsrI"; 160 | }elsif( 9 == $site ){#BplI 161 | @site = ( 162 | '[AGCT]{8}GAG[AGCT]{5}CTC[AGCT]{8}', #palindromes 163 | ); 164 | $enzyme="BplI"; 165 | }elsif( 10 == $site ){#FalI 166 | @site = ( 167 | '[AGCT]{8}AAG[AGCT]{5}CTT[AGCT]{8}', #palindromes 168 | ); 169 | $enzyme="FalI"; 170 | }elsif( 11 == $site ){#Bsp24I 171 | @site = ( 172 | '[AGCT]{8}GAC[AGCT]{6}TGG[AGCT]{7}', 173 | '[AGCT]{7}CCA[AGCT]{6}GTC[AGCT]{8}', 174 | ); 175 | $enzyme="Bsp24I"; 176 | }elsif( 12 == $site ){#HaeIV 177 | @site = ( 178 | '[AGCT]{7}GA[CT][AGCT]{5}[AG]TC[AGCT]{9}', 179 | '[AGCT]{9}GA[CT][AGCT]{5}[AG]TC[AGCT]{7}', 180 | ); 181 | $enzyme="HaeIV"; 182 | }elsif( 13 == $site ){#CjePI 183 | @site = ( 184 | '[AGCT]{7}CCA[AGCT]{7}TC[AGCT]{8}', 185 | '[AGCT]{8}GA[AGCT]{7}TGG[AGCT]{7}', 186 | ); 187 | $enzyme="CjePI"; 188 | }elsif( 14 == $site ){#Hin4I 189 | @site = ( 190 | '[AGCT]{8}GA[CT][AGCT]{5}[GAC]TC[AGCT]{8}', 191 | '[AGCT]{8}GA[CTG][AGCT]{5}[AG]TC[AGCT]{8}', 192 | ); 193 | $enzyme="Hin4I"; 194 | }elsif( 15 == $site ){#AlfI 195 | @site = ( 196 | '[AGCT]{10}GCA[AGCT]{6}TGC[AGCT]{10}', #palindromes 197 | ); 198 | $enzyme="AlfI"; 199 | }elsif( 16 == $site ){#BslFI ??some question?? single enzyme 200 | @site = ( 201 | '[AGCT]{6}GGGAC[AGCT]{14}', 202 | '[AGCT]{14}GTCCC[AGCT]{6}', 203 | ); 204 | $enzyme="BslFI"; 205 | }else{ 206 | &usage; 207 | print STDERR "The parameter -s is wrong\n"; 208 | exit 1; 209 | } 210 | 211 | #提供酶切文件,检查文件是否存在 212 | if(defined($enzyme_file)){ 213 | unless(-e $enzyme_file){ 214 | print STDERR "[ERROR] $enzyme_file does not exist,please check.\n"; 215 | exit 1; 216 | } 217 | } 218 | 219 | 220 | #统计总的基因组个数 221 | my $genome_total_num=0; 222 | #my (%hash_gcf2class,%hash_gcf_rank); 223 | my %hash_gcf2class; 224 | if($list=~/\.gz/){ 225 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 226 | }else{ 227 | open LI,"$list" or die "cannot open $list\n"; 228 | } 229 | while(
  • ){ 230 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 231 | chomp; 232 | my @tmp=split /\t/; 233 | $genome_total_num++;#总基因组个数 234 | $hash_gcf2class{$tmp[0]}=$_;#gcf对应的分类信息 235 | # $hash_gcf_rank{$tmp[0]}=$genome_total_num;#记录基因组在列表中的排序,便于打印日志 236 | if(defined($enzyme_file)){#提供酶切文件 237 | ; 238 | }else{#不提供酶切文件 239 | $tmp[-1]=abs_path($tmp[-1]); 240 | unless(-e $tmp[-1]){#check the availability of a genome fasta file 241 | print STDERR "[ERROR] $tmp[-1] does not exist,please check your genome file\n"; 242 | exit 1; 243 | } 244 | } 245 | } 246 | close LI; 247 | 248 | if($genome_total_num==0){ 249 | print STDERR "[warning] There is no genome in the List file.\n"; 250 | exit 0; 251 | } 252 | 253 | 254 | &CheckDir("$outdir");# create the output directory 255 | #&CheckDir("$outdir/database"); 256 | 257 | print STDOUT "###($enzyme) Record the taxonomies of each 2b-RAD tag and identification of taxa-specifc 2b-RAD tags -- start, ",`date`;#STDOUT 258 | for my $level(sort {$hs_type{$a}<=>$hs_type{$b}} keys %hs_type){ #iterate all taxonomic levels of 2b-RAD database 259 | print STDOUT "###($level) Record the taxonomies of each 2b-RAD tag -- start, ",`date`;#STDOUT 260 | my (%hash_ingenome,%hash,%complete,%hash_gcf_rank); 261 | my %hash_seq;#记录基因组酶切所有标签 262 | $/=">"; 263 | if($enzyme_file=~/\.gz$/){#打开酶切文件 264 | open IN,"gzip -dc $enzyme_file|" or die "cannot open $enzyme_file\n"; 265 | }else{ 266 | open IN,"$enzyme_file" or die "cannot open $enzyme_file\n"; 267 | } 268 | ; 269 | while(){ 270 | chomp; 271 | my @tmp=split /\n/; 272 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 273 | my ($gcfid,$ingenome_tag_num,$scaid,$start,$chain,$unique)=split /\|/,$tmp[0]; 274 | my $tag=$tmp[1]; 275 | next unless(exists $hash_gcf2class{$gcfid});#gcfid不在列表中则跳过 276 | $hash_gcf_rank{$gcfid}++; 277 | $hash_seq{$gcfid}{$ingenome_tag_num}=join("\n",@tmp[0..1]);#记录列表中,基因组酶切的所有标签 278 | 279 | my @a=split /\t/,$hash_gcf2class{$gcfid};#分类 280 | my $class=join("\t",@a[1..$hs_type{$level}]);#concatenate the full taxonomic annotation 281 | if(exists $hash{$tag}){#判断在哈希中是否存在 282 | $hash{$tag}{$class}++;#记录标签分类信息 283 | $hash_ingenome{$gcfid}{$tag}++ if($remove_redundant eq "yes");#如果需要去除基因组内部冗余,则记录标签在基因组内部是否冗余 284 | }else{#反向互补处理 285 | $tag=~tr/ATCG/TAGC/; 286 | $tag=reverse($tag); 287 | $hash{$tag}{$class}++;#记录标签分类信息 288 | $hash_ingenome{$gcfid}{$tag}++ if($remove_redundant eq "yes");#如果需要去除基因组内部冗余,则记录标签在基因组内部是否冗余 289 | } 290 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 291 | if((keys %hash_gcf_rank)/$genome_total_num*100>=$i){ 292 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 293 | $complete{$i}++; 294 | last; 295 | } 296 | } 297 | } 298 | close IN; 299 | $/="\n"; 300 | undef %hash_gcf_rank; 301 | print STDOUT "\n###($level) Record the taxonomies of each 2b-RAD tag -- complete, ",`date`;# STDOUT 302 | 303 | print STDOUT "###($level) Identification of taxa-specifc 2b-RAD tags -- start, ",`date`;# STDOUT 304 | undef %complete;#完成进度清空 305 | my (%hash_genome_tag_num,%hash_genome_unique_tag_num); 306 | my $complete=0; 307 | if($list=~/\.gz/){ 308 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 309 | }else{ 310 | open LI,"$list" or die "cannot open $list\n"; 311 | } 312 | open OU,"|gzip > $outdir/$enzyme.$level.fa.gz" or die "cannot open $outdir/$enzyme.$level.fa.gz\n"; 313 | while(
  • ){ 314 | next if(/^#/ || /^#/);# remove blank lines or lines starting with # 315 | my $line=$_; 316 | chomp($line); 317 | my $gcfid=(split /\t/,$line)[0]; 318 | next unless(exists $hash_seq{$gcfid}); 319 | for my $i(sort {$a<=>$b} keys %{$hash_seq{$gcfid}}){#循环标签 320 | my @tmp=split /\n/,$hash_seq{$gcfid}{$i};#id\nseq 321 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 322 | my ($gcfid,$ingenome_tag_num,$scaid,$start,$chain,$unique)=split /\|/,$tmp[0]; 323 | my $tag=$tmp[1]; 324 | $gcfid=~s/^>//; 325 | unless(exists $hash{$tag}){#如果不存在,则进行反向互补 326 | $tag=~tr/ATCG/TAGC/; 327 | $tag=reverse($tag); 328 | #反向互补后,改变链的方向 329 | if($chain==0){ 330 | $chain=1; 331 | }elsif($chain==1){ 332 | $chain=0; 333 | } 334 | } 335 | if(keys %{$hash{$tag}}==1){#指定水平下为unique 336 | if($remove_redundant eq "yes"){#基因组内部需要去冗余 337 | if($hash_ingenome{$gcfid}{$tag}==1){#在基因组内部只出现过一次(noredundancy) 338 | $unique=1; 339 | $hash_genome_unique_tag_num{$gcfid}++;#基因组电子酶切unique标签数 340 | }else{ 341 | $unique=0; 342 | } 343 | }else{#基因组内部不需要去冗余 344 | $unique=1; 345 | $hash_genome_unique_tag_num{$gcfid}++;#基因组电子酶切unique标签数 346 | } 347 | }else{ 348 | $unique=0; 349 | } 350 | print OU ">$gcfid|$ingenome_tag_num|$scaid|$start|$chain|$unique\n$tag\n" if($unique==1); 351 | } 352 | $complete++; 353 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 354 | if($complete/$genome_total_num*100>=$i){ 355 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 356 | $complete{$i}++; 357 | last; 358 | } 359 | } 360 | 361 | } 362 | close LI; 363 | close OU; 364 | #统计输出 365 | open STAT,"> $outdir/$enzyme.$level.stat.xls" or die "cannot open $outdir/$enzyme.$level.stat.xls\n"; 366 | print STAT "#Unique_Name\tAll_Tag_Num\tUnique_Tag_Num\n"; 367 | if($list=~/\.gz/){ 368 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 369 | }else{ 370 | open LI,"$list" or die "cannot open $list\n"; 371 | } 372 | while(
  • ){ 373 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 374 | my $line=$_; 375 | chomp($line); 376 | my @tmp=split /\t/,$line; 377 | print STAT "$tmp[0]"; 378 | if(exists $hash_seq{$tmp[0]}){#酶切标签数 379 | my $genome_tag_num=keys %{$hash_seq{$tmp[0]}}; 380 | print STAT "\t$genome_tag_num"; 381 | }else{ 382 | print STAT "\t0"; 383 | } 384 | if(exists $hash_genome_unique_tag_num{$tmp[0]}){#unique标签数 385 | print STAT "\t$hash_genome_unique_tag_num{$tmp[0]}\n"; 386 | }else{ 387 | print STAT "\t0\n"; 388 | } 389 | # print STAT "$tmp[0]\t$hash_genome_tag_num{$tmp[0]}\t$hash_genome_unique_tag_num{$tmp[0]}\n"; 390 | } 391 | close LI; 392 | close STAT; 393 | print STDOUT "\n###($level) Identification of taxa-specifc 2b-RAD tags -- complete, ",`date`; #STDOUT 394 | } 395 | 396 | print STDOUT "###($enzyme) Record the taxonomies of each 2b-RAD tag and identification of taxa-specifc 2b-RAD tags -- complete, ",`date`;#STDOUT 397 | 398 | 399 | 400 | 401 | sub CheckDir{# create the directory 402 | my $file = shift; 403 | unless( -d $file ){ 404 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 405 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 406 | } 407 | return 1; 408 | } 409 | -------------------------------------------------------------------------------- /scripts/CreateQualDatabase_2bRAD.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Authors: Zheng Sun, Rongchao Zhang, Shi Huang 3 | use warnings; 4 | use strict; 5 | use Getopt::Long; 6 | use FindBin qw($Bin); 7 | use File::Basename qw(dirname basename); 8 | use Cwd 'abs_path'; 9 | 10 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 11 | my $time="2020.12.15"; 12 | 13 | 14 | select STDOUT;$|=1;# cache cleaning 15 | 16 | my $remove_redundant ||="yes";# 基因组内部是否去冗余 yes or no, default value is "no" 17 | 18 | my($list,$site,$type,$outdir,$enzyme_file,$help); 19 | GetOptions( 20 | "l:s" => \$list, 21 | "s:i" => \$site, 22 | "t:s" => \$type, 23 | "o:s" => \$outdir, 24 | 25 | "e:s" => \$enzyme_file,#酶切结果文件,或库文件 26 | "r:s" => \$remove_redundant, #基因组内部是否去冗余 27 | "h|help:s" => \$help, 28 | ); 29 | 30 | sub usage{# help 31 | print STDERR "\e[;33;1m 32 | DESCRIPTION 33 | It constructs the taxa-specific 2b-RAD reference genome database from a whole-genome reference database. 34 | USAGE 35 | perl $0 36 | PARAMETERS 37 | -l genome classification list (the line which begins with # will be ignored) 38 | eg:GCFidkingdomphylumclassorderfamilygenusspeciesstrain(genome_path) 39 | -s One or multiple type 2b restriction enzymes (sites). 40 | [1]CspCI [9]BplI 41 | [2]AloI [10]FalI 42 | [3]BsaXI [11]Bsp24I 43 | [4]BaeI [12]HaeIV 44 | [5]BcgI [13]CjePI 45 | [6]CjeI [14]Hin4I 46 | [7]PpiI [15]AlfI 47 | [8]PsrI [16]BslFI 48 | -t The database level. One or more taxonomy level of the 2b-RAD reference database can be specified: kingdom,phylum,class,order,family,genus,species,strain. Use 'all' for any levels. (comma separated). 
49 | -o outdir (if not exists,it will be created) 50 | OPTION 51 | -e enzyme file or database file 52 | -r whether to delete redundant tags within the genome (yes or no) [default: $remove_redundant] 53 | -h|help print this help 54 | Author: $author 55 | Last update: $time\e[0m\n"; 56 | } 57 | 58 | if(defined($help)){ 59 | &usage; 60 | exit 0; 61 | } 62 | 63 | unless($list && $site && $type && $outdir){ 64 | &usage; 65 | print STDERR "para -l -s -t or -o error.\n"; 66 | exit 1; 67 | } 68 | 69 | #转化为绝对路径 70 | $list=abs_path($list); 71 | $outdir=abs_path($outdir); 72 | 73 | #check the parameter -r: using default value "no" 74 | unless($remove_redundant eq "yes" || $remove_redundant eq "no"){ 75 | &usage; 76 | print STDERR "-r parameter error: $remove_redundant\n"; 77 | exit 1; 78 | } 79 | 80 | 81 | #所有分类水平 82 | my %hs_type_database=( 83 | 'kingdom' => '1', 84 | 'phylum' => '2', 85 | 'class' => '3', 86 | 'order' => '4', 87 | 'family' => '5', 88 | 'genus' => '6', 89 | 'species' => '7', 90 | 'strain' => '8', 91 | ); 92 | # check the parameter -t: specify the taxonomic level of 2b-RAD genome database 93 | my %hs_type; 94 | if($type eq "all"){ 95 | %hs_type=%hs_type_database; 96 | }else{ 97 | my @tmp=split /,/,$type; 98 | for my $i(@tmp){ 99 | if(exists $hs_type_database{$i}){ 100 | $hs_type{$i}=$hs_type_database{$i}; 101 | }else{ 102 | &usage; 103 | print STDERR "-t parameter error: cannot find '$i'\n"; 104 | exit 1; 105 | } 106 | } 107 | } 108 | 109 | # check the parameter -s: 110 | my (@site,$enzyme); 111 | if( 1 == $site ){#CspCI 112 | @site = ( 113 | '[AGCT]{11}CAA[AGCT]{5}GTGG[AGCT]{10}', 114 | '[AGCT]{10}CCAC[AGCT]{5}TTG[AGCT]{11}', 115 | ); 116 | $enzyme="CspCI"; 117 | }elsif( 2 == $site ){#AloI 118 | @site = ( 119 | '[AGCT]{7}GAAC[AGCT]{6}TCC[AGCT]{7}', 120 | '[AGCT]{7}GGA[AGCT]{6}GTTC[AGCT]{7}', 121 | ); 122 | $enzyme="AloI"; 123 | }elsif( 3 == $site ){#BsaXI 124 | @site = ( 125 | '[AGCT]{9}AC[AGCT]{5}CTCC[AGCT]{7}', 126 | '[AGCT]{7}GGAG[AGCT]{5}GT[AGCT]{9}', 127 | ); 128 | $enzyme="BsaXI"; 129 | }elsif( 4 == $site ){#BaeI 130 | @site = ( 131 | '[AGCT]{10}AC[AGCT]{4}GTA[CT]C[AGCT]{7}', 132 | '[AGCT]{7}G[AG]TAC[AGCT]{4}GT[AGCT]{10}', 133 | ); 134 | $enzyme="BaeI"; 135 | }elsif( 5 == $site ){#BcgI 136 | @site = ( 137 | '[AGCT]{10}CGA[AGCT]{6}TGC[AGCT]{10}', 138 | '[AGCT]{10}GCA[AGCT]{6}TCG[AGCT]{10}', 139 | ); 140 | $enzyme="BcgI"; 141 | }elsif( 6 == $site ){#CjeI 142 | @site = ( 143 | '[AGCT]{8}CCA[AGCT]{6}GT[AGCT]{9}', 144 | '[AGCT]{9}AC[AGCT]{6}TGG[AGCT]{8}', 145 | ); 146 | $enzyme="CjeI"; 147 | }elsif( 7 == $site ){#PpiI 148 | @site = ( 149 | '[AGCT]{7}GAAC[AGCT]{5}CTC[AGCT]{8}', 150 | '[AGCT]{8}GAG[AGCT]{5}GTTC[AGCT]{7}', 151 | ); 152 | $enzyme="PpiI"; 153 | }elsif( 8 == $site ){#PsrI 154 | @site = ( 155 | '[AGCT]{7}GAAC[AGCT]{6}TAC[AGCT]{7}', 156 | '[AGCT]{7}GTA[AGCT]{6}GTTC[AGCT]{7}', 157 | ); 158 | $enzyme="PsrI"; 159 | }elsif( 9 == $site ){#BplI 160 | @site = ( 161 | '[AGCT]{8}GAG[AGCT]{5}CTC[AGCT]{8}', #palindromes 162 | ); 163 | $enzyme="BplI"; 164 | }elsif( 10 == $site ){#FalI 165 | @site = ( 166 | '[AGCT]{8}AAG[AGCT]{5}CTT[AGCT]{8}', #palindromes 167 | ); 168 | $enzyme="FalI"; 169 | }elsif( 11 == $site ){#Bsp24I 170 | @site = ( 171 | '[AGCT]{8}GAC[AGCT]{6}TGG[AGCT]{7}', 172 | '[AGCT]{7}CCA[AGCT]{6}GTC[AGCT]{8}', 173 | ); 174 | $enzyme="Bsp24I"; 175 | }elsif( 12 == $site ){#HaeIV 176 | @site = ( 177 | '[AGCT]{7}GA[CT][AGCT]{5}[AG]TC[AGCT]{9}', 178 | '[AGCT]{9}GA[CT][AGCT]{5}[AG]TC[AGCT]{7}', 179 | ); 180 | $enzyme="HaeIV"; 181 | }elsif( 13 == $site ){#CjePI 182 | @site = ( 183 
| '[AGCT]{7}CCA[AGCT]{7}TC[AGCT]{8}', 184 | '[AGCT]{8}GA[AGCT]{7}TGG[AGCT]{7}', 185 | ); 186 | $enzyme="CjePI"; 187 | }elsif( 14 == $site ){#Hin4I 188 | @site = ( 189 | '[AGCT]{8}GA[CT][AGCT]{5}[GAC]TC[AGCT]{8}', 190 | '[AGCT]{8}GA[CTG][AGCT]{5}[AG]TC[AGCT]{8}', 191 | ); 192 | $enzyme="Hin4I"; 193 | }elsif( 15 == $site ){#AlfI 194 | @site = ( 195 | '[AGCT]{10}GCA[AGCT]{6}TGC[AGCT]{10}', #palindromes 196 | ); 197 | $enzyme="AlfI"; 198 | }elsif( 16 == $site ){#BslFI ??some question?? single enzyme 199 | @site = ( 200 | '[AGCT]{6}GGGAC[AGCT]{14}', 201 | '[AGCT]{14}GTCCC[AGCT]{6}', 202 | ); 203 | $enzyme="BslFI"; 204 | }else{ 205 | &usage; 206 | print STDERR "The parameter -s is wrong\n"; 207 | exit 1; 208 | } 209 | 210 | #提供酶切文件,检查文件是否存在 211 | if(defined($enzyme_file)){ 212 | $enzyme_file=abs_path($enzyme_file); 213 | unless(-e $enzyme_file){ 214 | print STDERR "[ERROR] $enzyme_file does not exist,please check.\n"; 215 | exit 1; 216 | } 217 | } 218 | 219 | 220 | #统计总的基因组个数和不提供酶切文件时检测基因组路径是否存在 221 | my $genome_total_num=0; 222 | #my (%hash_gcf2class,%hash_gcf_rank); 223 | my %hash_gcf2class; 224 | if($list=~/\.gz/){ 225 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 226 | }else{ 227 | open LI,"$list" or die "cannot open $list\n"; 228 | } 229 | while(
  • ){ 230 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 231 | chomp; 232 | my @tmp=split /\t/; 233 | $genome_total_num++; 234 | $hash_gcf2class{$tmp[0]}=$_;#gcf对应的分类信息 235 | # $hash_gcf_rank{$tmp[0]}=$genome_total_num;#记录基因组在列表中的排序,便于打印日志 236 | if(defined($enzyme_file)){#提供酶切文件 237 | ; 238 | }else{#不提供酶切文件 239 | $tmp[-1]=abs_path($tmp[-1]); 240 | unless(-e $tmp[-1]){#check the availability of a genome fasta file 241 | print STDERR "[ERROR] $tmp[-1] does not exist,please check your genome file\n"; 242 | exit 1; 243 | } 244 | } 245 | } 246 | close LI; 247 | 248 | &CheckDir("$outdir");# create the output directory 249 | #&CheckDir("$outdir/database"); 250 | unless(defined($enzyme_file)){#未提供酶切文件则进行酶切 251 | print STDOUT "###($enzyme) Electron digestion -- start, ",`date`;# output the log file 252 | my $complete=0; 253 | my %complete; 254 | $enzyme_file="$outdir/$enzyme.enzyme.fa.gz"; 255 | open OU,"|gzip > $outdir/$enzyme.enzyme.fa.gz" or die "cannot open $outdir/$enzyme.enzyme.fa.gz\n"; 256 | if($list=~/\.gz/){ 257 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 258 | }else{ 259 | open LI,"$list" or die "cannot open $list\n"; 260 | } 261 | while(
  • ){ 262 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 263 | my $line=$_; 264 | chomp($line); 265 | my @tmp=split /\t/,$line; 266 | $tmp[-1]=abs_path($tmp[-1]); 267 | $/=">"; 268 | if($tmp[-1]=~/\.gz$/){ 269 | open IN,"gzip -dc $tmp[-1]|" or die "cannot open $tmp[-1]\n"; 270 | }else{ 271 | open IN,"$tmp[-1]" or die "cannot open $tmp[-1]\n"; 272 | } 273 | ; 274 | my $ingenome_tag_num=0; 275 | while(){ 276 | chomp; 277 | my @a=split /\n/; 278 | my $id=(split /\s+/,$a[0])[0];#scaffold id 279 | my $seq=join("",@a[1..$#a]);#scaffold seq 280 | $seq=uc($seq); # convert the lowercase to uppercase bases 小写碱基转换为大写 281 | my %hash_genome; 282 | for my $i(0..$#site){# iterate all restriction sites 循环酶切位点 283 | while($seq=~/($site[$i])/g){# digital digestion 284 | my $tag=$1;#序列 285 | my $len=length($tag);#标签长度 286 | my $pos=pos($seq); 287 | my $start=$pos-$len+1;#标签起始位置 288 | pos($seq)=$start;#调整位置 289 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 290 | $hash_genome{$start}=">$tmp[0]|0|$id|$start|0|-\n$tag\n"; 291 | } 292 | } 293 | # sort the 2b-RAD tags by the genome positions排序后输出 294 | for my $pos(sort {$a <=> $b} keys %hash_genome){ 295 | $ingenome_tag_num++; 296 | my @a=split /\|/,$hash_genome{$pos}; 297 | print OU "$a[0]|$ingenome_tag_num|",join("|",@a[2..$#a]); 298 | } 299 | undef %hash_genome; 300 | } 301 | close IN; 302 | $/="\n"; 303 | $complete++;#完成处理的基因组个数 304 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 305 | if($complete/$genome_total_num*100>=$i){ 306 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 307 | $complete{$i}++; 308 | last; 309 | } 310 | } 311 | } 312 | close LI; 313 | close OU; 314 | print STDOUT "\n###($enzyme) Electron digestion -- complete, ",`date`;# output the log file 315 | }else{#提供酶切文件,跳过电子酶切 316 | print STDOUT "An enzyme digestion file has been provided, skipping electron digestion.\n"; 317 | } 318 | 319 | 320 | print STDOUT "###($enzyme) Record the taxonomies of each 2b-RAD tag and identification of taxa-specifc 2b-RAD tags -- start, ",`date`;#STDOUT 321 | for my $level(sort {$hs_type{$a}<=>$hs_type{$b}} keys %hs_type){ #iterate all taxonomic levels of 2b-RAD database 322 | print STDOUT "###($level) Record the taxonomies of each 2b-RAD tag -- start, ",`date`;#STDOUT 323 | my (%hash_ingenome,%hash,%complete,%hash_gcf_rank); 324 | $/=">"; 325 | if($enzyme_file=~/\.gz$/){#打开酶切文件 326 | open IN,"gzip -dc $enzyme_file|" or die "cannot open $enzyme_file\n"; 327 | }else{ 328 | open IN,"$enzyme_file" or die "cannot open $enzyme_file\n"; 329 | } 330 | ; 331 | while(){ 332 | chomp; 333 | my @tmp=split /\n/; 334 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 335 | my ($gcfid,$ingenome_tag_num,$scaid,$start,$chain,$unique)=split /\|/,$tmp[0]; 336 | my $tag=$tmp[1]; 337 | $gcfid=~s/^>//; 338 | next unless(exists $hash_gcf2class{$gcfid});#gcfid不在列表中则跳过 339 | $hash_gcf_rank{$gcfid}++; 340 | my @a=split /\t/,$hash_gcf2class{$gcfid}; 341 | my $class=join("\t",@a[1..$hs_type{$level}]);#concatenate the full taxonomic annotation 342 | if(exists $hash{$tag}){#判断在哈希中是否存在 343 | $hash{$tag}{$class}++;#记录标签分类信息 344 | $hash_ingenome{$gcfid}{$tag}++ if($remove_redundant eq "yes");#如果需要去除基因组内部冗余,则记录标签在基因组内部是否冗余 345 | }else{#反向互补处理 346 | $tag=~tr/ATCG/TAGC/; 347 | $tag=reverse($tag); 348 | $hash{$tag}{$class}++;#记录标签分类信息 349 | $hash_ingenome{$gcfid}{$tag}++ if($remove_redundant eq "yes");#如果需要去除基因组内部冗余,则记录标签在基因组内部是否冗余 350 | } 351 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 352 | if((keys 
%hash_gcf_rank)/$genome_total_num*100>=$i){ 353 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 354 | $complete{$i}++; 355 | last; 356 | } 357 | } 358 | } 359 | close IN; 360 | $/="\n"; 361 | undef %hash_gcf_rank; 362 | print STDOUT "\n###($level) Record the taxonomies of each 2b-RAD tag -- complete, ",`date`;# STDOUT 363 | 364 | print STDOUT "###($level) Identification of taxa-specifc 2b-RAD tags -- start, ",`date`;# STDOUT 365 | undef %complete;#完成进度清空 366 | my (%hash_genome_tag_num,%hash_genome_unique_tag_num); 367 | $/=">"; 368 | if($enzyme_file=~/\.gz$/){#打开酶切文件 369 | open IN,"gzip -dc $enzyme_file|" or die "cannot open $enzyme_file\n"; 370 | }else{ 371 | open IN,"$enzyme_file" or die "cannot open $enzyme_file\n"; 372 | } 373 | open OU,"|gzip > $outdir/$enzyme.$level.fa.gz" or die "cannot open $outdir/$enzyme.$level.fa.gz\n"; 374 | ; 375 | while(){ 376 | chomp; 377 | my @tmp=split /\n/; 378 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 379 | my ($gcfid,$ingenome_tag_num,$scaid,$start,$chain,$unique)=split /\|/,$tmp[0]; 380 | my $tag=$tmp[1]; 381 | $gcfid=~s/^>//; 382 | next unless(exists $hash_gcf2class{$gcfid});#gcfid不在列表中则跳过 383 | $hash_genome_tag_num{$gcfid}++;#基因组电子酶切标签数 384 | unless(exists $hash{$tag}){#如果不存在,则进行反向互补 385 | $tag=~tr/ATCG/TAGC/; 386 | $tag=reverse($tag); 387 | #反向互补后,改变链的方向 388 | if($chain==0){ 389 | $chain=1; 390 | }elsif($chain==1){ 391 | $chain=0; 392 | } 393 | } 394 | if(keys %{$hash{$tag}}==1){#指定水平下为unique 395 | if($remove_redundant eq "yes"){#基因组内部需要去冗余 396 | if($hash_ingenome{$gcfid}{$tag}==1){#在基因组内部只出现过一次(noredundancy) 397 | $unique=1; 398 | $hash_genome_unique_tag_num{$gcfid}++;#基因组电子酶切unique标签数 399 | }else{ 400 | $unique=0; 401 | } 402 | }else{#基因组内部不需要去冗余 403 | $unique=1; 404 | $hash_genome_unique_tag_num{$gcfid}++;#基因组电子酶切unique标签数 405 | } 406 | }else{ 407 | $unique=0; 408 | } 409 | print OU ">$gcfid|$ingenome_tag_num|$scaid|$start|$chain|$unique\n$tag\n"; 410 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 411 | if((keys %hash_genome_tag_num)/$genome_total_num*100>=$i){ 412 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 413 | $complete{$i}++; 414 | last; 415 | } 416 | } 417 | } 418 | $/="\n"; 419 | close IN; 420 | close OU; 421 | open STAT,"> $outdir/$enzyme.$level.stat.xls" or die "cannot open $outdir/$enzyme.$level.stat.xls\n"; 422 | print STAT "#Unique_Name\tAll_Tag_Num\tUnique_Tag_Num\n"; 423 | if($list=~/\.gz/){ 424 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 425 | }else{ 426 | open LI,"$list" or die "cannot open $list\n"; 427 | } 428 | while(
  • ){ 429 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 430 | my $line=$_; 431 | chomp($line); 432 | my @tmp=split /\t/,$line; 433 | print STAT "$tmp[0]"; 434 | if(exists $hash_genome_tag_num{$tmp[0]}){#酶切标签数 435 | print STAT "\t$hash_genome_tag_num{$tmp[0]}"; 436 | }else{ 437 | print STAT "\t0"; 438 | } 439 | if(exists $hash_genome_unique_tag_num{$tmp[0]}){#unique标签数 440 | print STAT "\t$hash_genome_unique_tag_num{$tmp[0]}\n"; 441 | }else{ 442 | print STAT "\t0\n"; 443 | } 444 | # print STAT "$tmp[0]\t$hash_genome_tag_num{$tmp[0]}\t$hash_genome_unique_tag_num{$tmp[0]}\n"; 445 | } 446 | close LI; 447 | close STAT; 448 | print STDOUT "\n###($level) Identification of taxa-specifc 2b-RAD tags -- complete, ",`date`; #STDOUT 449 | } 450 | 451 | print STDOUT "###($enzyme) Record the taxonomies of each 2b-RAD tag and identification of taxa-specifc 2b-RAD tags -- complete, ",`date`;#STDOUT 452 | 453 | 454 | 455 | 456 | sub CheckDir{# create the directory 457 | my $file = shift; 458 | unless( -d $file ){ 459 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 460 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 461 | } 462 | return 1; 463 | } 464 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2bRAD-M 2 | ---------------------------- 3 | This repository provides the 2bRAD-M computational pipeline for microbiome analysis, which has been formally published on Genome Biology: 4 | 5 | [Species-resolved sequencing of low-biomass or degraded microbiomes using 2bRAD-M](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02576-9 6 | ) by Zheng Sun, Shi Huang, Pengfei Zhu, Lam Tzehau, Helen Zhao, Jia Lv, Rongchao Zhang, Lisha Zhou, Qianya Niu, Xiuping Wang, Meng Zhang, Gongchao Jing, Zhenmin Bao, Jiquan Liu, Shi Wang, Jian Xu. Genome Biology, doi: https://doi.org/10.1186/s13059-021-02576-9 7 | 8 | ## How it works 9 | The principle of 2bRAD-M on microbiome analyses: 10 | (1) reliable enzyme-digested sequence tags can be derived that are specific to high-resolution taxa (e.g., species or strain) yet universally applicable for a broad range or all of bacterial, archaeal and fungal genomes; 11 | (2) these taxa-specific, iso-length sequence tags can be evenly amplified and sequenced; 12 | (3) the tag sequences can be mapped to reference genomes to reconstruct faithfully the taxonomic composition. 13 | 14 | You can also find more details for the 2bRAD-M workflow below. 15 | 16 | ![workflow](2bRAD-M_workflow.png) 17 | 18 | * The experimental workflow has two steps: 19 | 20 | (1) BcgI (a commercially available Type IIB restriction enzymes) is used, as an example, to digest total genomic DNA extracted from microbiome samples. BcgI recognizes the sequence of CGA-N6-TGC in the genomic DNA and cleaves on both upstream (12-10 bp) and downstream (10-12 bp) of this signature, producing short and iso-length DNA (32bp without sticky ends) across all loci. 21 | 22 | (2) These so-called “2bRAD fragments” are ligated to adaptors, amplified and then sequenced. 23 | 24 | * The computational workflow. The foundation here is a unique 2bRAD tag database (“2b-Tag-DB”), which contains taxa-specific 2bRAD tags identified from all the sequenced bacteria, fungi and archaea genomes. Mapping the 2bRAD reads against 2b-Tag-DB thus identifies the presence of species in a sample. 
Subsequently, to estimate the relative abundance of the identified taxa, the mean read coverage of all 2bRAD tags specific to each taxon is derived. To improve the utilization rate of reads and classification accuracy, a secondary, sample-specific 2b-Tag-DB is dynamically derived from only those candidate taxa identified in a particular sample, which produces more species-specific 2bRAD tags than the original 2b-Tag-DB and results in more accurate modeling of relative abundance of taxa. 25 | 26 | ## Installation 27 | 28 | ### System requirements 29 | 30 | #### Dependencies 31 | All scripts in 2bRAD-M are written in Perl and are recommended to run in a conda environment. This program should work properly on Unix systems or Mac OS X, as all required packages can be appropriately downloaded and installed. 32 | 33 | #### Disk space 34 | Construction of a 2bRAD-M standard database (i.e., 2b-Tag-DB) requires approximately 10 GB of disk space. 35 | 36 | #### Memory usage 37 | Running the standard pipeline requires < 30 GB of RAM, and it is also compatible with multithreading. For example, the BcgI-derived (default) database size is 9.32 GB, and you will need more than that in RAM if you want to build the default database. In an early test, peak memory usage reached up to 29 GB. 38 | 39 | #### Speed 40 | About 20 minutes are required for loading the 2b-Tag-DB. For a typical gut metagenome, ~40 minutes are required for species profiling. 41 | 42 | ### Download the pipeline 43 | * Clone the latest version from GitHub (recommended): 44 | 45 | `git clone https://github.com/shihuang047/2bRAD-M/` 46 | `cd 2bRAD-M` 47 | 48 | This makes it easy to update the software in the future using `git pull` as bugs are fixed and features are added. 49 | * Alternatively, directly download the whole GitHub repo without installing git: 50 | 51 | `wget https://github.com/shihuang047/2bRAD-M/archive/master.zip` 52 | `unzip master.zip` 53 | `cd 2bRAD-M-master` 54 | 55 | ### Install the 2bRAD-M pipeline in a conda environment 56 | * Conda installation 57 | [Miniconda](https://docs.conda.io/en/latest/miniconda.html) provides the conda environment and package manager, and is the recommended way to install 2bRAD-M. 58 | * Create a conda environment for the 2bRAD-M pipeline: 59 | After installing Miniconda and opening a new terminal, make sure you’re running the latest version of conda: 60 | 61 | `conda update conda` 62 | 63 | Once you have Miniconda installed, create a conda environment with the yml file `tools/2bRAD-M-20201225-conda.yml`. 
64 | 65 | `conda env create -n 2bRAD-M-20201225 --file tools/2bRAD-M-20201225-conda.yml` 66 | 67 | * Activate the 2bRAD-M conda environment by running the following command: 68 | 69 | `source activate 2bRAD-M-20201225` 70 | 71 | ### Construct the reference 2B-Tag database (required) 72 | 73 | The script `tools/Download_2bRADTagDB_NCBI.pl` in this repo can be used to: 74 | 75 | * download the prebuilt 2b-Tag-DB from Figshare based on the NCBI Refseq (Oct., 2019) or GTDB 76 | * download the example datasets for the pipeline tutorial 77 | 78 | You can specify $your_database_path locally (`$your_database_path=./2B-RAD-M-ref_db_NCBI/` or `$your_database_path=./2B-RAD-M-ref_db_GTDB/`) and run the script as follows: 79 | 80 | `perl tools/Download_2bRADTagDB_NCBI.pl $your_database_path` or 81 | 82 | `perl tools/Download_2bRADTagDB_GTDB.pl $your_database_path` 83 | 84 | It usually takes around 30 minutes to save all files in `$your_database_path`, but this depends on your internet connection speed and stability. 85 | 86 | ## 2bRAD-M pipeline tutorial 87 | 88 | ### **Overview** 89 | 90 | The 2bRAD-M analysis pipeline comprises a combination of 2bRAD-M scripts and optimized parameters for analyzing 2bRAD or shotgun metagenomic sequencing data, and it produces the most comprehensive output for each sample. The pipeline includes: 91 | 92 | (1) **Digital restriction digestion** This step is required when the input DNA sequences are longer than 31 bp or 33 bp (e.g., 150 bp) or derived from common shotgun sequencing protocols. If the input DNA sequences were produced by the 2bRAD sequencing protocol, this step is skipped. 93 | 94 | (2) **Qualitative analysis** Identify the microbes and preliminarily estimate their abundances based on the 2bRAD (e.g., BcgI-derived) species-specific markers of a prebuilt 2b-Tag-DB based on the NCBI Refseq (Oct., 2019). 95 | 96 | (3) **Quantitative analysis** Estimate the microbial abundances more precisely based on the 2bRAD species-specific markers in a sample-specific 2b-Tag-DB. First, we fetch the candidate genomes identified in a particular biological sample in step (2) from NCBI Refseq to construct a sample-targeted 2b-Tag-DB. Next, we remap the sequencing reads to this more concise 2b-Tag-DB to estimate the abundance of all detected taxa and use the G score to filter potential false-positive microbial features. 97 | 98 | (4) **Merging results from multiple samples** The sample-wise results will be automatically merged into a feature table. 99 | 100 | ### **Usage** 101 | 102 | The main script for implementing these analyses is `bin/2bRADM_Pipline.pl` in this repo. You can check out the usage by printing the help information via `perl bin/2bRADM_Pipline.pl -h`. 103 | 104 | ``` 105 | DESCRIPTION 106 | We here provided a streamlined 2bRAD pipeline for analyzing microbial compositions from the 2bRAD/shotgun metagenomics data based on the species-specific 2bRAD markers. 107 | USAGE 108 | perl bin/2bRADM_Pipline.pl 109 | PARAMETERS 110 | -t The acceptable types of an input sequencing data file. The file path should be also listed in the sample list file (para -l). 111 | [1] generic genome data in a fasta format 112 | [2] shotgun metagenomic data in a fastq format(either SE or PE platform is accepted) 113 | [3] 2bRAD data from a SE sequencing platform in a fastq format 114 | [4] 2bRAD data from a PE sequencing platform in a fastq format 115 | -l The filepath of the sample list. 
Each line includes an input sample ID and the file path of the corresponding DNA sequence data, where each field should be separated by <Tab>. A line in this file that begins with # will be ignored. Only four formats of a sample list file are accepted and should match with parameter -t: 116 | [1] sample<Tab>sample.fa(.gz) 117 | [2] sample<Tab>shotgun.1.fq(.gz)<Tab>(shotgun.2.fq.gz) 118 | [3] sample<Tab>2bsingle.fq(.gz or 2bsingle.1.fq.gz) 119 | [4] sample1<Tab>sample2<Tab>sample3<Tab>sample4<Tab>sample5<Tab>R1.fq(.gz)<Tab>R2.fq(.gz) 120 | -d The working path of 2B-Tag-DB. 121 | -o The output directory (if it doesn't exist, will be created automatically as 'outdir'). 122 | OPTIONS of Qualitative Analysis 123 | -p If qualitative analysis applies or not [default: yes] (yes or no) 124 | -s1 The enzymatic site(s) for the qualitative analysis. One or more sites can be specified (comma separated) [default: 5] 125 | It represents which enzymatic recognition site(s) will be used for digital restriction digestion, and constructing 2b-Tag-DB for the following qualitative analysis and quantitative analysis. 126 | [1]CspCI [5]BcgI [9]BplI [13]CjePI [17]AllEnzyme 127 | [2]AloI [6]CjeI [10]FalI [14]Hin4I 128 | [3]BsaXI [7]PpiI [11]Bsp24I [15]AlfI 129 | [4]BaeI [8]PsrI [12]HaeIV [16]BslFI 130 | -t1 The taxonomic rank for 2bRAD markers in the qualitative database, which should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. [default: species] 131 | OPTIONS of Quantitative Analysis 132 | -q If quantitative analysis applies or not [default: yes] (yes or no) 133 | -gsc G score threshold for identifying the candidate microbes present in a sample in qualitative analysis, which also determines the membership of sample-specific 2B-Tag-DB in the quantitative analysis step. [default: 5, it means >5] 134 | -gcf The threshold of the 2bRAD tag number for the presence of a microbial genome (i.e., GCF) in the qualitative analysis, which also determines the membership of sample-specific 2B-Tag-DB in the quantitative analysis step. [default: 1, it means >1] 135 | -s2 The enzyme site for the quantitative analysis. (refer to -s1) [default: 5, must be included in para -s1] 136 | -t2 The taxonomic rank for 2bRAD markers in the quantitative analysis, which should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. [default: species] 137 | OPTIONS of CPU 138 | -c1 The number of CPUs used for parallelizing the digital digestion step for multiple samples. [default: 10] 139 | -c2 The number of CPUs used for parallelizing abundance profiling for multiple samples based on a single enzyme and combining results from the multiple enzymes set via -s1. [default: 8] (each CPU needs about 15~65G of memory) 140 | OPTIONS of Quality Control 141 | -qc If quality control applies or not. [default: yes] (yes or no) 142 | -qcn The maximum ratio of base "N". [default: 0.08] 143 | -qcq The minimum quality score to keep. [default: 30] 144 | -qcp The minimum percentage of bases that must have [-qcq] quality. [default: 80] 145 | -qcb ASCII+33 or ASCII+64 quality scores as Phred scores [default: 33] 146 | OPTIONS of Merging profiles 147 | -ms The mock-community sample name(s) (separated by commas). The specified samples will be removed from the merged output table. 148 | -ncs The sample name(s) (separated by commas) of negative control that can be used for filtering potential contaminations. 149 | -h|help Print this help information. 150 | 151 | ```
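For example, a minimal sample list for `-t 3` (one 2bRAD fastq file per sample) could look like the following; the sample IDs and file paths here are hypothetical placeholders, and the two columns are separated by a single tab:

```
#sample_id	sequence_file
sampleA	/path/to/sampleA.fq.gz
sampleB	/path/to/sampleB.fq.gz
```

Because lines starting with `#` are ignored, the header line above is optional. The `list_simulation` and `list_mock` files downloaded along with the 2b-Tag-DB follow this same two-column layout.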
152 | 153 | ### **Example data** 154 | 155 | * **Analyze an in silico mock community** (synthetic data: `simulate_50.BcgI.fq.gz`) To test the generalizability of our 2bRAD markers for microbial profiling, we designed a mock microbiome structure containing 50 microbial species from a wide range of habitats such as oral, gut and soil environments. Given a specified abundance profile, we simulated the sequencing data based on all related genomes using [wgsim](https://github.com/lh3/wgsim). The sequence data file `simulate_50.BcgI.fq.gz` and its corresponding list file `list_simulation` will be automatically downloaded to `$your_database_path` via the download script (e.g., `tools/Download_2bRADTagDB_NCBI.pl`) as described above. Once all of these are downloaded, you can run the following command, which will output the estimated microbial profile. 156 | 157 | ``` 158 | perl bin/2bRADM_Pipline.pl \ 159 | -t 3 \ 160 | -l $your_database_path/list_simulation \ 161 | -d $your_database_path/ \ 162 | -o outdir \ 163 | -gsc 60 \ 164 | -qc no 165 | ``` 166 | 167 | * **Analyze a mock microbial community: MSA1002** (sequencing data: `MSA1002_R1.fq.gz`) 168 | [MSA1002](https://www.atcc.org/en/Global/Products/MSA-1002.aspx) comprises the genomic material from 20 microbial strains that are evenly mixed. We sequenced this DNA sample using our 2bRAD protocol for optimizing and testing the bioinformatic pipeline. The sequencing data file `MSA1002_R1.fq.gz` and its corresponding list file `list_mock` will be automatically downloaded to `$your_database_path` via the download script (e.g., `tools/Download_2bRADTagDB_NCBI.pl`) as described above. Once all of these are downloaded, you can run the following command, which will output the estimated microbial profile. 169 | 170 | ``` 171 | perl bin/2bRADM_Pipline.pl \ 172 | -t 3 \ 173 | -l $your_database_path/list_mock \ 174 | -d $your_database_path/ \ 175 | -o outdir 176 | ``` 177 | 178 | ### **Output formats** 179 | 180 | 2bRAD-M offers a standard format for sample-wise results. You can find this standard profiling result of a single sample at `$outdir/quantitative/$sample_id.combine.xls`. Taking the `MSA1002` analysis as an example, the output is located at `outdir/quantitative/MSA1002.combine.xls`. The 2bRAD-M standard sample report format is tab-delimited with one line per taxon. The fields of the output, from left to right, are as follows: 181 | 182 | 1 to 7 - The taxonomic ranks for a microbial taxon identified: 1 - "Kingdom"; 2 - "Phylum"; 3 - "Class"; 4 - "Order"; 5 - "Family"; 6 - "Genus"; 7 - "Species" 183 | 8 - "Theoretical_Tag_Num": Average number of all 2bRAD marker tags of genomes under this taxon in theory 184 | 9 - "Sequenced_Tag_Num": Number of 2bRAD marker tags detected in the sequencing data under this taxon 185 | 10 - "Percent": The percent of sequenced 2bRAD marker tags under this taxon 186 | 11 - "Sequenced_Reads_Num": Total number of sequenced reads 187 | 12 - "Sequenced_Reads_Num/Theoretical_Tag_Num": The ratio of "Sequenced_Reads_Num" and "Theoretical_Tag_Num", which is further used for calculating the "relative abundance" of this taxon within a sample via a normalization by the column-wise sum 188 | 13 - "Sequenced_Reads_Num/Sequenced_Tag_Num": The ratio of "Sequenced_Reads_Num" and "Sequenced_Tag_Num" 189 | 14 - "Sequenced_Tag_Num(depth>1)": Number of sequenced tags that have >1 sequencing coverage 190 | 15 - "G_Score": the geometric mean of "Sequenced_Reads_Num" and "Sequenced_Tag_Num" (i.e., the square root of their product), which is used for controlling false positive discovery 191 | 192 | 193 | 2bRAD-M also offers a standard format for the study-wise result. If you provide multiple sample IDs and corresponding fasta/fastq file names in the list file, this pipeline can automatically merge the abundance profiling results from multiple samples into one feature table, which is located at `$outdir/quantitative/Abundance_Stat.all.xls`. If you set up negative-control samples for filtering potential contaminations in biological samples, you can find the filtered abundance profiles in `$outdir/quantitative/Abundance_Stat.filtered.xls`. Otherwise, these two files should be identical. The standard study report format is also tab-delimited with one line per taxon. The fields of the output, from left to right, are as follows: 194 | 195 | 1 to 7 - The taxonomic ranks for a microbial taxon identified: 1 - "Kingdom"; 2 - "Phylum"; 3 - "Class"; 4 - "Order"; 5 - "Family"; 6 - "Genus"; 7 - "Species" 196 | 8 to N - The column name indicates a sample ID in this study, where you can find the relative abundances of taxa within this sample. `N = (the number of samples) + 7` 197 | 198 | 
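To illustrate how these fields can be used, here is a minimal Perl sketch (not part of the pipeline; the script name and the assumption that header or comment lines start with `#` are ours) that reads a per-sample `*.combine.xls` report and recomputes relative abundances from column 12 by normalizing with the column-wise sum, as described above:

```perl
#!/usr/bin/env perl
# Minimal sketch: recompute per-sample relative abundances from a *.combine.xls report.
# Assumes the tab-delimited field order documented above (column 7 = Species,
# column 12 = Sequenced_Reads_Num/Theoretical_Tag_Num) and '#'-prefixed header lines.
use strict;
use warnings;

my $report = shift @ARGV or die "Usage: perl summarize_combine.pl sample.combine.xls\n";
open my $fh, '<', $report or die "cannot open $report\n";

my (%ratio, $sum);
while (<$fh>) {
    next if /^#/ || /^$/;                 # skip header/comment and blank lines
    chomp;
    my @f = split /\t/;
    my ($species, $r) = ($f[6], $f[11]);  # 0-based indices of columns 7 and 12
    $ratio{$species} += $r;
    $sum            += $r;
}
close $fh;
die "no taxa parsed from $report\n" unless $sum;

# Relative abundance = column 12 normalized by its column-wise sum
for my $species (sort { $ratio{$b} <=> $ratio{$a} } keys %ratio) {
    printf "%s\t%.6f\n", $species, $ratio{$species} / $sum;
}
```

The values printed this way should correspond, up to rounding, to the relative abundances reported for that sample in the merged `Abundance_Stat.all.xls` table.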
199 | ## 2bRAD-M scripts for customized analyses 200 | * [Extract 2b tags](scripts/2bRADExtraction.pl) This script conducts digital type-2B-restriction digestion of DNA data generated by a wide range of sequencing protocols by one of 16 restriction enzymes. For a given type 2b restriction enzyme, it can return a Fasta file including the resulting 2b-RAD tags, and a statistical summary including the raw number of input sequences, the restriction enzyme used, the number of restriction fragments produced, and the percentage of restriction fragments over the whole (meta)genome data. 201 | * [Build your own customized 2b-Tag-DB](scripts/CreateQualDatabase_2bRAD.pl) This script constructs the taxa-specific 2b-RAD reference genome database from a whole-genome reference database. 202 | * [Species profiling for a single sample based on 2bRAD markers of one or multiple enzymes](scripts/CalculateRelativeAbundance_Combined2bEnzymes.pl) This script computes the relative abundance of taxa identified from each of the 2b-RAD samples using a precalculated taxa-specific 2b-RAD reference database by one or multiple type 2b restriction enzymes. 
203 | * [Species profiling for a single sample based on 2bRAD markers of multiple enzymes](scripts/CalculateRelativeAbundance_Single2bEnzymes.pl) This script computes the relative abundance of taxa identified from each of 2b-RAD samples using a precalcuated taxa-specific 2b-RAD reference database by a single type 2b restriction enzyme. 204 | * [Merge species profiles for multiple samples](scripts/MergeProfilesFromMultipleSamples.pl) This script can merge the abundance profiles from mulitple samples and filter potential contaminations in each biological sample using negative control samples. 205 | 206 | ## Reference 207 | * Wang S, Meyer E, McKay JK, Matz MV. 2b-RAD: a simple and flexible method for genome-wide genotyping. Nat Methods. 2012 May 20;9(8):808-10. doi: 10.1038/nmeth.2023. PMID: 22609625. 208 | * Wang S, Liu P, Lv J, Li Y, Cheng T, Zhang L, Xia Y, Sun H, Hu X, Bao Z. Serial sequencing of isolength RAD tags for cost-efficient genome-wide profiling of genetic and epigenetic variations. Nat Protoc. 2016 Nov;11(11):2189-2200. doi: 10.1038/nprot.2016.133. Epub 2016 Oct 6. PMID: 27711051. 209 | 210 | ## Acknowledgement 211 | 212 | This work was funded by Grant 31800088 from National Natural Science Foundation and 2019M652501 from China Postdoctoral Science Foundation, and Taishan Scholar Fund of Shandong Province of China. 213 | 214 | -------------------------------------------------------------------------------- /bin/2bRADM_Pipline.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Authors: Zheng Sun , Rongchao Zhang, Shi Huang 3 | 4 | use warnings; 5 | use strict; 6 | use Getopt::Long; 7 | use FindBin qw($Bin); 8 | use File::Basename qw(dirname basename); 9 | use Parallel::ForkManager; 10 | use Cwd 'abs_path'; 11 | 12 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 13 | my $time="20210102"; 14 | 15 | #scripts path 16 | my $Bin="$Bin/../scripts/"; 17 | 18 | #定性参数 19 | my $qual ||="yes";#是否进行定性 20 | my $site1 ||=5;#酶切位点 21 | my $level1 ||="species";#水平 22 | 23 | #定量参数 24 | my $quan ||="yes";#是否进行定量 25 | my $g_score_threshold ||=5;#对定性的合并结果,进行分类筛选 gscore阈值 26 | my $GCF_threshold ||=1;#鉴定到某个基因组几个标签以上,该基因组才会被纳入定量建库 27 | my $site2 ||=5;#酶切位点 28 | my $level2 ||="species";#水平 29 | 30 | #数据酶切,基因组酶切CPU 31 | my $cpu1 ||=10; 32 | #多酶定性cpu 33 | my $cpu2 ||=8; 34 | 35 | #数据质控参数 36 | my $qc ||="yes";#是否进行质控 37 | my $qc_n ||=0.08; 38 | my $qc_q ||=30; 39 | my $qc_p ||=80; 40 | my $qc_b ||=33; 41 | 42 | my $mock_sample=""; 43 | my $negative_control_sample=""; 44 | 45 | select STDOUT;$|=1;#标准输出清楚缓存 46 | 47 | my($list,$type,$database,$outdir,$help); 48 | GetOptions( 49 | "t:i" => \$type, 50 | "l:s" => \$list, 51 | "d:s" => \$database, 52 | "o:s" => \$outdir, 53 | #初步定性 54 | "p:s" => \$qual, 55 | "s1:s" => \$site1, 56 | "t1:s" => \$level1, 57 | 58 | #精细定量 59 | "q:s" => \$quan, 60 | "gsc:i" => \$g_score_threshold,#筛选分类 61 | "gcf:i" => \$GCF_threshold,#筛选分类中的基因组 62 | "s2:s" => \$site2, 63 | "t2:s" => \$level2, 64 | 65 | #cpu 66 | "c1:i" => \$cpu1, 67 | "c2:i" => \$cpu2, 68 | 69 | #质控参数 70 | "qc:s" => \$qc, 71 | "qcn:f" => \$qc_n, 72 | "qcq:i" => \$qc_q, 73 | "qcp:i" => \$qc_p, 74 | "qcb:i" => \$qc_b, 75 | 76 | #丰度结果过滤 77 | "ms:s" => \$mock_sample, 78 | "ncs:s" => \$negative_control_sample, 79 | 80 | "h|help:s" => \$help, 81 | ); 82 | 83 | sub usage{#帮助 84 | print STDERR "\e[;33;1m 85 | DESCRIPTION 86 | We here provided a streamlined 2bRAD pipeline for analyzing microbial compositions from the 2bRAD/shotgun metagenomics data based on 
the species-specific 2bRAD markers. 87 | USAGE 88 | perl $0 89 | PARAMETERS 90 | -t The acceptable types of an input sequencing data file. The file path should be also listed in the sample list file (para -l) 91 | [1] generic genome data in a fasta format 92 | [2] shotgun metagenomic data in a fastq format(either SE or PE platform is accepted) 93 | [3] 2bRAD data from a SE sequencing platform in a fastq format 94 | [4] 2bRAD data from a PE sequencing platform in a fastq format 95 | -l The filepath of the sample list. Each line includes an input sample ID and the file path of corresponding DNA sequence data where each field should be separated by . A line in this file that begins with # will be ignored. Only four formats of a sample list file are accepted and should match with parameter -t: 96 | [1] samplesample.fa(.gz) 97 | [2] sampleshotgun.1.fq(.gz)(shotgun.2.fq.gz) 98 | [3] sample2bsingle.fq(.gz or 2bsingle.1.fq.gz) 99 | [4] sample1sample2sample3sample4sample5R1.fq(.gz)R2.fq(.gz) 100 | -d The working path of 2B-Tag-DB 101 | -o The output directory (if it doesn't exist, will be created automatically as 'outdir') 102 | OPTIONS of Qualitative Analysis 103 | -p If qualitative analysis applies or not [default: $qual] (yes or no) 104 | -s1 The enzymatic site(s) for the qualitative analysis. One or more sites can be specified(comma separated) [default: $site1] 105 | It represents which enzymatic recognition site(s) will be used for digital restriction digestion, and contructing 2b-Tag-DB for the following qualitative analysis and quantitative analysis. 106 | [1]CspCI [5]BcgI [9]BplI [13]CjePI [17]AllEnzyme 107 | [2]AloI [6]CjeI [10]FalI [14]Hin4I 108 | [3]BsaXI [7]PpiI [11]Bsp24I [15]AlfI 109 | [4]BaeI [8]PsrI [12]HaeIV [16]BslFI 110 | -t1 The taxonomic level for 2bRAD markers in the qualitative database, which should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. [default: $level1] 111 | OPTIONS of Quantitative Analysis 112 | -q If quantitative analysis applies or not [default: $quan] (yes or no) 113 | -gsc G score threshold for identifying the condidate microbes present in a sample in qualitative analysis, which also determines the membership of sample-specific 2B-Tag-DB in the quantitative analysis step. [default: $g_score_threshold, it means >$g_score_threshold] 114 | -gcf The threshold of the 2bRAD tag number for the presence of a microbial genome (i.e., GCF) in the qualitative analysis, which also determines the membership of sample-specific 2B-Tag-DB database in the quantitative analysis step. [default: $GCF_threshold, it means >$GCF_threshold] 115 | -s2 The enzymatic site for the quantitative analysis. (refer to -s1) [default: $site2, must be included in para -s1] 116 | -t2 The taxonomic level for 2bRAD markers in the quantitative analysis, which should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. [default: $level2] 117 | OPTIONS of CPU 118 | -c1 The number of CPUs used for parallelizing the digital digestion step for multiple samples. [default: 10] 119 | -c2 The number of CPUs used for parallelizing abundance profiling for multiple samples based on a single enzyme and combining results from multiple enzymes have been set via -s1. 
[default: 8] (each CPU needs about 15~65G of memory) 120 | OPTIONS of Quality Control 121 | -qc If quality control apply or not [default: $qc] (yes or no) 122 | -qcn The maximum ratio of base \"N\" [default: $qc_n] 123 | -qcq The minimum quality score to keep [default: $qc_q] 124 | -qcp The minimum percentage of bases that must have [-qcq] quality [default: $qc_p] 125 | -qcb ASCII+33 or ASCII+64 quality scores as Phred scores [default: $qc_b] 126 | OPTIONS of Merging profiles 127 | -ms The mock-community sample name(s) (separated by commas). The specified samples will be removed from the merged output table. 128 | -ncs The sample name(s) (separated by commas) of negative control that can be used for filtering potential contaminations. 129 | -h | help Print this help information. 130 | 131 | AUTHOR: $author $time\e[0m\n"; 132 | } 133 | 134 | if(defined($help)){ 135 | &usage; 136 | exit 0; 137 | } 138 | 139 | 140 | my %hs_site2enzyme=(#酶切位点对应表 141 | '1' => 'CspCI', '2' => 'AloI', 142 | '3' => 'BsaXI', '4' => 'BaeI', 143 | '5' => 'BcgI', '6' => 'CjeI', 144 | '7' => 'PpiI', '8' => 'PsrI', 145 | '9' => 'BplI', '10' => 'FalI', 146 | '11' => 'Bsp24I', '12' => 'HaeIV', 147 | '13' => 'CjePI', '14' => 'Hin4I', 148 | '15' => 'AlfI', '16' => 'BslFI', 149 | ); 150 | #参数检测 151 | unless($type && $list && $database && $outdir){ 152 | &usage; 153 | print STDERR "Parameter -t -l -d or -o error.\n"; 154 | exit 1; 155 | } 156 | 157 | #转化为绝对路径 158 | $list=abs_path($list); 159 | $database=abs_path($database); 160 | $outdir=abs_path($outdir); 161 | 162 | #输入数据类型检测 163 | unless($type==1 || $type==2 || $type==3 || $type==4){ 164 | &usage; 165 | print STDERR "Parameter -t is wrong."; 166 | exit 1; 167 | } 168 | 169 | #数据库文件检测 170 | unless(-e "$database/abfh_classify_with_speciename.txt.gz"){ 171 | print STDERR "incomplete database, $database/abfh_classify_with_speciename.txt.gz does not exists\n"; 172 | exit 1; 173 | } 174 | 175 | #定性参数检测 176 | unless($qual eq "no" || $qual eq "yes"){ 177 | &usage; 178 | print STDERR "Parameter -p is wrong. Cannot get $qual\n"; 179 | exit 1; 180 | } 181 | #定性鉴定水平检测 182 | unless($level1 eq "kingdom" || $level1 eq "phylum" || $level1 eq "class" || $level1 eq "order" || $level1 eq "family" || $level1 eq "genus" || $level1 eq "species" || $level1 eq "strain"){ 183 | &usage; 184 | print STDERR "Parameter -t1 is wrong. Cannot get $level1\n"; 185 | exit 1; 186 | } 187 | #定性酶切位点检查及数据库检测 188 | my %hs_site1; 189 | if($site1=~/17/){ 190 | $site1="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"; 191 | } 192 | my @site1=split /,/,$site1; 193 | for my $i(@site1){ 194 | $hs_site1{$i}++; 195 | unless(exists $hs_site2enzyme{$i}){ 196 | &usage; 197 | print STDERR "parameter -s1 is wrong, $i does not exists\n"; 198 | exit 1; 199 | } 200 | #检测定性数据库并检测species数据库,用于定量(findgenome脚本) 201 | unless(-e "$database/$hs_site2enzyme{$i}.$level1.fa.gz" && -e "$database/$hs_site2enzyme{$i}.species.fa.gz"){ 202 | &usage; 203 | print STDERR "incomplete database, $database/$hs_site2enzyme{$i}.$level1.fa.gz or $database/$hs_site2enzyme{$i}.species.fa.gz does not exists\n"; 204 | exit 1; 205 | } 206 | } 207 | 208 | #定量参数检测 209 | #定性鉴定水平检测 210 | unless($level2 eq "kingdom" || $level2 eq "phylum" || $level2 eq "class" || $level2 eq "order" || $level2 eq "family" || $level2 eq "genus" || $level2 eq "species" || $level2 eq "strain"){ 211 | &usage; 212 | print STDERR "Parameter -t2 is wrong. 
Cannot get $level2\n"; 213 | exit 1; 214 | } 215 | #定性酶切位点检查 216 | my @site2; 217 | if($quan eq "yes"){ 218 | if($site2=~/17/){ 219 | $site2="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"; 220 | } 221 | @site2=split /,/,$site2; 222 | for my $i(@site2){ 223 | unless(exists $hs_site2enzyme{$i}){#检测site2输入是否正确 224 | &usage; 225 | print STDERR "parameter -s2 is wrong, $i does not exists\n"; 226 | exit 1; 227 | } 228 | unless(exists $hs_site1{$i}){#检测site2是否包含于site1 229 | &usage; 230 | print STDERR "parameter -s2 is wrong, $i is not included in para -s1\n"; 231 | exit 1; 232 | } 233 | } 234 | }elsif($quan eq "no"){ 235 | ; 236 | }else{ 237 | &usage; 238 | print STDERR "parameter -q is wrong\n"; 239 | exit 1; 240 | } 241 | 242 | #质控参数检查 243 | unless($qc eq "yes" || $qc eq "no"){ 244 | &usage; 245 | print STDERR "Parameter -qc is wrong. Cannot get $qc\n"; 246 | } 247 | 248 | print STDOUT "###COMMAND: perl $0 -t $type -l $list -d $database -o $outdir -p $qual -s1 $site1 -t1 $level1 -q $quan -gsc $g_score_threshold -gcf $GCF_threshold -s2 $site2 -t2 $level2 -c1 $cpu1 -c2 $cpu2 -qc $qc -qcn $qc_n -qcq $qc_q -qcp $qc_p -qcb $qc_b -ms $mock_sample -ncs $negative_control_sample\n"; 249 | &CheckDir($outdir); 250 | #数据酶切 251 | &CheckDir("$outdir/enzyme_result"); 252 | print STDOUT "###Electronic digestion started, ",`date`; 253 | open LIST,"$list" or die "cannot open $list\n"; 254 | my $pm=new Parallel::ForkManager($cpu1); #多线程 255 | while(){ 256 | my $line=$_; 257 | if(/^#/ || /^$/){;}else{#去除注释行和空行 258 | chomp($line); 259 | my @tmp=split /\t/,$line; 260 | for my $i(@site1){ 261 | my $pid=$pm->start and next; 262 | if($type!=4){#除2brad五标签之外其他数据处理 263 | unless(-e "$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz"){ 264 | &execute("perl $Bin/2bRADExtraction.pl -i @tmp[1..$#tmp] -t $type -s $i -od $outdir/enzyme_result -op $tmp[0] -gz yes -qc $qc -n $qc_n -q $qc_q -p $qc_p -b $qc_b -fm fa 1>/dev/null"); 265 | `sleep 1s`; 266 | }else{ 267 | print STDOUT "$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz already exists, skip.\n"; 268 | `sleep 1s`; 269 | } 270 | }else{#2brad五标签处理 271 | unless(-e "$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz" && 272 | -e "$outdir/enzyme_result/$tmp[1].$hs_site2enzyme{$i}.fa.gz" && 273 | -e "$outdir/enzyme_result/$tmp[2].$hs_site2enzyme{$i}.fa.gz" && 274 | -e "$outdir/enzyme_result/$tmp[3].$hs_site2enzyme{$i}.fa.gz" && 275 | -e "$outdir/enzyme_result/$tmp[4].$hs_site2enzyme{$i}.fa.gz" ){ 276 | &execute("perl $Bin/2bRADExtraction.pl -i $tmp[-2] $tmp[-1] -t $type -s $i -od $outdir/enzyme_result -op @tmp[0..4] -gz yes -qc $qc -n $qc_n -q $qc_q -p $qc_p -b $qc_b -fm fa 1>/dev/null"); 277 | }else{ 278 | print STDOUT "$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz && "; 279 | print STDOUT "$outdir/enzyme_result/$tmp[1].$hs_site2enzyme{$i}.fa.gz && "; 280 | print STDOUT "$outdir/enzyme_result/$tmp[2].$hs_site2enzyme{$i}.fa.gz && "; 281 | print STDOUT "$outdir/enzyme_result/$tmp[3].$hs_site2enzyme{$i}.fa.gz && "; 282 | print STDOUT "$outdir/enzyme_result/$tmp[4].$hs_site2enzyme{$i}.fa.gz already exist, skip\n"; 283 | `sleep 1s`; 284 | } 285 | } 286 | $pm->finish; 287 | } 288 | } 289 | } 290 | $pm->wait_all_children; 291 | close LIST; 292 | print STDOUT "###Electronic digestion completed, ",`date`; 293 | 294 | 295 | ##整理列表 296 | &CheckDir("$outdir/list"); 297 | for my $i(@site1){ 298 | open LIST,"$list" or die "cannot open $list\n"; 299 | open OU,">$outdir/list/$hs_site2enzyme{$i}.list" or die "cannot open $outdir/list/$hs_site2enzyme{$i}.list\n"; 300 | 
while(){ 301 | next if(/^#/ || /^$/); 302 | chomp; 303 | my @tmp=split /\t/; 304 | if($type!=4){#除2brad五标签之外其他数据处理 305 | print OU "$tmp[0]\t$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz\n"; 306 | }else{#2brad五标签处理 307 | print OU "$tmp[0]\t$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz\n"; 308 | print OU "$tmp[1]\t$outdir/enzyme_result/$tmp[1].$hs_site2enzyme{$i}.fa.gz\n"; 309 | print OU "$tmp[2]\t$outdir/enzyme_result/$tmp[2].$hs_site2enzyme{$i}.fa.gz\n"; 310 | print OU "$tmp[3]\t$outdir/enzyme_result/$tmp[3].$hs_site2enzyme{$i}.fa.gz\n"; 311 | print OU "$tmp[4]\t$outdir/enzyme_result/$tmp[4].$hs_site2enzyme{$i}.fa.gz\n"; 312 | } 313 | } 314 | close LIST; 315 | close OU; 316 | } 317 | if($type==4){#2brad五标签样品名格式行转化成列 318 | open LIST,"$list" or die "cannot open $list\n"; 319 | open OULI,">$outdir/list/2brad_5tag.list" or die "cannot open $outdir/list/2brad_5tag.list\n"; 320 | while(){ 321 | next if(/^#/ || /^$/);#去除注释行和空行 322 | chomp; 323 | my @tmp=split /\t/; 324 | for my $i(0..4){ 325 | print OULI "$tmp[$i]\t$outdir/quantitative/$tmp[$i]/$tmp[$i].combine.xls\n"; 326 | } 327 | } 328 | close LIST; 329 | close OULI; 330 | }else{ 331 | open LIST,"$list" or die "cannot open $list\n"; 332 | open OULI,">$outdir/list/Abundance_Stat.list" or die "cannot open $outdir/list/Abundance_Stat.list\n"; 333 | while(){ 334 | next if(/^#/ || /^$/);#去除注释行和空行 335 | chomp; 336 | my @tmp=split /\t/; 337 | print OULI "$tmp[0]\t$outdir/quantitative/$tmp[0]/$tmp[0].combine.xls\n"; 338 | } 339 | close LIST; 340 | close OULI; 341 | } 342 | 343 | ##多线程初步定性 344 | if($qual eq "yes"){#是否需要定性 345 | print STDOUT "###Qualitative analysis started, ",`date`; 346 | &CheckDir("$outdir/qualitative"); 347 | $pm=new Parallel::ForkManager($cpu2); 348 | for my $i(@site1){ 349 | my $pid=$pm->start and next; 350 | &execute("perl $Bin/CalculateRelativeAbundance_Single2bEnzyme.pl -l $outdir/list/$hs_site2enzyme{$i}.list -d $database/ -t $level1 -s $i -o $outdir/qualitative -g 0 -v yes 1> /dev/null");#未对G_score过滤 351 | `sleep 1s`; 352 | $pm->finish; 353 | } 354 | $pm->wait_all_children; 355 | 356 | ##定性结果合并 357 | if($type!=4){#除2brad五标签之外其他数据处理 358 | &execute("perl $Bin/CalculateRelativeAbundance_Combined2bEnzymes.pl -l $list -s $site1 -io $outdir/qualitative -m combine -g 0");#未对G_score过滤 359 | }else{#2brad五标签处理 360 | &execute("perl $Bin/CalculateRelativeAbundance_Combined2bEnzymes.pl -l $outdir/list/2brad_5tag.list -s $site1 -io $outdir/qualitative -m combine -g 0");#未对G_score过滤 361 | } 362 | print STDOUT "###Qualitative completed, ",`date`; 363 | }else{ 364 | if($quan eq "no"){ 365 | print STDOUT "All Done, ",`date`; 366 | exit 0; 367 | } 368 | } 369 | 370 | 371 | if($quan eq "no"){ 372 | print STDOUT "All Done, ",`date`; 373 | exit 0; 374 | } 375 | #精细定量 376 | print STDOUT "###Quantitative analysis started, ",`date`; 377 | 378 | &CheckDir("$outdir/quantitative_sdb"); 379 | &CheckDir("$outdir/quantitative"); 380 | 381 | #FindGenome_ByQualitative 382 | if($type!=4){#除2brad五标签之外其他数据处理 383 | &execute("perl $Bin/FindGenome_ByQualitative.pl -l $list -d $database -o $outdir/quantitative_sdb -qualdir $outdir/qualitative -gscore $g_score_threshold -gcf $GCF_threshold"); 384 | }else{#2brad五标签处理 385 | &execute("perl $Bin/FindGenome_ByQualitative.pl -l $outdir/list/2brad_5tag.list -d $database -o $outdir/quantitative_sdb -qualdir $outdir/qualitative -gscore $g_score_threshold -gcf $GCF_threshold"); 386 | } 387 | 388 | if($type!=4){#除2brad五标签之外其他数据处理 389 | open LIST,"$list" or die "cannot open $list\n"; 390 | }else{#2brad五标签处理 
391 | open LIST,"$outdir/list/2brad_5tag.list" or die "cannot open $outdir/list/2brad_5tag.list\n"; 392 | } 393 | $pm=new Parallel::ForkManager($cpu2);#样品间酶切位点间多线程并行 394 | my $rm; 395 | while(){ 396 | my $line=$_; 397 | if(/^#/ || /^$/){;}else{#去除注释行和空行 398 | chomp($line); 399 | my $sample_name=(split /\t/,$line)[0]; 400 | $rm .=" $outdir/quantitative_sdb/$sample_name/database "; 401 | print STDOUT "Analyze $sample_name, ",`date`; 402 | &CheckDir("$outdir/quantitative_sdb/$sample_name/database"); 403 | &execute("cp $outdir/quantitative_sdb/$sample_name/sdb.list $outdir/quantitative_sdb/$sample_name/database/abfh_classify_with_speciename.txt && gzip -f $outdir/quantitative_sdb/$sample_name/database/abfh_classify_with_speciename.txt"); 404 | #精细定量开始 405 | for my $i(@site2){ 406 | my $pid=$pm->start and next; 407 | open SA,">$outdir/quantitative_sdb/$sample_name/$hs_site2enzyme{$i}.list" or die "cannot open $outdir/quantitative_sdb/$sample_name/$hs_site2enzyme{$i}.list\n"; 408 | print SA "$sample_name\t$outdir/enzyme_result/$sample_name.$hs_site2enzyme{$i}.fa.gz\n"; 409 | close SA; 410 | if(-e "$outdir/quantitative_sdb/$sample_name/sdb.list"){#检测sdb.list文件 411 | my $file_wc="$outdir/quantitative_sdb/$sample_name/sdb.list"; 412 | my @wc_l=split /\s+/,`wc -l $file_wc`;#检测sdb.list文件 基因组行数 413 | if($wc_l[0]!=0){#list中有基因组 414 | &execute("perl $Bin/CreateQuanDatabase_2bRAD.pl -l $outdir/quantitative_sdb/$sample_name/sdb.list -e $database/$hs_site2enzyme{$i}.species.fa.gz -s $i -t $level2 -o $outdir/quantitative_sdb/$sample_name/database -r no 1> /dev/null");#建库 415 | &execute("perl $Bin/CalculateRelativeAbundance_Single2bEnzyme.pl -l $outdir/quantitative_sdb/$sample_name/$hs_site2enzyme{$i}.list -d $outdir/quantitative_sdb/$sample_name/database -t $level2 -s $i -o $outdir/quantitative -g 0 -v yes 1> /dev/null");#定量 不对gscore进行过滤 416 | }else{#list中无基因组 417 | print STDERR "[ERROR] $outdir/quantitative_sdb/$sample_name/sdb.list does not exist or the content is empty, Sample $sample_name could not be quantitatively analyzed.\n"; 418 | } 419 | } 420 | $pm->finish; 421 | } 422 | } 423 | } 424 | $pm->wait_all_children; 425 | close LIST; 426 | &execute("rm -rf $rm");#删除database 427 | 428 | #精细定量结果合并 429 | if($type!=4){#除2brad五标签之外其他数据处理 430 | &execute("perl $Bin/CalculateRelativeAbundance_Combined2bEnzymes.pl -l $list -s $site2 -io $outdir/quantitative -m combine -g 0 1> /dev/null");#不对gscore过滤 431 | }else{#2brad五标签处理 432 | &execute("perl $Bin/CalculateRelativeAbundance_Combined2bEnzymes.pl -l $outdir/list/2brad_5tag.list -s $site2 -io $outdir/quantitative -m combine -g 0 1> /dev/null");#不对gscore过滤 433 | } 434 | print STDOUT "###Quantitative analysis complete, ",`date`; 435 | 436 | print STDOUT "###Merging abundance profiles from multiple samples started, ",`date`; 437 | if($type!=4){#除2brad五标签之外其他数据处理 438 | &execute("perl $Bin/MergeProfilesFromMultipleSamples.pl -l $outdir/list/Abundance_Stat.list -o $outdir/quantitative -p Abundance_Stat -m $mock_sample -c $negative_control_sample"); 439 | }else{#2brad五标签处理 440 | &execute("perl $Bin/MergeProfilesFromMultipleSamples.pl -l $outdir/list/2brad_5tag.list -o $outdir/quantitative -p Abundance_Stat -m $mock_sample -c $negative_control_sample"); 441 | } 442 | 443 | print STDOUT "###Merging abundance profiles completed, ",`date`; 444 | 445 | print STDOUT "All Done, ",`date`; 446 | 447 | sub CheckDir{#创建目录 448 | my $file = shift; 449 | unless( -d $file ){ 450 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 451 | else{print STDERR "$file not 
exists and cannot be built\n";exit 1;} 452 | } 453 | return 1; 454 | } 455 | 456 | 457 | sub execute{#打出命令并执行 458 | my $cmd = shift; 459 | print STDOUT "$cmd\n"; 460 | my $exit_code=system($cmd); 461 | if($exit_code!=0){ 462 | print STDERR "Command $cmd failed with an exit code of $exit_code.\n"; 463 | exit($exit_code >> 8); 464 | } 465 | } 466 | -------------------------------------------------------------------------------- /scripts/2bRADExtraction.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Authors: Zheng Sun, Rongchao Zhang, Shi Huang 3 | # Last update: 2020.06.03 4 | use strict; 5 | use warnings; 6 | use Getopt::Long; 7 | use FindBin qw($Bin); 8 | use File::Basename qw(dirname basename); 9 | no strict 'refs'; 10 | 11 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 12 | my $time="2020.06.03"; 13 | 14 | #set default parameters 15 | my $ncount ||=0.08; 16 | my $quality ||=30; 17 | my $percent ||=80; 18 | my $qbase ||=33; 19 | my $format ||="fa"; 20 | my $gz ||="yes"; 21 | my $q_control ||="yes"; 22 | my $pear_cpu ||=1; 23 | #set software path 24 | my $pear ||="pear"; 25 | 26 | select STDOUT;$|=1;# Standard output for clearing cache 27 | 28 | my (@input,$type,$site,$outdir,@outprefix); 29 | my $help; 30 | GetOptions( 31 | "i:s{1,2}" => \@input,# single-end or double-end reads 32 | "t:i" => \$type, #fa:reference genome data;fq:shotgun metagenomics data,single 2b-RAD tags,five concatenated 2b-RAD tags 33 | "s:i" => \$site, # restriction sites 34 | "od:s" => \$outdir, # output directory 35 | "op:s{1,5}" => \@outprefix, # output prefix 36 | 37 | "gz:s" => \$gz, # do compression or not for the outputs 38 | 39 | "qc:s" => \$q_control, # QC: Whether QC is required 40 | "n:f" => \$ncount, # QC: The # of ambiguity nucleotides N allowed 41 | "q:i" => \$quality, # QC: Minimum per-base quality score (XX) 42 | "p:i" => \$percent, # Minimum percentage of bases that must have per-base quality score over [-q] 43 | "b:i" => \$qbase, # Quality values of bases 44 | "fm:s" => \$format, # The output file format: fa/fq 45 | 46 | #software 47 | "pe:s" => \$pear,#pear soft 48 | "pc:i" => \$pear_cpu,#cpu of pear soft 49 | 50 | "h|help:s" => \$help, 51 | ); 52 | 53 | sub usage{ 54 | print STDERR "\e[;33;1m 55 | Description 56 | It performs digital type-2B-restriction disgest of DNA data generated by a wide range of sequencing protocols by one of 16 restriction enzymes.For a given type 2b restriction enzyme, it can return a Fasta file including resulting 2b-RAD tags, and a statistical summary including raw number ofinput sequences, restriction enzyme used, number of restriction fragments produced, percentage of restriction fragments over the whole (meta)genomedata. The four application scenarios of this script are listed as below: 57 | 1.To extract 2b-RAD tags from reference genome(s) data, run: perl EeTt.pl -i genome.fa(.gz) -t 1 -s 1 -od . -op sample 58 | 2.To extract 2b-RAD tags from shotgun metagenomic sequencing data, run: perl EeTt.pl -i shotgun.R1.fq(.gz) (or shotgun.R1.fq.gz shotgun.R2.fq.gz) -t 2 -s 1 -od . -op sample 59 | 3.To extract single 2b-RAD tags from SE or PE sequencing data, run: perl EeTt.pl -i 2b-RADsingle.fq(.gz) (or 2b-RADsingle.R1.fq.gz from PE platform) -t 3 -s 1 -od . -op sample 60 | 4.To split five concatenated 2b-RAD tags from PE sequencing data, run: perl EeTt.pl -i 2b-RAD.R1.fq(.gz) 2b-RAD.R2.fq(.gz) -t 4 -s 1 -od . 
-op sample1 sample2 sample3 sample4 sample5 61 | Usage 62 | perl $0 -i -t -s -od -op [options]* 63 | Required: 64 | -i Input fasta/fastq filepath (.gz supported) 65 | -t The type of input fasta/fastq file 66 | [1] Reference genome data in the Fasta format 67 | [2] Shotgun (meta)genome data in the Fastq format 68 | [3] SE/PE sequencing data in the Fastq format 69 | [4] PE sequencing data in the Fastq format 70 | -s One of the type 2b restriction enzymes (sites). 71 | [1]CspCI [9]BplI 72 | [2]AloI [10]FalI 73 | [3]BsaXI [11]Bsp24I 74 | [4]BaeI [12]HaeIV 75 | [5]BcgI [13]CjePI 76 | [6]CjeI [14]Hin4I 77 | [7]PpiI [15]AlfI 78 | [8]PsrI [16]BslFI 79 | -od The output directory (automatically create if it does not exist) 80 | -op The output prefix (recommended: sample name(s)) 81 | Optional: 82 | -gz Whether the output file is compressed (yes or no) [$gz] 83 | -h|help print this help 84 | Optional (only applicable when -t equals 2, or 3, or 4, i.e. taking fastq data as input): 85 | -qc Whether quality control is required (yes or no) [$q_control] 86 | -n Maximum percentage of ambiguity bases \"N\" [default: $ncount] 87 | -q Minimum per-base quality score [default: $quality] 88 | -p Minimum percentage of bases that must have per-base quality score over [-q] [default: $percent] 89 | -b Phred quality score type [default: $qbase] 90 | -fm Output file format (fa or fq) [default: $format] 91 | -pe Path of pear soft [$pear] 92 | -pc Cpu of pear soft [$pear_cpu] 93 | Author $author $time\e[0m\n"; 94 | } 95 | 96 | if(defined($help)){ 97 | &usage; 98 | exit 0; 99 | } 100 | 101 | unless(@input && $type && $site && $outdir && @outprefix){# parameters checking 102 | &usage; 103 | print STDERR "Please check parameter -i -t -s -od -op\n"; 104 | exit 1; 105 | } 106 | 107 | 108 | 109 | # checking input args 110 | unless($gz eq "yes" || $gz eq "no"){ 111 | &usage; 112 | print STDERR "Parameter -gz is wrong\n"; 113 | exit 1; 114 | } 115 | unless($format eq "fa" || $format eq "fq"){ 116 | &usage; 117 | print STDERR "Parameter -fm is wrong\n"; 118 | exit 1; 119 | } 120 | unless($q_control eq "yes" || $q_control eq "no"){ 121 | &usage; 122 | print STDERR "Parameter -qc is wrong\n"; 123 | exit 1; 124 | } 125 | 126 | # Define the DNA sequences at a given restriction enzyme site 127 | my ($enzyme,@site,@start,@end,$minpear,$maxpear); 128 | if( 1 == $site ){#CspCI 129 | @site = ( 130 | '[AGCT]{11}CAA[AGCT]{5}GTGG[AGCT]{10}', 131 | '[AGCT]{10}CCAC[AGCT]{5}TTG[AGCT]{11}', 132 | ); 133 | $enzyme="CspCI"; 134 | @start = (0,37,78,119,160); 135 | @end = (41,82,123,164,205); 136 | }elsif( 2 == $site ){#AloI 137 | @site = ( 138 | '[AGCT]{7}GAAC[AGCT]{6}TCC[AGCT]{7}', 139 | '[AGCT]{7}GGA[AGCT]{6}GTTC[AGCT]{7}', 140 | ); 141 | $enzyme="AloI"; 142 | @start = (0,38,80,122,164); 143 | @end = (42,84,126,168,210); 144 | }elsif( 3 == $site ){#BsaXI 145 | @site = ( 146 | '[AGCT]{9}AC[AGCT]{5}CTCC[AGCT]{7}', 147 | '[AGCT]{7}GGAG[AGCT]{5}GT[AGCT]{9}', 148 | ); 149 | $enzyme="BsaXI"; 150 | @start = (0,33,69,105,141); 151 | @end = (35,71,107,143,180); 152 | $minpear ||=173; 153 | $maxpear ||=181; 154 | }elsif( 4 == $site ){#BaeI 155 | @site = ( 156 | '[AGCT]{10}AC[AGCT]{4}GTA[CT]C[AGCT]{7}', 157 | '[AGCT]{7}G[AG]TAC[AGCT]{4}GT[AGCT]{10}', 158 | ); 159 | $enzyme="BaeI"; 160 | @start = (0,38,79,120,161); 161 | @end = (40,81,122,163,205); 162 | $minpear ||=198; 163 | $maxpear ||=206; 164 | }elsif( 5 == $site ){#BcgI 165 | @site = ( 166 | '[AGCT]{10}CGA[AGCT]{6}TGC[AGCT]{10}', 167 | '[AGCT]{10}GCA[AGCT]{6}TCG[AGCT]{10}', 168 | ); 169 | 
$enzyme="BcgI"; 170 | @start = (0,36,75,114,153); 171 | @end = (38,77,116,155,195); 172 | $minpear ||=188; 173 | $maxpear ||=196; 174 | }elsif( 6 == $site ){#CjeI 175 | @site = ( 176 | '[AGCT]{8}CCA[AGCT]{6}GT[AGCT]{9}', 177 | '[AGCT]{9}AC[AGCT]{6}TGG[AGCT]{8}', 178 | ); 179 | $enzyme="CjeI"; 180 | @start = (0,40,83,126,169); 181 | @end = (42,85,128,171,214); 182 | }elsif( 7 == $site ){#PpiI 183 | @site = ( 184 | '[AGCT]{7}GAAC[AGCT]{5}CTC[AGCT]{8}', 185 | '[AGCT]{8}GAG[AGCT]{5}GTTC[AGCT]{7}', 186 | ); 187 | $enzyme="PpiI"; 188 | @start = (0,37,77,117,157); 189 | @end = (39,79,119,159,199); 190 | }elsif( 8 == $site ){#PsrI 191 | @site = ( 192 | '[AGCT]{7}GAAC[AGCT]{6}TAC[AGCT]{7}', 193 | '[AGCT]{7}GTA[AGCT]{6}GTTC[AGCT]{7}', 194 | ); 195 | $enzyme="PsrI"; 196 | @start = (0,37,77,117,157); 197 | @end = (39,79,119,159,199); 198 | }elsif( 9 == $site ){#BplI 199 | @site = ( 200 | '[AGCT]{8}GAG[AGCT]{5}CTC[AGCT]{8}', #palindromes 201 | ); 202 | $enzyme="BplI"; 203 | @start = (0,37,77,117,157); 204 | @end = (39,79,119,159,199); 205 | }elsif( 10 == $site ){#FalI 206 | @site = ( 207 | '[AGCT]{8}AAG[AGCT]{5}CTT[AGCT]{8}', #palindromes 208 | ); 209 | $enzyme="FalI"; 210 | @start = (0,37,77,117,157); 211 | @end = (39,79,119,159,200); 212 | $minpear ||=193; 213 | $maxpear ||=201; 214 | }elsif( 11 == $site ){#Bsp24I 215 | @site = ( 216 | '[AGCT]{8}GAC[AGCT]{6}TGG[AGCT]{7}', 217 | '[AGCT]{7}CCA[AGCT]{6}GTC[AGCT]{8}', 218 | ); 219 | $enzyme="Bsp24I"; 220 | @start = (0,37,77,117,157); 221 | @end = (39,79,119,159,200); 222 | }elsif( 12 == $site ){#HaeIV 223 | @site = ( 224 | '[AGCT]{7}GA[CT][AGCT]{5}[AG]TC[AGCT]{9}', 225 | '[AGCT]{9}GA[CT][AGCT]{5}[AG]TC[AGCT]{7}', 226 | ); 227 | $enzyme="HaeIV"; 228 | @start = (0,38,79,120,161); 229 | @end = (40,81,122,163,204); 230 | }elsif( 13 == $site ){#CjePI 231 | @site = ( 232 | '[AGCT]{7}CCA[AGCT]{7}TC[AGCT]{8}', 233 | '[AGCT]{8}GA[AGCT]{7}TGG[AGCT]{7}', 234 | ); 235 | $enzyme="CjePI"; 236 | @start = (0,39,81,123,165); 237 | @end = (41,83,125,167,209); 238 | }elsif( 14 == $site ){#Hin4I 239 | @site = ( 240 | '[AGCT]{8}GA[CT][AGCT]{5}[GAC]TC[AGCT]{8}', 241 | '[AGCT]{8}GA[CTG][AGCT]{5}[AG]TC[AGCT]{8}', 242 | ); 243 | $enzyme="Hin4I"; 244 | @start = (0,37,77,117,157); 245 | @end = (39,79,119,159,199); 246 | }elsif( 15 == $site ){#AlfI 247 | @site = ( 248 | '[AGCT]{10}GCA[AGCT]{6}TGC[AGCT]{10}', #palindromes 249 | ); 250 | $enzyme="AlfI"; 251 | @start = (0,36,75,114,153); 252 | @end = (38,77,116,155,194); 253 | }elsif( 16 == $site ){#BslFI ??some question?? 
single enzyme 254 | @site = ( 255 | '[AGCT]{6}GGGAC[AGCT]{14}', 256 | '[AGCT]{14}GTCCC[AGCT]{6}', 257 | ); 258 | $enzyme="BslFI"; 259 | @start = (0,34,72,110,148); 260 | @end = (38,76,114,152,190); 261 | }else{ 262 | &usage; 263 | print STDERR "Parameter -s must be an integer between 1 and 16\n"; 264 | exit 1; 265 | } 266 | 267 | &CheckDir($outdir); 268 | my $raw_reads_num=0;# number of raw reads; for paired-end shotgun data that will be merged, the pre-merge read count must be recorded separately 269 | 270 | if($#input==0 && $type==1 && $#outprefix==0){# reference genome(s) 271 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 1 -s $site -od $outdir -op $outprefix[0] -gz $gz\n"; 272 | print STDOUT "Electronic enzyme digestion of input genome(s) -- Start, ",`date`; 273 | &Electronic_enzyme; 274 | print STDOUT "Electronic enzyme digestion of input genome(s) -- End, ",`date`; 275 | }elsif($#input==0 && $type==2 && $#outprefix==0){# single-end reads from shotgun metagenomics 276 | if($q_control eq "yes"){# need QC 277 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 2 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -n $ncount -q $quality -p $percent -b $qbase -fm $format\n"; 278 | }else{# no QC 279 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 2 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -fm $format\n"; 280 | } 281 | print STDOUT "Tags extraction from shotgun metagenomics data -- Start, ",`date`; 282 | &fastq; 283 | print STDOUT "Tags extraction from shotgun metagenomics data -- End, ",`date`; 284 | }elsif($#input==1 && $type==2 && $#outprefix==0){# paired-end reads from shotgun metagenomics 285 | # unless(-e "$pear"){ 286 | # &usage; 287 | # print STDERR "Can not find software $pear\n"; 288 | # exit 1; 289 | # } 290 | if($q_control eq "yes"){# need QC 291 | print STDOUT "COMMAND: perl $0 -i $input[0] $input[1] -t 2 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -n $ncount -q $quality -p $percent -b $qbase -fm $format -pe $pear -pc $pear_cpu\n"; 292 | }else{# no QC 293 | print STDOUT "COMMAND: perl $0 -i $input[0] $input[1] -t 2 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -fm $format -pe $pear -pc $pear_cpu\n"; 294 | } 295 | print STDOUT "Tags extraction from shotgun metagenomics data -- Start, ",`date`; 296 | &pear; # Merge the paired-end reads using PEAR, a fast and accurate tool for merging paired-end reads from next-generation sequencing experiments.
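# For orientation, a rough shell equivalent of the &pear step below (illustrative only: the sample
# and output names are placeholders, and `pear` is assumed to be on the PATH):
#   pear -f sample.R1.fq.gz -r sample.R2.fq.gz -e -o outdir/sample.BcgI -j 1
#   cat outdir/sample.BcgI.assembled.fastq outdir/sample.BcgI.unassembled.forward.fastq outdir/sample.BcgI.unassembled.reverse.fastq | gzip > outdir/sample.BcgI.pear.fastq.gz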
297 | $input[0]="$outdir/$outprefix[0].$enzyme.pear.fastq.gz"; 298 | &fastq; 299 | &execute("rm -f $outdir/$outprefix[0].$enzyme.pear.fastq.gz"); 300 | print STDOUT "Tags extraction from shotgun metagenomics data -- End, ",`date`; 301 | }elsif($#input==0 && $type==3 && $#outprefix==0){# single 2b-RAD tags 302 | if($q_control eq "yes"){# need QC 303 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 3 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -n $ncount -q $quality -p $percent -b $qbase -fm $format\n"; 304 | }else{# no QC 305 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 3 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -fm $format\n"; 306 | } 307 | print STDOUT "Data split for single 2b-RAD tags from SE platform -- Start, ",`date`; 308 | &Single_Lable; 309 | print STDOUT "Data split for single 2b-RAD tags from SE platform -- End, ",`date`; 310 | }elsif($#input==1 && $type==4 && $#outprefix==4){# five concatenated 2b-RAD tags 311 | # unless(-e "$pear"){ 312 | # &usage; 313 | # print STDERR "Can not find $pear\n"; 314 | # exit 1; 315 | # } 316 | if($q_control eq "yes"){# need QC 317 | print STDOUT "COMMAND: perl $0 -i $input[0] $input[1] -t 4 -s $site -od $outdir -op ",join(" ",@outprefix[0..4])," -gz $gz -qc $q_control -n $ncount -q $quality -p $percent -b $qbase -fm $format -pe $pear -pc $pear_cpu\n"; 318 | }else{# no QC 319 | print STDOUT "COMMAND: perl $0 -i $input[0] $input[1] -t 4 -s $site -od $outdir -op ",join(" ",@outprefix[0..4])," -gz $gz -qc $q_control -fm $format -pe $pear -pc $pear_cpu\n"; 320 | } 321 | print STDOUT "Data split for five concatenated 2b-RAD tags from PE platform -- Start, ",`date`; 322 | &Five_Lable; 323 | print STDOUT "Data split for five concatenated 2b-RAD tags from PE platform -- End, ",`date`; 324 | }else{ 325 | &usage; 326 | print STDERR "Please check parameters -i -t -op: the number of input files and output prefixes must match the chosen -t mode\n"; 327 | exit 1; 328 | } 329 | 330 | 331 | sub pear{ 332 | my $r1=$input[0]; 333 | my $r2=$input[1]; 334 | if($r1=~/\.gz$/){# count the # of raw reads 335 | open R,"gzip -dc $r1|" or die "cannot open $r1\n"; 336 | }else{ 337 | open R,"$r1" or die "cannot open $r1\n"; 338 | } 339 | while(<R>){# count the reads of the paired-end shotgun data 340 | $raw_reads_num++; 341 | <R>; 342 | <R>; 343 | <R>;# skip the remaining three lines of each FASTQ record 344 | } 345 | close R; 346 | my $outprefix="$outprefix[0].$enzyme"; 347 | &execute("$pear -f $r1 -r $r2 -e -o $outdir/$outprefix -j $pear_cpu");# merge read pairs with PEAR; pairs whose insert is too short cannot be merged 348 | &execute("cat $outdir/$outprefix.assembled.fastq $outdir/$outprefix.unassembled.forward.fastq $outdir/$outprefix.unassembled.reverse.fastq | gzip > $outdir/$outprefix.pear.fastq.gz");# combine the merged reads with the unmergeable R1/R2 reads 349 | &execute("rm -f $outdir/$outprefix.assembled.fastq $outdir/$outprefix.unassembled.forward.fastq $outdir/$outprefix.unassembled.reverse.fastq"); 350 | &execute("rm -f $outdir/$outprefix.discarded.fastq"); 351 | } 352 | sub fastq{ 353 | my $fastq=$input[0]; 354 | my $outprefix=$outprefix[0]; 355 | if($fastq=~/\.gz$/){ 356 | open IN,"gzip -dc $fastq|" or die "cannot open $fastq\n"; 357 | }else{ 358 | open IN,"$fastq" or die "cannot open $fastq\n"; 359 | } 360 | if($gz eq "yes"){ 361 | open OU,"|gzip >$outdir/$outprefix.$enzyme.$format.gz" or die "cannot open $outdir/$outprefix.$enzyme.$format.gz\n"; 362 | }else{ 363 | open OU,">$outdir/$outprefix.$enzyme.$format" or die "cannot open $outdir/$outprefix.$enzyme.$format\n"; 364 | } 365 | open STAT,">$outdir/$outprefix.$enzyme.stat.xls" or die "cannot open $outdir/$outprefix.$enzyme.stat.xls\n"; 366 | if($raw_reads_num==0){# single-end reads from shotgun metagenomics 367 | print STAT "sample\tenzyme\tinput_reads_num\tenzyme_reads_num\tpercent\n"; 368 | }else{# paired-end shotgun data 369 | print STAT "sample\tenzyme\tinput_reads_num\tcombine_uncombineR1R2_reads_num\tenzyme_reads_num\tpercent\n"; 370 | }
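# Illustrative example of the tab-delimited *.stat.xls summary written by this sub (the header
# above is the real one for single-end input; the numbers below are made up):
#   sample   enzyme  input_reads_num  enzyme_reads_num  percent
#   sampleA  BcgI    1000000          52340             5.23%
# For merged paired-end input, the extra combine_uncombineR1R2_reads_num column holds the number
# of PEAR-merged reads plus unmergeable R1/R2 reads that were actually screened for the site.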
STAT "sample\tenzyme\tinput_reads_num\tenzyme_reads_num\tpercent\n"; 368 | }else{#双端shotgun数据 369 | print STAT "sample\tenzyme\tinput_reads_num\tcombine_uncombineR1R2_reads_num\tenzyme_reads_num\tpercent\n"; 370 | } 371 | my($input_reads_num,$enzyme_reads_num,$percent_sub); 372 | $enzyme_reads_num=0; 373 | while(){ 374 | $input_reads_num++;# When inputting paired-end shotgun metagenomics reads,we only record the # of merged reads 当输入数据是双端shotgun数据时,此时记录的是拼接后的reads数以及未能拼接的R1R2之和 375 | my $line=$_ . . . ; 376 | if($q_control eq "yes"){# QC 377 | next unless(&CheckN($line)); 378 | next unless(&CheckQ($line)); 379 | } 380 | my @tmp=split /\n/,$line; 381 | my %uniq;# deredundancy as some restriction sites will have palindromes 382 | for my $site(@site){ 383 | while($tmp[1]=~/($site)/g){ 384 | my $pos=pos($tmp[1]); 385 | my $seq=$1; 386 | my $len=length($seq); 387 | $pos=$pos-$len+1; 388 | my $qual=substr($tmp[3],$pos-1,$len); 389 | pos($tmp[1])=$pos; 390 | $uniq{$pos}{$len}="$tmp[0]-$pos\n$seq\n+\n$qual\n"; 391 | } 392 | } 393 | for my $pos(sort {$a<=>$b} keys %uniq){ 394 | for my $len(sort {$a<=>$b} keys %{$uniq{$pos}}){ 395 | $enzyme_reads_num++; 396 | my @a=split /\n/,$uniq{$pos}{$len}; 397 | if($format eq "fq"){ 398 | print OU "$a[0]\n$a[1]\n$a[2]\n$a[3]\n"; 399 | }elsif($format eq "fa"){ 400 | $a[0]=~s/^@/>/; 401 | print OU "$a[0]\n$a[1]\n"; 402 | } 403 | } 404 | } 405 | undef %uniq; 406 | } 407 | close IN; 408 | close OU; 409 | if($raw_reads_num==0){#single-end reads from shotgun metagenomics 410 | $percent_sub=sprintf "%.2f",$enzyme_reads_num/$input_reads_num*100; 411 | print STAT "$outprefix\t$enzyme\t$input_reads_num\t$enzyme_reads_num\t$percent_sub%\n"; 412 | }else{#双端shotgun数据 413 | $percent_sub=sprintf "%.2f",$enzyme_reads_num/$raw_reads_num*100; 414 | print STAT "$outprefix\t$enzyme\t$raw_reads_num\t$input_reads_num\t$enzyme_reads_num\t$percent_sub%\n"; 415 | } 416 | close STAT; 417 | } 418 | 419 | 420 | 421 | sub Five_Lable{# Data split for five concanated 2b-RAD tags 422 | my $r1=$input[0]; 423 | my $r2=$input[1]; 424 | my ($output,@fhandle); 425 | my $outprefix=$outprefix[0];# rename intermediate files using the first sample name 426 | my $input_reads_num; 427 | if($r1=~/\.gz$/){# record the # of raw reads 428 | open R,"gzip -dc $r1|" or die "cannot open $r1\n"; 429 | }else{ 430 | open R,"$r1" or die "cannot open $r1\n"; 431 | } 432 | while(){ 433 | $input_reads_num++; 434 | ; 435 | ; 436 | ; 437 | } 438 | close R; 439 | &execute("$pear -f $r1 -r $r2 -e -n $minpear -m $maxpear -o $outdir/$outprefix -j $pear_cpu");# merge data 440 | open IN,"$outdir/$outprefix.assembled.fastq" or die "cannot open $outdir/$outprefix.assembled.fastq\n"; 441 | for my $i(1..$#start+1){# open file handle 442 | my $fh="OU" . 
441 | for my $i(1..$#start+1){# open one output file handle per tag position 442 | my $fh="OU" . $i; 443 | my $j=$i-1; 444 | $output="$outdir/$outprefix[$j].$enzyme.$format"; 445 | if($gz eq "yes"){ 446 | open $fh,"|gzip > $output.gz" or die "cannot open $output.gz\n"; 447 | }elsif($gz eq "no"){ 448 | open $fh,"> $output" or die "cannot open $output\n"; 449 | } 450 | push @fhandle,$fh; 451 | } 452 | my $stat_name=join("-",@outprefix[0..4]); 453 | open STAT,">$outdir/$stat_name.$enzyme.stat.xls" or die "cannot open $outdir/$stat_name.$enzyme.stat.xls\n"; 454 | print STAT "sample\tenzyme\tinput_reads_num\tcombine_reads_num\tenzyme_reads_num\tqc_reads_num\tpercent\n"; 455 | my($combine_reads_num,%enzyme_reads_num,%qc_reads_num); 456 | $combine_reads_num=0; 457 | for my $i(0..$#start){ 458 | $qc_reads_num{$i}=0; 459 | $enzyme_reads_num{$i}=0; 460 | } 461 | while(<IN>){ 462 | $combine_reads_num++;# count PEAR-merged reads 463 | my $line=$_ . <IN> . <IN> . <IN>;# read the remaining three lines of the FASTQ record 464 | my @tmp=split /\n/,$line; 465 | for my $i(0..$#start){ 466 | my $fh=$fhandle[$i]; 467 | my $num=$i+1; 468 | my $id="$tmp[0]:$num";# FASTQ line 1: read ID with the tag index appended 469 | my $seq=substr($tmp[1],$start[$i],$end[$i]-$start[$i]+1);# FASTQ line 2: sequence window for this tag 470 | my $qual=substr($tmp[3],$start[$i],$end[$i]-$start[$i]+1);# FASTQ line 4: matching quality window 471 | for my $j(0..$#site){ 472 | if($seq=~s/^(\S*?)($site[$j])\S*/$2/){ 473 | $enzyme_reads_num{$i}++;# reads containing the recognition site 474 | my $begin=length($1); 475 | my $len=length($2); 476 | $qual=substr($qual,$begin,$len); 477 | my $sub_line="$id\n$seq\n$tmp[2]\n$qual"; 478 | if($q_control eq "yes"){# check if QC needed 479 | next unless(&CheckN($sub_line)); 480 | next unless(&CheckQ($sub_line)); 481 | } 482 | $qc_reads_num{$i}++;# reads passing QC 483 | if($format eq "fa"){ 484 | $id=~s/^@/>/; 485 | print $fh "$id\n$seq\n"; 486 | }elsif($format eq "fq"){ 487 | print $fh "$sub_line\n"; 488 | } 489 | last; 490 | } 491 | } 492 | } 493 | } 494 | close IN; 495 | for(@fhandle){ 496 | close $_; 497 | } 498 | for my $i(0..$#start){ 499 | my $percent_sub=sprintf "%.2f",$qc_reads_num{$i}/$input_reads_num*100; 500 | print STAT "$outprefix[$i]\t$enzyme\t$input_reads_num\t$combine_reads_num\t$enzyme_reads_num{$i}\t$qc_reads_num{$i}\t$percent_sub%\n"; 501 | } 502 | close STAT; 503 | &execute("rm -f $outdir/$outprefix.assembled.fastq"); 504 | &execute("rm -f $outdir/$outprefix.unassembled.forward.fastq $outdir/$outprefix.unassembled.reverse.fastq"); 505 | &execute("rm -f $outdir/$outprefix.discarded.fastq"); 506 | } 507 | 508 | 509 | sub Single_Lable{ 510 | my $r=$input[0]; 511 | my $outprefix=$outprefix[0]; 512 | my $output; 513 | if($r=~/\.gz$/){ 514 | open IN,"gzip -dc $r|" or die "cannot open $r\n"; 515 | }else{ 516 | open IN,"$r" or die "cannot open $r\n"; 517 | } 518 | $output="$outdir/$outprefix.$enzyme.$format"; 519 | if($gz eq "yes"){ 520 | open OU,"|gzip > $output.gz" or die "cannot open $output.gz\n"; 521 | }elsif($gz eq "no"){ 522 | open OU,"> $output" or die "cannot open $output\n"; 523 | } 524 | open STAT,">$outdir/$outprefix.$enzyme.stat.xls" or die "cannot open $outdir/$outprefix.$enzyme.stat.xls\n"; 525 | print STAT "sample\tenzyme\tinput_reads_num\tenzyme_reads_num\tqc_reads_num\tpercent\n"; 526 | my ($input_reads_num,$enzyme_reads_num,$qc_reads_num,$percent_sub); 527 | $qc_reads_num=0;
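# Worked example of the QC thresholds applied in the loop below, using the default settings
# (-n 0.08 -q 30 -p 80 -b 33) and a 32 bp BcgI tag: CheckN passes if the tag contains at most
# int(0.08*32)=2 ambiguous "N" bases, and CheckQ passes if at least 80% of its bases (26 of 32)
# have Phred quality >= 30, i.e. an ASCII value >= 63 (the character '?') under the Phred+33 offset.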
528 | while(<IN>){ 529 | $input_reads_num++;# count raw input reads 530 | my $line=$_ . <IN> . <IN> . <IN>;# read the remaining three lines of the FASTQ record 531 | my @tmp=split /\n/,$line; 532 | if(length($tmp[1])>50){# if the input read length exceeds 50 bp (e.g. reads from a PE150 platform), truncate the sequence to the first 50 bp for the following analysis 533 | $tmp[1]=substr($tmp[1],0,50); 534 | $tmp[3]=substr($tmp[3],0,50); 535 | } 536 | for my $i(0..$#site){ 537 | if($tmp[1]=~s/^(\S*?)($site[$i])\S*/$2/){# keep only the first recognition-site match (non-greedy) 538 | $enzyme_reads_num++;# reads containing the recognition site 539 | my $begin=length($1); 540 | my $len=length($2); 541 | $tmp[3]=substr($tmp[3],$begin,$len);# quality score 542 | my $sub_line=join("\n",@tmp[0..3]); 543 | if($q_control eq "yes"){# check if QC needed 544 | next unless(&CheckN($sub_line)); 545 | next unless(&CheckQ($sub_line)); 546 | } 547 | $qc_reads_num++;# reads passing QC 548 | if($format eq "fa"){ 549 | $tmp[0]=~s/^@/>/; 550 | print OU "$tmp[0]\n$tmp[1]\n"; 551 | }elsif($format eq "fq"){ 552 | print OU "$sub_line\n"; 553 | } 554 | last; 555 | } 556 | } 557 | } 558 | close IN; 559 | close OU; 560 | $percent_sub=sprintf "%.2f",$qc_reads_num/$input_reads_num*100; 561 | print STAT "$outprefix\t$enzyme\t$input_reads_num\t$enzyme_reads_num\t$qc_reads_num\t$percent_sub%\n"; 562 | close STAT; 563 | } 564 | 565 | sub CheckN{ 566 | my $line=shift; 567 | my @tmp=split /\n/,$line; 568 | my $length=length($tmp[1]); 569 | @tmp=split //,$tmp[1]; 570 | my $count=0; 571 | for my $base(@tmp){ 572 | if($base eq "N"){$count++;}; 573 | } 574 | if($ncount>0 && $ncount<1){# -n given as a fraction of the tag length 575 | if($ncount>= $count/$length){ 576 | return 1; 577 | }else{ 578 | return 0; 579 | } 580 | }elsif($ncount==0 || $ncount>=1){# -n given as an absolute count 581 | if($ncount>=$count){ 582 | return 1; 583 | }else{ 584 | return 0; 585 | } 586 | } 587 | } 588 | 589 | sub CheckQ{ 590 | my $line = shift; 591 | my @array = split /\n/,$line; 592 | @array = split //,$array[3]; 593 | my $count = 0; 594 | foreach my $i( @array ){ 595 | next unless( ord($i) >= $quality + $qbase ); 596 | $count ++; 597 | } 598 | if( $count >= scalar(@array) * $percent / 100 ){ 599 | return 1; 600 | }else{ 601 | return 0; 602 | } 603 | } 604 | 605 | sub Electronic_enzyme{ 606 | my $genome=$input[0]; 607 | my $outprefix=$outprefix[0]; 608 | my $cnt=0; 609 | $/=">";# read the Fasta input record by record 610 | if($genome=~/\.gz$/){ 611 | open IN,"gzip -dc $genome|" or die "cannot open $genome\n"; 612 | }else{ 613 | open IN,"$genome" or die "cannot open $genome\n"; 614 | } 615 | if($gz eq "yes"){ 616 | open OU,"|gzip > $outdir/$outprefix.$enzyme.fa.gz" or die "cannot open $outdir/$outprefix.$enzyme.fa.gz\n"; 617 | }else{ 618 | open OU,">$outdir/$outprefix.$enzyme.fa" or die "cannot open $outdir/$outprefix.$enzyme.fa\n"; 619 | } 620 | open STAT,">$outdir/$outprefix.$enzyme.stat.xls" or die "cannot open $outdir/$outprefix.$enzyme.stat.xls\n"; 621 | print STAT "sample\tenzyme\tinput_reads_num\tenzyme_reads_num\tpercent\n"; 622 | my($input_reads_num,$enzyme_reads_num,$percent_sub); 623 | $enzyme_reads_num=0; 624 | <IN>;# discard the leading empty record before the first ">" 625 | while(<IN>){ 626 | chomp; 627 | $input_reads_num++; 628 | my @tmp=split /\n/; 629 | my $id=(split /\s+/,$tmp[0])[0]; 630 | my $seq=join("",@tmp[1..$#tmp]); 631 | $seq=uc($seq); # convert lowercase bases to uppercase 632 | my %hash;my %hash_tag; 633 | for my $i(0..$#site){ # iterate over all recognition-site patterns 634 | while($seq=~/($site[$i])/g){ # digital digestion 635 | my $tag=$1; 636 | my $len=length($tag); 637 | my $pos=pos($seq); 638 | $pos=$pos-$len+1; 639 | my $pos_end=$pos+$len-1; 640 | pos($seq)=$pos; # step the match position back so overlapping sites are not missed 641 | $hash{$pos}{$pos_end}="$id-$pos-$pos_end"; 642 | $hash_tag{$pos}{$pos_end}=$tag; 643 | } 644 | }
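# The loop below writes one FASTA record per tag; headers follow the pattern
# >sequenceID-start-end-counter, e.g. ">NC_000913.3-1524-1555-1" (a hypothetical contig ID and
# coordinates, shown only to illustrate the format).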
645 | # sort the 2b-RAD tags by genome position before output 646 | for my $pos(sort {$a<=>$b} keys %hash){ 647 | for my $pos_end(sort {$a<=>$b} keys %{$hash{$pos}}){ 648 | $cnt++; 649 | $enzyme_reads_num++; 650 | print OU ">$hash{$pos}{$pos_end}-$cnt\n"; # header: sequence ID-start position-end position-nth tag 651 | print OU "$hash_tag{$pos}{$pos_end}\n"; 652 | } 653 | } 654 | undef %hash; 655 | undef %hash_tag; 656 | } 657 | close IN; 658 | close OU; 659 | $percent_sub=sprintf "%.2f",$enzyme_reads_num/$input_reads_num*100; 660 | print STAT "$outprefix\t$enzyme\t$input_reads_num\t$enzyme_reads_num\t$percent_sub%\n"; 661 | close STAT; 662 | $/="\n";# restore the default input record separator 663 | } 664 | 665 | 666 | sub execute{# print the command and then execute it 667 | my $cmd = shift; 668 | print "$cmd\n"; 669 | my $exit_code=system($cmd); 670 | if($exit_code!=0){ 671 | print STDERR "Command $cmd failed with an exit code of $exit_code.\n"; 672 | exit($exit_code >> 8); 673 | } 674 | } 675 | sub CheckDir{ 676 | my $file = shift; 677 | unless( -d $file ){ 678 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 679 | else{print STDERR "$file does not exist and cannot be created\n";exit 1;} 680 | } 681 | return 1; 682 | } 683 | --------------------------------------------------------------------------------