├── 2bRAD-M_workflow.png
├── .gitignore
├── tools
│   ├── 2bRAD-M-20201225-conda.yml
│   ├── Download_2bRADTagDB_NCBI.pl
│   └── Download_2bRADTagDB_GTDB.pl
├── manual
├── LICENSE
├── scripts
│   ├── FindGenome_ByQualitative.pl
│   ├── MergeProfilesFromMultipleSamples.pl
│   ├── CalculateRelativeAbundance_Combined2bEnzymes.pl
│   ├── CalculateRelativeAbundance_Single2bEnzyme.pl
│   ├── CreateQuanDatabase_2bRAD.pl
│   ├── CreateQualDatabase_2bRAD.pl
│   └── 2bRADExtraction.pl
├── README.md
└── bin
    └── 2bRADM_Pipline.pl

/2bRAD-M_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shihuang047/2bRAD-M/HEAD/2bRAD-M_workflow.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rhistory
2 | *.RData
3 | .ipynb_checkpoints/
4 | # LibreOffice lock files
5 | .~lock*
6 | # Apple-OS-styled files
7 | *.DS_Store
8 | *.nc
9 | 
--------------------------------------------------------------------------------
/tools/2bRAD-M-20201225-conda.yml:
--------------------------------------------------------------------------------
1 | channels:
2 |   - conda-forge
3 |   - bioconda
4 |   - defaults
5 | dependencies:
6 |   - perl=5.26.2
7 |   - perl-parallel-forkmanager=2.02
8 |   - pear=0.9.6
9 | 
--------------------------------------------------------------------------------
/manual:
--------------------------------------------------------------------------------
1 | 1. Install the software
2 | conda env create -n 2bRAD-M-20201225 --file tools/2bRAD-M-20201225-conda.yml
3 | conda activate 2bRAD-M-20201225
4 | 
5 | 2. Download the 2b-RAD reference databases and example data
6 | For the NCBI database: perl tools/Download_2bRADTagDB_NCBI.pl your_database_path (default: ./2B-RAD-M-ref_db_NCBI/)
7 | For the GTDB database: perl tools/Download_2bRADTagDB_GTDB.pl your_database_path (default: ./2B-RAD-M-ref_db_GTDB/)
8 | 
9 | 3. Test the pipeline with the sample data
10 | 1) simulate_50:
11 | perl bin/2bRADM_Pipline.pl -t 3 -l your_database_path/list_simulation -d your_database_path -o outdir -gsc 60 -qc no
12 | 2) MSA1002_R1:
13 | perl bin/2bRADM_Pipline.pl -t 3 -l your_database_path/list_mock -d your_database_path -o outdir
14 | 
15 | Note:
16 | You need to activate the conda environment each time you use the pipeline (conda activate 2bRAD-M-20201225).
17 | 
18 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The 2bRAD-M software is licensed under the MIT license.
2 | 
3 | Copyright (c) 2021 QIBEBT, CAS
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/Download_2bRADTagDB_NCBI.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #Author:zhangrongchao, zhangrongchaoxx@163.com 3 | use strict; 4 | use warnings; 5 | use File::Basename qw(dirname basename); 6 | use Cwd 'abs_path'; 7 | 8 | $ARGV[0] ||="2B-RAD-M-ref_db_NCBI"; 9 | 10 | #if($#ARGV!=0){ 11 | # print STDERR "perl $0 outdir\n"; 12 | # exit 1; 13 | #} 14 | 15 | my $outdir=$ARGV[0];#下载目录 16 | 17 | $outdir=abs_path($outdir); 18 | &CheckDir("$outdir"); 19 | 20 | 21 | my @a=('abfh_classify','MSA1002','simulate_50');#分类表,实际数据,模拟数据 22 | #my @b=('BcgI.species');#需要下载的库文件 23 | my @b=('BcgI.species','CjePI.species');#需要下载的库文件 24 | 25 | my %hash_path=( 26 | 'abfh_classify'=>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25889157/abfh_classify_with_speciename.txt.gz',], 27 | 28 | 'MSA1002' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25623566/MSA1002_R1.fq.gz',], 29 | 30 | # 'simulate_50' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25621832/simulate_50.fa.gz',], 31 | 'simulate_50' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25915428/simulate_50.BcgI.fq.gz',], 32 | 33 | 'BcgI.species' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25889544/BcgI.species.fa.gz0', 34 | 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25889658/BcgI.species.fa.gz1',], 35 | 36 | 'CjePI.species'=>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25891653/CjePI.species.fa.gz0', 37 | 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25890987/CjePI.species.fa.gz1', 38 | 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25890996/CjePI.species.fa.gz2', 39 | 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25891002/CjePI.species.fa.gz3',], 40 | ); 41 | 42 | my %hash_md5=( 43 | 'abfh_classify'=>['25f3a20babb56fd9f2a61eeddb82151a',], 44 | 45 | 'MSA1002' =>['bc2b189213975f6d6c0833a4ba726239',], 46 | 47 | # 'simulate_50' =>['9defe990462d3fef8eb69a2c359d72da',], 48 | 'simulate_50' =>['04cafca5b5c23c48774e9d515dde42a8',], 49 | 50 | 'BcgI.species' =>['b36cc8e85fb68f1b3cc5301c49cafe98', 51 | '071b711730ce87e6c1f85f29319a5979',], 52 | 53 | 'CjePI.species'=>['a32c1998d0d800fe336d9f03756b8409', 54 | '1eb528474f89a6550f69c160d0885dd8', 55 | 'b803ea6b0e2bca1c6381b2a15a76876d', 56 | 'a3d5f018fb3410b507759f2eabee4d04',] 57 | ); 58 | 59 | #合并后文件md5 60 | my %complete_md5=( 61 | 'BcgI.species' =>'75171aabcb754e827e5824ae755d06af', 62 | 'CjePI.species'=>'bcfdef3722dfc763e09fd185f580198d', 63 | ); 64 | 65 | #download abfh_classify && MSA1002 && simulate_50 66 | for my $i(@a){ 67 | my $name=(split /\//,$hash_path{$i}[0])[-1]; 68 | my $file_md5;#下载的文件的MD5值 69 | while(1){ 70 | if(-e "$outdir/$name"){ 71 | chomp($file_md5=`md5sum $outdir/$name`); 72 | $file_md5=(split /\s+/,$file_md5)[0]; 73 | } 74 | if(-e "$outdir/$name" && $file_md5 eq $hash_md5{$i}[0]){ 75 | print STDOUT "File $name has been downloaded.\n"; 76 | last; 77 | }else{ 78 | `wget -t 0 -O $outdir/$name $hash_path{$i}[0]`; 79 | } 80 | } 81 | } 82 | #example list 83 | open OU,">$outdir/list_mock" or die "cannot open $outdir/list_mock\n"; 84 | print OU "MSA1002\t$outdir/MSA1002_R1.fq.gz\n"; 
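# The two list files written here (list_mock and list_simulation) are the sample lists that
# the manual passes to bin/2bRADM_Pipline.pl via -l. Each non-comment line holds two
# tab-separated columns, sample_name<TAB>path_to_reads (fa or fq, optionally gzipped), as
# also noted in the usage text of the scripts further down. The print statement above, for
# example, yields a line of the form (the directory prefix depends on where the database
# was downloaded):
#   MSA1002<TAB>your_database_path/MSA1002_R1.fq.gz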
85 | close OU; 86 | 87 | open OU,">$outdir/list_simulation" or die "cannot open $outdir/list_simulation\n"; 88 | #print OU "simulate_50\t$outdir/simulate_50.fa.gz\n"; 89 | print OU "simulate_50\t$outdir/simulate_50.BcgI.fq.gz\n"; 90 | close OU; 91 | 92 | 93 | 94 | #下载数据库文件 95 | for my $i(@b){ 96 | my $cat=""; 97 | while(1){ 98 | my $md5; 99 | if(-e "$outdir/$i.fa.gz"){#存在完成文件 100 | chomp($md5=`md5sum $outdir/$i.fa.gz`); 101 | $md5=(split /\s+/,$md5)[0]; 102 | } 103 | if(-e "$outdir/$i.fa.gz" && $md5 eq $complete_md5{$i}){ 104 | print STDOUT "File $i.fa.gz hash been downloaded.\n"; 105 | `rm -rf $cat`; 106 | last; 107 | }else{ 108 | for my $j(0..$#{$hash_path{$i}}){#循环每个文件 109 | my $name=(split /\//,$hash_path{$i}[$j])[-1]; 110 | my $file_md5;#下载的文件的MD5值 111 | while(1){ 112 | if(-e "$outdir/$name"){ 113 | chomp($file_md5=`md5sum $outdir/$name`); 114 | $file_md5=(split /\s+/,$file_md5)[0]; 115 | } 116 | if(-e "$outdir/$name" && $file_md5 eq $hash_md5{$i}[$j]){ 117 | print STDOUT "File $name has been downloaded.\n"; 118 | $cat .=" $outdir/$name"; 119 | last; 120 | }else{ 121 | `wget -t 0 -O $outdir/$name $hash_path{$i}[$j]`; 122 | } 123 | } 124 | } 125 | `cat $cat > $outdir/$i.fa.gz`; 126 | } 127 | } 128 | } 129 | 130 | print STDOUT "Congratulations! All databases have been downloaded.\n"; 131 | 132 | sub CheckDir{ 133 | my $file = shift; 134 | unless( -d $file ){ 135 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 136 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 137 | } 138 | return 1; 139 | } 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /scripts/FindGenome_ByQualitative.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | use Getopt::Long; 5 | use FindBin qw($Bin); 6 | use File::Basename qw(dirname basename); 7 | 8 | my $author="Zhangrongchao"; 9 | my $time="20201222"; 10 | 11 | my $g_score_threshold ||=5;#对定性的合并结果,进行分类筛选 gscore阈值 12 | my $GCF_threshold ||=1;#鉴定到某个基因组几个标签以上,该基因组才会被纳入定量建库 13 | 14 | 15 | my($list,$database,$outdir,$qual_dir,$help); 16 | GetOptions( 17 | "l:s" => \$list, #待处理样品列表 18 | "d:s" => \$database, #数据库目录 19 | "o:s" => \$outdir, #输出目录 20 | "qualdir:s" => \$qual_dir, #所有样品定性结果总目录 21 | 22 | "gscore:i" => \$g_score_threshold,#筛选分类 23 | "gcf:i" => \$GCF_threshold,#筛选分类中的基因组 24 | 25 | "h|help:s" => \$help, 26 | ); 27 | 28 | sub usage{#帮助 29 | print STDERR "\e[;33;1m 30 | DESCRIPTION 31 | 2b微生物根据定性结果筛选定量基因组生成建库所需list 32 | USAGE 33 | perl $0 34 | PARAMETERS 35 | -l sample list (the line which begins with # will be ignored) 36 | eg: sample(......) 37 | -d database path 38 | -o outdir (if not exists,it will be created) 39 | -qualdir dir of qualitative 40 | OPTIONS 41 | -gscore G score threshold of classify in qualitative analysis, it decides quantitative database. [$g_score_threshold, it means >$g_score_threshold] 42 | -gcf detected tag threshold of GCF in qualitative analysis, it decides quantitative database. 
[$GCF_threshold, it means >$GCF_threshold] 43 | -h|help print help 44 | AUTHOR: $author $time\e[0m\n"; 45 | } 46 | 47 | 48 | if(defined($help)){ 49 | &usage; 50 | exit 0; 51 | } 52 | 53 | #参数检测 54 | unless($list && $database && $outdir && $qual_dir){ 55 | &usage; 56 | print STDERR "para -l -d -o or -qualdir error.\n"; 57 | exit 1; 58 | } 59 | 60 | #数据库文件检测 61 | unless(-e "$database/abfh_classify_with_speciename.txt.gz"){ 62 | print STDERR "incomplete database, $database/abfh_classify_with_speciename.txt.gz does not exists.\n"; 63 | exit 1; 64 | } 65 | 66 | #记录数据库中gcf转化为全部信息 67 | my %gcf2classify_path; 68 | open DB,"gzip -dc $database/abfh_classify_with_speciename.txt.gz|" or die "cannot open $database/abfh_classify_with_speciename.txt.gz\n"; 69 | while(){ 70 | next if(/^#/ || /^$/); 71 | chomp; 72 | my @tmp=split /\t/; 73 | $gcf2classify_path{$tmp[0]}=$_; 74 | } 75 | close DB; 76 | 77 | 78 | &CheckDir("$outdir"); 79 | 80 | open IN,"$list" or die "cannot open $list\n"; 81 | while(){ 82 | next if(/^#/ || /^$/);#跳过注释行和空行 83 | chomp; 84 | my $sample_name=(split /\t/)[0]; 85 | #样品定量列表 86 | ##单样品重建库list准备 87 | ##记录通过G_score阈值的分类 88 | my (%hs_pass_Gscore_class,@enzyme_use); 89 | if(-e "$qual_dir/$sample_name/$sample_name.combine.xls"){ 90 | open XI,"$qual_dir/$sample_name/$sample_name.combine.xls" or die "cannot open $qual_dir/$sample_name/$sample_name.combine.xls\n"; 91 | }else{ 92 | print STDERR "!!!$sample_name does not have $qual_dir/$sample_name/$sample_name.combine.xls, can't do quantitative analysis\n"; 93 | next; 94 | } 95 | while(){ 96 | chomp; 97 | my @tmp=split /\t/; 98 | next if(/^#Kingdom/i);#跳过表头 99 | if(/^#/){#记录 合并使用的酶 组合 100 | my @a=split /\s+/,$tmp[0]; 101 | for my $enzyme(@a){ 102 | $enzyme=~s/^#//; 103 | next if($enzyme eq "combine");#跳过 combine字段 104 | push @enzyme_use,$enzyme; 105 | unless(-e "$database/$enzyme.species.fa.gz"){ 106 | print STDERR "incomplete database, $database/abfh_classify_with_speciename.txt.gz does not exists.\n"; 107 | exit 1; 108 | } 109 | } 110 | next; 111 | } 112 | my $class=join("\t",@tmp[0..$#tmp-8]);#获取分类信息 113 | $hs_pass_Gscore_class{$class}++ if($tmp[-1]>$g_score_threshold);#通过gscore阈值的分类 114 | } 115 | close XI; 116 | &CheckDir("$outdir/$sample_name");#建立每个样品的文件夹 117 | open OU,"|sort|uniq > $outdir/$sample_name/sdb.list" or die "cannot open $outdir/$sample_name/sdb.list\n"; #输出选出的基因组列表,并排序去重 118 | for my $enzyme(@enzyme_use){ 119 | if(-e "$qual_dir/$sample_name/$sample_name.$enzyme.GCF_detected.xls"){ 120 | open QU,"$qual_dir/$sample_name/$sample_name.$enzyme.GCF_detected.xls" or die "cannot open $qual_dir/$sample_name/$sample_name.$enzyme.GCF_detected.xls\n"; 121 | }else{ 122 | print STDERR "warning: $sample_name does not have $qual_dir/$sample_name/$sample_name.$enzyme.GCF_detected.xls\n"; 123 | next; 124 | } 125 | while(){ 126 | chomp; 127 | my @tmp=split /\t/; 128 | my $class=join("\t",@tmp[0..$#tmp-4]); 129 | if(exists $hs_pass_Gscore_class{$class} && $tmp[-2]>$GCF_threshold){ 130 | print OU "$gcf2classify_path{$tmp[-4]}\n"; 131 | # my @all_class=split /\t/,$gcf2classify_path{$tmp[-4]}; 132 | # print OU join("\t",@all_class[0..8]),"\t$database/genome_ref/$all_class[-1]\n"; 133 | } 134 | } 135 | close QU; 136 | } 137 | close OU; 138 | 139 | } 140 | close IN; 141 | 142 | 143 | sub CheckDir{# create the directory 144 | my $file = shift; 145 | unless( -d $file ){ 146 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 147 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 148 | } 149 | return 1; 150 | } 151 
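# Summary of the selection rule implemented above: a taxon passes the qualitative screen
# when its G score in the sample's *.combine.xls exceeds -gscore, and a genome (GCF) of a
# passing taxon is written to sdb.list for the quantitative (re-profiling) database when
# its detected-tag count in *.GCF_detected.xls exceeds -gcf. A minimal stand-alone sketch
# of that predicate, using hypothetical names (the real thresholds and values come from
# the files parsed above):
sub _keep_genome_sketch {
    my ($taxon_g_score, $gcf_detected_tag_num, $gscore_cutoff, $gcf_cutoff) = @_;
    return ($taxon_g_score > $gscore_cutoff) && ($gcf_detected_tag_num > $gcf_cutoff);
}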
| 152 | 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /scripts/MergeProfilesFromMultipleSamples.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | use Getopt::Long; 5 | use Cwd 'abs_path'; 6 | 7 | my ($list,$outdir,$prefix,$mock,$control,$help); 8 | GetOptions( 9 | "l:s" => \$list, 10 | "o:s" => \$outdir, 11 | "p:s" => \$prefix, 12 | 13 | "m:s" => \$mock, 14 | "c:s" => \$control, 15 | 16 | "h|help:s" => \$help, 17 | ); 18 | sub usage{ 19 | print STDERR "\e[;33;1m 20 | DESCRIPTION 21 | Merge the abundance profiles from mulitple samples. If negative control samples were provided, 22 | this script can also filter all taxa in negative control samples (i.e., potential contaminations) for each biological sample. 23 | USAGE 24 | perl $0 25 | 26 | PARAMETERS 27 | -l A list file indicating the sample_id and corresponding output files from the last step. 28 | e.g., sample_idqualitative/sample_id/sample_id.combine.xls 29 | -o The output directory 30 | -p The output prefix 31 | OPTIONS 32 | -m The mock-community sample name(s) (separated by commas). 33 | -c The sample name(s) (separated by commas) of negative control that can be used for filtering potential contaminations. 34 | 35 | -h|help help 36 | AUTHOR: ZRC 2020.09.14 37 | \e[0m\n"; 38 | } 39 | 40 | if(defined($help)){ 41 | &usage; 42 | exit 0; 43 | } 44 | 45 | unless($list && $outdir && $prefix){ 46 | &usage; 47 | print STDERR "para -l -o or -p error.\n"; 48 | exit 1; 49 | } 50 | 51 | &CheckDir("$outdir"); 52 | #记录mock样品 53 | my (%hash_mock,%hash_control); 54 | if(defined($mock)){ 55 | my @mock=split /,/,$mock; 56 | for(@mock){ 57 | $hash_mock{$_}++; 58 | } 59 | } 60 | #记录control样品 61 | if(defined($control)){ 62 | my @control=split /,/,$control; 63 | for(@control){ 64 | $hash_control{$_}++; 65 | } 66 | } 67 | 68 | #读取定性/定量计算结果文件 69 | my (%hash_specie,%hash_all,@sample_sort,$classify_col,$head); 70 | #循环样品 71 | open LI,"$list" or die "cannot open $list\n"; 72 | while(
  • ){ 73 | next if (/^#/ || /^$/);#去除注释行和空行 74 | chomp; 75 | my ($sample,$path)=split /\t/; 76 | $path=abs_path($path); 77 | unless(-e $path){ 78 | print STDERR "warning: $sample $path not exist, cannot be calculate Abundance.\n"; 79 | next; 80 | } 81 | push @sample_sort,$sample;#记录样品顺序 82 | open IN,"$path" or die "cannot open $path\n"; 83 | while(){ 84 | chomp; 85 | my @tmp=split /\t/; 86 | if(/^#Kingdom/i){ 87 | for my $i(0..$#tmp){#确定分类列 88 | if($tmp[$i] eq "Theoretical_Tag_Num"){ 89 | $classify_col=$i-1; 90 | $head=join("\t",@tmp[0..$classify_col]); 91 | last; 92 | } 93 | } 94 | } 95 | next if(/^#/);#跳过注释行 96 | my $id=join("\t",@tmp[0..$classify_col]); 97 | $hash_specie{$id}{$sample}=$tmp[-4];#记录Sequenced_Reads_Num/Theoretical_Tag_Num值 98 | $hash_all{$sample}+=$tmp[-4];#记录总数 99 | } 100 | close IN; 101 | } 102 | close LI; 103 | 104 | #输出所有样品丰度计算结果 105 | open OU,">$outdir/$prefix.all.xls" or die "cannot open $outdir/$prefix.all.xls\n"; 106 | print OU "$head\t",join("\t",@sample_sort),"\n";#表头 107 | for my $id(sort {$a cmp $b} keys %hash_specie){#循环检测到的物种 108 | my $judge=0; 109 | my $print=$id; 110 | for my $sample(@sample_sort){#循环样品 111 | if(exists $hash_specie{$id}{$sample}){ 112 | my $percent=$hash_specie{$id}{$sample}/$hash_all{$sample}; 113 | if($percent==0){ 114 | $print .="\t0"; 115 | }else{ 116 | $print .="\t$percent"; 117 | $judge++; 118 | } 119 | }else{ 120 | $print .="\t0"; 121 | } 122 | } 123 | print OU "$print\n" if ($judge!=0); 124 | } 125 | close OU; 126 | 127 | #输出 删除mock和阴性对照样品,以及阴性对照检测出来的菌 的结果 128 | open OU,">$outdir/$prefix.filtered.xls" or die "cannot open $outdir/$prefix.filtered.xls\n"; 129 | #表头处理 130 | print OU "$head"; 131 | for(@sample_sort){ 132 | next if(exists $hash_mock{$_} || exists $hash_control{$_});#过滤掉mock和阴性对照样品 133 | print OU "\t$_"; 134 | } 135 | print OU "\n"; 136 | for my $id(sort {$a cmp $b} keys %hash_specie){#循环检测到的物种 137 | my $judge=0;#整行判断 138 | my $judge_control=0;#阴性对照判断 139 | my $print=$id; 140 | for my $sample(@sample_sort){ 141 | next if(exists $hash_mock{$sample});#过滤掉mock样品 142 | next if(exists $hash_control{$sample});#过滤掉阴性对照样品 143 | # if(exists $hash_control{$sample}){#阴性对照样品处理 144 | # $judge_control++ if(exists $hash_specie{$id}{$sample} && $hash_specie{$id}{$sample}!=0); 145 | # next; 146 | # } 147 | if(exists $hash_specie{$id}{$sample}){ 148 | my $percent=0; 149 | $percent=$hash_specie{$id}{$sample}/$hash_all{$sample} if($hash_all{$sample}!=0); 150 | if($percent==0){ 151 | $print .="\t0"; 152 | }else{ 153 | $print .="\t$percent"; 154 | $judge++; 155 | } 156 | }else{ 157 | $print .="\t0"; 158 | } 159 | } 160 | print OU "$print\n" if ($judge!=0 && $judge_control==0); 161 | } 162 | 163 | sub CheckDir{#创建目录 164 | my $file = shift; 165 | unless( -d $file ){ 166 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 167 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 168 | } 169 | return 1; 170 | } 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /tools/Download_2bRADTagDB_GTDB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #Author:zhangrongchao, zhangrongchaoxx@163.com 3 | use strict; 4 | use warnings; 5 | use File::Basename qw(dirname basename); 6 | use Cwd 'abs_path'; 7 | 8 | $ARGV[0] ||="2B-RAD-M-ref_db_GTDB"; 9 | 10 | #if($#ARGV!=0){ 11 | # 
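# Note: like tools/Download_2bRADTagDB_NCBI.pl above, this GTDB-based downloader fetches
# each tag database in several MD5-verified chunks (BcgI.species.fa.gz0, .gz1, ...),
# re-downloads any chunk whose checksum does not match, concatenates the chunks into
# <enzyme>.species.fa.gz, and finally checks the MD5 of the merged file; only the figshare
# URLs and checksums differ from the NCBI version.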
print STDERR "perl $0 outdir\n"; 12 | # exit 1; 13 | #} 14 | 15 | my $outdir=$ARGV[0];#下载目录 16 | 17 | $outdir=abs_path($outdir); 18 | &CheckDir("$outdir"); 19 | 20 | 21 | my @a=('abfh_classify','MSA1002','simulate_50');#分类表,实际数据,模拟数据 22 | #my @b=('BcgI.species');#需要下载的库文件 23 | my @b=('BcgI.species','CjePI.species');#需要下载的库文件 24 | 25 | my %hash_path=( 26 | 'abfh_classify'=>['https://figshare.com/ndownloader/files/31653170/abfh_classify_with_speciename.txt.gz',], 27 | 28 | 'MSA1002' =>['https://figshare.com/ndownloader/files/25623566/MSA1002_R1.fq.gz',], 29 | 30 | # 'simulate_50' =>['https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/25621832/simulate_50.fa.gz',], 31 | 'simulate_50' =>['https://figshare.com/ndownloader/files/25915428/simulate_50.BcgI.fq.gz',], 32 | 33 | 'BcgI.species' =>['https://figshare.com/ndownloader/files/31653911/BcgI.species.fa.gz0', 34 | 'https://figshare.com/ndownloader/files/31659299/BcgI.species.fa.gz1', 35 | 'https://figshare.com/ndownloader/files/31653614/BcgI.species.fa.gz2',], 36 | 37 | 'CjePI.species'=>['https://figshare.com/ndownloader/files/31660241/CjePI.species.fa.gz0', 38 | 'https://figshare.com/ndownloader/files/31660358/CjePI.species.fa.gz1', 39 | 'https://figshare.com/ndownloader/files/31662320/CjePI.species.fa.gz2', 40 | 'https://figshare.com/ndownloader/files/31662794/CjePI.species.fa.gz3', 41 | 'https://figshare.com/ndownloader/files/31659818/CjePI.species.fa.gz4',], 42 | ); 43 | 44 | my %hash_md5=( 45 | 'abfh_classify'=>['c2faa9ae97b704b3d0705709cf22ecb4',], 46 | 47 | 'MSA1002' =>['bc2b189213975f6d6c0833a4ba726239',], 48 | 49 | # 'simulate_50' =>['9defe990462d3fef8eb69a2c359d72da',], 50 | 'simulate_50' =>['04cafca5b5c23c48774e9d515dde42a8',], 51 | 52 | 'BcgI.species' =>['a1b70d0de71093a0bb9bedbadab641b0', 53 | '383fd8c85a23aee4a48d48aa41845f17', 54 | 'd19a5ce115fac8708fb0919f619ddf19',], 55 | 56 | 'CjePI.species'=>['8b1c62c80bdf3b05182f2fe47d0f0751', 57 | '4662c85ef0e12a749d8b9284302e2a18', 58 | 'ed3d3a27df05b7c0eb97140f78f54a75', 59 | '063b3c362f41889037b3bb15d8a0617f', 60 | '021a06a6e926b4ba91acba0c398877d7',] 61 | ); 62 | 63 | #合并后文件md5 64 | my %complete_md5=( 65 | 'BcgI.species' =>'eea6b5ec34b00a749d45199a91fd3e34', 66 | 'CjePI.species'=>'3d9913da22ac340357d4e708a7506de8', 67 | ); 68 | 69 | #download abfh_classify && MSA1002 && simulate_50 70 | for my $i(@a){ 71 | my @tmp=split /\//,$hash_path{$i}[0]; 72 | my $url=join("/",@tmp[0..$#tmp-1]); 73 | my $name=$tmp[-1]; 74 | my $file_md5;#下载的文件的MD5值 75 | while(1){ 76 | if(-e "$outdir/$name"){ 77 | chomp($file_md5=`md5sum $outdir/$name`); 78 | $file_md5=(split /\s+/,$file_md5)[0]; 79 | } 80 | if(-e "$outdir/$name" && $file_md5 eq $hash_md5{$i}[0]){ 81 | print STDOUT "File $name has been downloaded.\n"; 82 | last; 83 | }else{ 84 | `wget -t 0 -O $outdir/$name $url`; 85 | } 86 | } 87 | } 88 | #example list 89 | open OU,">$outdir/list_mock" or die "cannot open $outdir/list_mock\n"; 90 | print OU "MSA1002\t$outdir/MSA1002_R1.fq.gz\n"; 91 | close OU; 92 | 93 | open OU,">$outdir/list_simulation" or die "cannot open $outdir/list_simulation\n"; 94 | #print OU "simulate_50\t$outdir/simulate_50.fa.gz\n"; 95 | print OU "simulate_50\t$outdir/simulate_50.BcgI.fq.gz\n"; 96 | close OU; 97 | 98 | 99 | 100 | #下载数据库文件 101 | for my $i(@b){ 102 | my $cat=""; 103 | while(1){ 104 | my $md5; 105 | if(-e "$outdir/$i.fa.gz"){#存在完成文件 106 | chomp($md5=`md5sum $outdir/$i.fa.gz`); 107 | $md5=(split /\s+/,$md5)[0]; 108 | } 109 | if(-e "$outdir/$i.fa.gz" && $md5 eq $complete_md5{$i}){ 110 | print STDOUT "File $i.fa.gz hash been 
downloaded.\n"; 111 | `rm -rf $cat`; 112 | last; 113 | }else{ 114 | for my $j(0..$#{$hash_path{$i}}){#循环每个文件 115 | my @tmp=split /\//,$hash_path{$i}[$j]; 116 | my $url=join("/",@tmp[0..$#tmp-1]); 117 | my $name=$tmp[-1]; 118 | my $file_md5;#下载的文件的MD5值 119 | while(1){ 120 | if(-e "$outdir/$name"){ 121 | chomp($file_md5=`md5sum $outdir/$name`); 122 | $file_md5=(split /\s+/,$file_md5)[0]; 123 | } 124 | if(-e "$outdir/$name" && $file_md5 eq $hash_md5{$i}[$j]){ 125 | print STDOUT "File $name has been downloaded.\n"; 126 | $cat .=" $outdir/$name"; 127 | last; 128 | }else{ 129 | `wget -t 0 -O $outdir/$name $url`; 130 | } 131 | } 132 | } 133 | `cat $cat > $outdir/$i.fa.gz`; 134 | } 135 | } 136 | } 137 | 138 | print STDOUT "Congratulations! All databases have been downloaded.\n"; 139 | 140 | sub CheckDir{ 141 | my $file = shift; 142 | unless( -d $file ){ 143 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 144 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 145 | } 146 | return 1; 147 | } 148 | 149 | 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /scripts/CalculateRelativeAbundance_Combined2bEnzymes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | use Getopt::Long; 5 | use FindBin qw($Bin); 6 | use File::Basename qw(dirname basename); 7 | 8 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 9 | my $time="2020.06.03"; 10 | 11 | #默认值 12 | my $mark ||="combine"; 13 | my $g_score_threshold ||=0; 14 | 15 | #Standard output for clearing cache 16 | select STDOUT;$|=1; 17 | 18 | my ($list,$site,$outdir,$help); 19 | GetOptions( 20 | "l:s" => \$list, 21 | "s:s" => \$site, 22 | "io:s" => \$outdir, 23 | 24 | "m:s" => \$mark, 25 | "g:i" => \$g_score_threshold, 26 | "h|help:s" => \$help, 27 | ); 28 | 29 | 30 | sub usage{# helper information 31 | print STDERR "\e[;33;1m 32 | DESCRIPTION 33 | It computes the relative abundance of taxa identified from each of 2b-RAD samples using a precalcuated taxa-specific 2b-RAD reference database by one or multiple type 2b restriction enzymes. 34 | USAGE 35 | perl $0 36 | PARAMETERS 37 | -l The path of the input filepath list (the line which begins with # will be ignored) 38 | eg: sample_name... 39 | -s One or multiple type 2b restriction enzymes (sites). The selected sites should be separated by comma. 40 | [1]CspCI [9]BplI 41 | [2]AloI [10]FalI 42 | [3]BsaXI [11]Bsp24I 43 | [4]BaeI [12]HaeIV 44 | [5]BcgI [13]CjePI 45 | [6]CjeI [14]Hin4I 46 | [7]PpiI [15]AlfI 47 | [8]PsrI [16]BslFI 48 | [17]All_Detected_Enzyme 49 | -io The input and output directory 50 | OPTIONS 51 | -m Whether the taxa idenfication or abundance estimation should take into account for the 2b-RAD taxa-specific markers from more than one restriction sites [combine] 52 | -g The G-score threshold [$g_score_threshold, it means >=$g_score_threshold] To control the false-positive in the species identification, G score was derived for each species identified within a sample, which is a harmonious mean of read coverage of 2b-RAD markers belongs to a species and number of all possible 2b-RAD markers of this species. Therecommended/default threshold is $g_score_threshold. 53 | -h|help print this help. 
54 | AUTHOR: $author $time\e[0m\n"; 55 | } 56 | 57 | if(defined($help)){ 58 | &usage; 59 | exit 0; 60 | } 61 | 62 | my %hs_site2enzyme=(#the codes for all restriction enzymes 63 | '1' => 'CspCI', '2' => 'AloI', 64 | '3' => 'BsaXI', '4' => 'BaeI', 65 | '5' => 'BcgI', '6' => 'CjeI', 66 | '7' => 'PpiI', '8' => 'PsrI', 67 | '9' => 'BplI', '10' => 'FalI', 68 | '11' => 'Bsp24I', '12' => 'HaeIV', 69 | '13' => 'CjePI', '14' => 'Hin4I', 70 | '15' => 'AlfI', '16' => 'BslFI', 71 | ); 72 | 73 | unless($list && $site && $outdir){ 74 | &usage; 75 | exit 1; 76 | } 77 | 78 | # check the availability of the taxa-specific 2b-RAD reference genome database 79 | if($site=~/17/){ 80 | $site="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"; 81 | } 82 | my @site=split /,/,$site; 83 | for $site(@site){ 84 | unless(exists $hs_site2enzyme{$site}){#检测酶切位点是否存在 85 | &usage; 86 | print STDERR "Parameter -s is wrong\n"; 87 | exit 1; 88 | } 89 | } 90 | =pod 91 | #注释文件检测并读取 92 | my %hs_anno; 93 | unless(-e "$database/species_ID_annotation.txt"){ 94 | &usage; 95 | print STDERR "cannot find $database/species_ID_annotation.txt\n"; 96 | exit; 97 | }else{ 98 | open AN,"$database/species_ID_annotation.txt" or die "cannot open $database/species_ID_annotation.txt\n"; 99 | while(){ 100 | next if(/^#/ || /^$/);#去除注释行和空行 101 | chomp; 102 | my @tmp=split /\t/; 103 | $hs_anno{$tmp[2]}="$tmp[1]\t$tmp[3]"; 104 | } 105 | close AN; 106 | } 107 | =cut 108 | 109 | #合并处理 110 | open LI,"$list" or die "cannot open $list\n"; 111 | while(
  • ){#循环样品 112 | next if(/^#/ || /^$/);#去除注释行和空行 113 | chomp; 114 | my (%hs_sample,$head); 115 | my @use_site;#用到的酶 结果 116 | my $sample_name=(split /\t/)[0]; 117 | my $cnt=0; 118 | for $site(@site){# iterate all enzymes 119 | if(-e "$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls"){ 120 | push @use_site,$hs_site2enzyme{$site}; 121 | }else{ 122 | print STDERR "warning: $sample_name cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls\n"; 123 | next;#跳过没有鉴定的酶 文件 124 | } 125 | $cnt++; 126 | open IN,"$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls" or die "cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls"; 127 | while(){ 128 | chomp; 129 | my $line=$_; 130 | next if($line=~/^#/); 131 | if($line=~/^Kingdom/i){ 132 | $head=$line; 133 | next; 134 | } 135 | my @tmp=split /\t/; 136 | my $class=join("\t",@tmp[0..$#tmp-8]); 137 | $hs_sample{$class}{-8}+=$tmp[-8];#理论标签数 138 | $hs_sample{$class}{-7}+=$tmp[-7];#测序得到的标签数 139 | $hs_sample{$class}{-5}+=$tmp[-5];#测序得到的reads数 140 | $hs_sample{$class}{-2}+=$tmp[-2];#测到的深度大于1的标签数 141 | } 142 | close IN; 143 | } 144 | next if($cnt==0);#如果所有的酶都没有结果,那么不继续输出 145 | open OU,">$outdir/$sample_name/$sample_name.$mark.xls" or die "cannot open $outdir/$sample_name/$sample_name.$mark.xls\n"; 146 | print OU "#@use_site combine\n"; 147 | # print OU "#Kingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecie\tTheoretical_Tag_Num\tSequenced_Tag_Num\tPercent\tSequenced_Reads_Num\tSequenced_Reads_Num/Theoretical_Tag_Num\tSequenced_Reads_Num/Sequenced_Tag_Num\tG_Score\ttaxid\tunique_name\n"; 148 | print OU "#$head\n"; 149 | for my $class(keys %hs_sample){ 150 | my @tmp=split /\t/,$class; 151 | my $Theoretical_Tag_Num=$hs_sample{$class}{-8}; 152 | my $Sequenced_Tag_Num=$hs_sample{$class}{-7}; 153 | my $Sequenced_Tag_Num2Theoretical_Tag_Num=sprintf "%.8f",$Sequenced_Tag_Num/$Theoretical_Tag_Num*100; 154 | my $Sequenced_Reads_Num=$hs_sample{$class}{-5}; 155 | my $Sequenced_Reads_Num2Theoretical_Tag_Num=sprintf "%.8f",$Sequenced_Reads_Num/$Theoretical_Tag_Num; 156 | my $Sequenced_Reads_Num2Sequenced_Tag_Num=sprintf "%.8f",$Sequenced_Reads_Num/$Sequenced_Tag_Num; 157 | my $Sequenced_Tag_Num_2=$hs_sample{$class}{-2}; 158 | my $G_Score=sprintf "%.8f",sqrt($Sequenced_Tag_Num*$Sequenced_Reads_Num); 159 | next if ($G_Score<$g_score_threshold);#过滤gscore阈值 160 | print OU "$class\t$Theoretical_Tag_Num\t$Sequenced_Tag_Num\t$Sequenced_Tag_Num2Theoretical_Tag_Num%\t"; 161 | print OU "$Sequenced_Reads_Num\t$Sequenced_Reads_Num2Theoretical_Tag_Num\t$Sequenced_Reads_Num2Sequenced_Tag_Num\t"; 162 | # print OU "$G_Score\t$hs_anno{$tmp[6]}\n"; 163 | print OU "$Sequenced_Tag_Num_2\t$G_Score\n"; 164 | } 165 | close OU; 166 | undef %hs_sample; 167 | } 168 | close IN; 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /scripts/CalculateRelativeAbundance_Single2bEnzyme.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use warnings; 3 | use strict; 4 | use Getopt::Long; 5 | use FindBin qw($Bin); 6 | use File::Basename qw(dirname basename); 7 | use Cwd 'abs_path'; 8 | 9 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 10 | my $time="2020.12.21"; 11 | 12 | #set default parameters 13 | my $g_score_threshold ||=0; 14 | my $verbose ||="yes"; 15 | 16 | select STDOUT;$|=1;#Standard output for clearing cache 17 | 18 | my ($list,$database,$site,$outdir,$level); 19 | GetOptions( 20 | "l:s" => \$list, 21 | 
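# What the "combine" step above does: for every taxon it sums Theoretical_Tag_Num,
# Sequenced_Tag_Num, Sequenced_Reads_Num and the depth>1 tag count across the per-enzyme
# result tables ($sample.$enzyme.xls), then recomputes the ratio columns and the G score
# from those sums and writes $sample.combine.xls. Downstream,
# MergeProfilesFromMultipleSamples.pl turns the Sequenced_Reads_Num/Theoretical_Tag_Num
# column into a relative abundance by dividing each taxon's value by the per-sample sum of
# that column.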
"d:s" => \$database, 22 | "t:s" => \$level, 23 | "s:s" => \$site, 24 | "o:s" => \$outdir, 25 | 26 | "g:i" => \$g_score_threshold, 27 | "v:s" => \$verbose, 28 | ); 29 | 30 | 31 | sub usage{# help information 32 | print STDERR "\e[;33;1m 33 | DESCRIPTION 34 | It computes the relative abundance of taxa identified from each of 2b-RAD samples using a precalcuated taxa-specific 2b-RAD reference database by a single type 2b restriction enzyme. 35 | USAGE 36 | perl $0 37 | Required: 38 | -l The path of the input filepath list (the line that begin with # will be ignored) e.g: sample_namedata_path(fa|fq)(.gz). 39 | -d The database filepath. 40 | -t The taxonomy level of the taxa-specific 2b-RAD database used. It should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. 41 | -s One of the type 2b restriction enzymes (sites). 42 | [1]CspCI [9]BplI 43 | [2]AloI [10]FalI 44 | [3]BsaXI [11]Bsp24I 45 | [4]BaeI [12]HaeIV 46 | [5]BcgI [13]CjePI 47 | [6]CjeI [14]Hin4I 48 | [7]PpiI [15]AlfI 49 | [8]PsrI [16]BslFI 50 | -o The output directory (automatically create if it does not exist) 51 | Optional: 52 | -g The threshold of G score [$g_score_threshold, it means >=$g_score_threshold]. To control the false-positive in the species identification, G score was derived for each speciesidentified within a sample, which is a harmonious mean of read coverage of 2b-RAD markers belongs to a species and number of all possible 2b-RAD markers of this species. Therecommended/default threshold is $g_score_threshold. 53 | -v This specify if more detailed information will be shown [$verbose] (yes or no) 54 | AUTHOR: $author $time\e[0m\n"; 55 | } 56 | 57 | 58 | my %hs_site2enzyme=(# the codes for all restriction enzymes 59 | '1' => 'CspCI', '2' => 'AloI', 60 | '3' => 'BsaXI', '4' => 'BaeI', 61 | '5' => 'BcgI', '6' => 'CjeI', 62 | '7' => 'PpiI', '8' => 'PsrI', 63 | '9' => 'BplI', '10' => 'FalI', 64 | '11' => 'Bsp24I', '12' => 'HaeIV', 65 | '13' => 'CjePI', '14' => 'Hin4I', 66 | '15' => 'AlfI', '16' => 'BslFI', 67 | ); 68 | 69 | my %hs_type_database=( 70 | 'kingdom' => '1', 71 | 'phylum' => '2', 72 | 'class' => '3', 73 | 'order' => '4', 74 | 'family' => '5', 75 | 'genus' => '6', 76 | 'species' => '7', 77 | 'strain' => '8', 78 | ); 79 | 80 | my @HEAD=( 81 | 'Kingdom', 82 | 'Phylum', 83 | 'Class', 84 | 'Order', 85 | 'Family', 86 | 'Genus', 87 | 'Species', 88 | 'Strain', 89 | ); 90 | 91 | unless($list && $database && $level && $site && $outdir){ 92 | &usage; 93 | exit 1; 94 | } 95 | 96 | #转换绝对路径 97 | $list=abs_path($list); 98 | $database=abs_path($database); 99 | $outdir=abs_path($outdir); 100 | 101 | 102 | # parameter checking 103 | unless($verbose eq "yes" || $verbose eq "no"){ 104 | &usage; 105 | print STDERR "Parameter -v is wrong\n"; 106 | exit 1; 107 | } 108 | # check the taxonomic level of a 2b-RAD reference genome database 109 | unless($level eq "kingdom" || $level eq "phylum" || $level eq "class" || $level eq "order" || $level eq "family" || $level eq "genus" || $level eq "species" || $level eq "strain"){ 110 | &usage; 111 | print STDERR "Parameter -t is wrong. 
Cannot get $level\n"; 112 | exit 1; 113 | } 114 | # check the parameter -s and -d 115 | unless(exists $hs_site2enzyme{$site}){ 116 | &usage; 117 | print STDERR "Parameter -s $site is wrong\n"; 118 | exit 1; 119 | } 120 | #检查库文件 121 | unless(-e "$database/$hs_site2enzyme{$site}.$level.fa.gz" && -e "$database/abfh_classify_with_speciename.txt.gz"){ 122 | &usage; 123 | print STDERR "Incomplete database, please check the parameter(-d).\n"; 124 | exit 1; 125 | } 126 | 127 | print STDOUT "COMMAND: perl $0 -l $list -d $database -t $level -s $site -o $outdir -g $g_score_threshold -v $verbose\n"; 128 | 129 | &CheckDir($outdir); 130 | 131 | my $head=join("\t",@HEAD[0..$hs_type_database{$level}-1]); 132 | # load the database 133 | print STDOUT "### Loading the database, $database/$hs_site2enzyme{$site}.$level.fa.gz, ",`date`; 134 | my (%hs_tag2GCF,%hs_GCF2class,%hs_tag_theory_num); 135 | my $all_genome_num; 136 | open LI,"gzip -dc $database/abfh_classify_with_speciename.txt.gz|" or die "cannot open $database/abfh_classify_with_speciename.txt.gz\n"; 137 | while(
  • ){ 138 | next if(/^#/ || /^$/);#去掉注释行和空行 139 | chomp; 140 | my @tmp=split /\t/; 141 | my $class=join("\t",@tmp[1..$hs_type_database{$level}]);# all taxonomic levels 142 | $hs_GCF2class{$tmp[0]}=$class;# record the corresponding taxonomy for each GCF 143 | $all_genome_num++; 144 | } 145 | close LI; 146 | 147 | my (%hash_gcf_rank,%complete); 148 | $/=">"; 149 | open IN,"gzip -dc $database/$hs_site2enzyme{$site}.$level.fa.gz|" or die "cannot open $database/$hs_site2enzyme{$site}.$level.fa.gz\n"; 150 | ; 151 | while(){ 152 | chomp; 153 | my($id,$tag)=split /\n/; 154 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 155 | my @tmp=split /\|/,$id; 156 | $hash_gcf_rank{$tmp[0]}++; 157 | next if($tmp[5]!=1);#跳过非unique标签 158 | my $class=$hs_GCF2class{$tmp[0]};# all taxonomic levels 159 | push @{$hs_tag2GCF{$tag}},$tmp[0];# record GCF for each 2b tag 160 | $hs_tag_theory_num{$class}{$tmp[0]}{$tag}++;# compute the number of 2b tags of each GCF under a given taxon and record the # of all 2b tags from the same taxa 161 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 162 | if((keys %hash_gcf_rank)/$all_genome_num*100>=$i){ 163 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 164 | $complete{$i}++; 165 | last; 166 | } 167 | } 168 | } 169 | close IN; 170 | $/="\n"; 171 | print STDOUT "###Loading database completed, ",`date`; 172 | 173 | # process each sample in the list file 174 | open LI,"$list" or die "cannot open $list\n"; 175 | while(
  • ){ 176 | next if(/^#/ || /^$/); # 去除注释行和空行 177 | chomp; 178 | my ($sample_name,$sample_data)=split /\t/; 179 | $sample_data=abs_path($sample_data);#转为绝对路径 180 | print STDOUT "###($sample_name) Sample identification started, ",`date`; 181 | my (%hs_tag_num,%hs_detected_GCF_tag); 182 | # load a single sample 183 | if($sample_data=~/\.gz$/){ 184 | open IN,"gzip -dc $sample_data|" or die "cannot open $sample_data\n"; 185 | }else{ 186 | open IN,"$sample_data" or die "cannot open $sample_data\n"; 187 | } 188 | while(){ 189 | my $line=$_; 190 | if($line=~/^@/){#fastq 191 | $line .= . . ; 192 | }elsif($line=~/^>/){#fasta 193 | $line .=; 194 | } 195 | my $tag=(split /\n/,$line)[1]; 196 | if(exists $hs_tag2GCF{$tag}){ 197 | my $class=$hs_GCF2class{$hs_tag2GCF{$tag}[0]}; 198 | $hs_tag_num{$class}{$tag}++;#实际样品标签深度 199 | for my $i(0..$#{$hs_tag2GCF{$tag}}){ 200 | $hs_detected_GCF_tag{$class}{$hs_tag2GCF{$tag}[$i]}{$tag}=$hs_tag_theory_num{$class}{$hs_tag2GCF{$tag}[$i]}{$tag}; 201 | } 202 | }else{#反向互补 203 | $tag=~tr/ATCG/TAGC/; 204 | $tag=reverse($tag); 205 | if(exists $hs_tag2GCF{$tag}){ 206 | my $class=$hs_GCF2class{$hs_tag2GCF{$tag}[0]}; 207 | $hs_tag_num{$class}{$tag}++;#实际样品标签深度 208 | for my $i(0..$#{$hs_tag2GCF{$tag}}){ 209 | $hs_detected_GCF_tag{$class}{$hs_tag2GCF{$tag}[$i]}{$tag}=$hs_tag_theory_num{$class}{$hs_tag2GCF{$tag}[$i]}{$tag}; 210 | } 211 | } 212 | } 213 | } 214 | close IN; 215 | 216 | if((keys %hs_tag_num)==0){# go to the next sample if no 2b-RAD tag was detected in a sample 217 | print STDERR "!!!($sample_name) Warning: $hs_site2enzyme{$site} $level the number of 2b-RAD tags for this sample is zero\n"; 218 | print STDOUT "###($sample_name) Sample idenfication completed, ",`date`; 219 | next; 220 | } 221 | &CheckDir("$outdir/$sample_name");# create a filepath for each sample 222 | # compute the number of therotical and actual 2b-RAD tags for each GCF in each sample 223 | open DE,">$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.GCF_detected.xls" or die "cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.GCF_detected.xls\n"; 224 | for my $class(sort {$a cmp $b} keys %hs_detected_GCF_tag){ 225 | for my $GCF(sort {$a cmp $b} keys %{$hs_detected_GCF_tag{$class}}){ 226 | my $GCF_all_theory_num; 227 | my $detected_tag_num; 228 | $GCF_all_theory_num=keys %{$hs_tag_theory_num{$class}{$GCF}};# theoratical, the number of taxa-specific 2b-RAD tags from each GCF 229 | $detected_tag_num=keys %{$hs_detected_GCF_tag{$class}{$GCF}};# in a real sample, the number of taxa-specific 2b-RAD tags from each GCF 230 | my $percent=sprintf "%.4f",$detected_tag_num/$GCF_all_theory_num;# the percentage of detected 2b-RAD tags from a GCF specific to a taxon 231 | print DE "$class\t$GCF\t$GCF_all_theory_num\t$detected_tag_num\t$percent\n"; 232 | } 233 | } 234 | close DE; 235 | # Output 236 | &CheckDir("$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}") if($verbose eq "yes"); 237 | open OU,">$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls" or die "cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}.xls\n"; 238 | print OU "$head\tTheoretical_Tag_Num\tSequenced_Tag_Num\tPercent\t"; 239 | print OU "Sequenced_Reads_Num\tSequenced_Reads_Num/Theoretical_Tag_Num\tSequenced_Reads_Num/Sequenced_Tag_Num\tSequenced_Tag_Num(depth>1)\t"; 240 | print OU "G_Score\n"; 241 | for my $class(keys %hs_tag_num){ 242 | if($verbose eq "yes"){ # output the detailed information on sequencing coverage of each 2b-RAD tag from a given genome detected in the real sample 243 
| my $output_name=(split /\t/,$class)[-1]; 244 | open DETAIL,">$outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}/$output_name.xls" or die "cannot open $outdir/$sample_name/$sample_name.$hs_site2enzyme{$site}/$output_name.xls\n"; 245 | } 246 | my ($Theoretical_Tag_Num,$Sequenced_Tag_Num,$Sequenced_Tag_Num_2,$Sequenced_Tag_Num2Theoretical_Tag_Num); 247 | my ($Sequenced_Reads_Num,$Sequenced_Reads_Num2Theoretical_Tag_Num,$Sequenced_Reads_Num2Sequenced_Tag_Num); 248 | my ($G_Score); 249 | $Sequenced_Tag_Num=$Sequenced_Reads_Num=$Sequenced_Tag_Num_2=0; 250 | for my $tag(keys %{$hs_tag_num{$class}}){# iterate each 2b-RAD tag 251 | $Sequenced_Tag_Num++; 252 | $Sequenced_Tag_Num_2++ if($hs_tag_num{$class}{$tag}>1);# compute the number of 2b-RAD tags that have the sequencing coverage >1 253 | $Sequenced_Reads_Num+=$hs_tag_num{$class}{$tag}; # the number of reads detected 254 | if($verbose eq "yes"){ 255 | print DETAIL "$tag\t$hs_tag_num{$class}{$tag}\n"; 256 | } 257 | } 258 | if($verbose eq "yes"){ 259 | close DETAIL; 260 | } 261 | # average number of theoretical 2b-RAD tags for each taxon 262 | my $species_all_theory_num; 263 | for my $GCF(keys %{$hs_tag_theory_num{$class}}){ 264 | for my $tag(keys %{$hs_tag_theory_num{$class}{$GCF}}){ 265 | $species_all_theory_num+=$hs_tag_theory_num{$class}{$GCF}{$tag}; 266 | } 267 | } 268 | $Theoretical_Tag_Num=$species_all_theory_num/(keys %{$hs_tag_theory_num{$class}});# average number of theoretical 2b-RAD tags for each taxon 269 | # statistical summmary 270 | $Sequenced_Tag_Num2Theoretical_Tag_Num=sprintf "%.8f",$Sequenced_Tag_Num/$Theoretical_Tag_Num*100;# 测到的标签占理论的百分比 271 | $Sequenced_Reads_Num2Theoretical_Tag_Num=sprintf "%.8f",$Sequenced_Reads_Num/$Theoretical_Tag_Num;# 测到的标签深度/理论标签数 272 | $Sequenced_Reads_Num2Sequenced_Tag_Num=sprintf "%.8f",$Sequenced_Reads_Num/$Sequenced_Tag_Num;# 测到的标签平均深度 273 | $G_Score=sprintf "%.8f",sqrt($Sequenced_Tag_Num*$Sequenced_Reads_Num);#compute the g_score for each taxon 274 | next if ($G_Score<$g_score_threshold);# filter taxa that have g_score < $g_score_threshold 275 | print OU "$class\t$Theoretical_Tag_Num\t$Sequenced_Tag_Num\t$Sequenced_Tag_Num2Theoretical_Tag_Num%\t"; 276 | print OU "$Sequenced_Reads_Num\t$Sequenced_Reads_Num2Theoretical_Tag_Num\t$Sequenced_Reads_Num2Sequenced_Tag_Num\t$Sequenced_Tag_Num_2\t"; 277 | print OU "$G_Score\n"; 278 | } 279 | close OU; 280 | undef %hs_tag_num; 281 | undef %hs_detected_GCF_tag; 282 | print STDOUT "###($sample_name) Sample identification completed, ",`date`; 283 | } 284 | close LI; 285 | 286 | # clean cache 287 | print STDOUT "### Cleaning the cached objects started, ",`date`; 288 | undef %hs_tag_theory_num; 289 | undef %hs_GCF2class; 290 | undef %hs_tag2GCF; 291 | print STDOUT "###Cleaning the cached objects completed, ",`date`; 292 | 293 | sub CheckDir{ 294 | my $file = shift; 295 | unless( -d $file ){ 296 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 297 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 298 | } 299 | return 1; 300 | } 301 | 302 | 303 | -------------------------------------------------------------------------------- /scripts/CreateQuanDatabase_2bRAD.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Authors: Zheng Sun, Rongchao Zhang, Shi Huang 3 | use warnings; 4 | use strict; 5 | use Getopt::Long; 6 | use FindBin qw($Bin); 7 | use File::Basename qw(dirname basename); 8 | use Cwd 'abs_path'; 9 | 10 | my $author="Zheng Sun, Rongchao Zhang, 
Shi Huang"; 11 | my $time="2020.12.16"; 12 | 13 | 14 | select STDOUT;$|=1;# cache cleaning 15 | 16 | my $remove_redundant ||="no";# 基因组内部是否去冗余 yes or no, default value is "no" 17 | 18 | my($list,$site,$type,$outdir,$enzyme_file,$help); 19 | GetOptions( 20 | "l:s" => \$list, 21 | "s:i" => \$site, 22 | "t:s" => \$type, 23 | "o:s" => \$outdir, 24 | 25 | "e:s" => \$enzyme_file,#酶切结果文件,或库文件 26 | "r:s" => \$remove_redundant, #基因组内部是否去冗余 27 | "h|help:s" => \$help, 28 | ); 29 | 30 | sub usage{# help 31 | print STDERR "\e[;33;1m 32 | DESCRIPTION 33 | It constructs the taxa-specific 2b-RAD reference genome database from a whole-genome reference database. 34 | USAGE 35 | perl $0 36 | PARAMETERS 37 | -l genome classification list (the line which begins with # will be ignored) 38 | eg:GCFidkingdomphylumclassorderfamilygenusspeciesstrain(genome_path) 39 | -e enzyme file or database file 40 | -s 2b restriction enzymes (sites). 41 | [1]CspCI [9]BplI 42 | [2]AloI [10]FalI 43 | [3]BsaXI [11]Bsp24I 44 | [4]BaeI [12]HaeIV 45 | [5]BcgI [13]CjePI 46 | [6]CjeI [14]Hin4I 47 | [7]PpiI [15]AlfI 48 | [8]PsrI [16]BslFI 49 | -t The database level. One or more taxonomy level of the 2b-RAD reference database can be specified: kingdom,phylum,class,order,family,genus,species,strain. Use 'all' for any levels. (comma separated). 50 | -o outdir (if not exists,it will be created) 51 | OPTION 52 | -r whether to delete redundant tags within the genome (yes or no) [default: $remove_redundant] 53 | -h|help print this help 54 | Author: $author 55 | Last update: $time\e[0m\n"; 56 | } 57 | 58 | if(defined($help)){ 59 | &usage; 60 | exit 0; 61 | } 62 | 63 | unless($list && $enzyme_file && $site && $type && $outdir){ 64 | &usage; 65 | print STDERR "para -l -e -s -t or -o error.\n"; 66 | exit 1; 67 | } 68 | 69 | #转化为绝对路径 70 | $list=abs_path($list); 71 | $outdir=abs_path($outdir); 72 | $enzyme_file=abs_path($enzyme_file); 73 | 74 | #check the parameter -r: using default value "no" 75 | unless($remove_redundant eq "yes" || $remove_redundant eq "no"){ 76 | &usage; 77 | print STDERR "-r parameter error: $remove_redundant\n"; 78 | exit 1; 79 | } 80 | 81 | 82 | #所有分类水平 83 | my %hs_type_database=( 84 | 'kingdom' => '1', 85 | 'phylum' => '2', 86 | 'class' => '3', 87 | 'order' => '4', 88 | 'family' => '5', 89 | 'genus' => '6', 90 | 'species' => '7', 91 | 'strain' => '8', 92 | ); 93 | # check the parameter -t: specify the taxonomic level of 2b-RAD genome database 94 | my %hs_type; 95 | if($type eq "all"){ 96 | %hs_type=%hs_type_database; 97 | }else{ 98 | my @tmp=split /,/,$type; 99 | for my $i(@tmp){ 100 | if(exists $hs_type_database{$i}){ 101 | $hs_type{$i}=$hs_type_database{$i}; 102 | }else{ 103 | &usage; 104 | print STDERR "-t parameter error: cannot find '$i'\n"; 105 | exit 1; 106 | } 107 | } 108 | } 109 | 110 | # check the parameter -s: 111 | my (@site,$enzyme); 112 | if( 1 == $site ){#CspCI 113 | @site = ( 114 | '[AGCT]{11}CAA[AGCT]{5}GTGG[AGCT]{10}', 115 | '[AGCT]{10}CCAC[AGCT]{5}TTG[AGCT]{11}', 116 | ); 117 | $enzyme="CspCI"; 118 | }elsif( 2 == $site ){#AloI 119 | @site = ( 120 | '[AGCT]{7}GAAC[AGCT]{6}TCC[AGCT]{7}', 121 | '[AGCT]{7}GGA[AGCT]{6}GTTC[AGCT]{7}', 122 | ); 123 | $enzyme="AloI"; 124 | }elsif( 3 == $site ){#BsaXI 125 | @site = ( 126 | '[AGCT]{9}AC[AGCT]{5}CTCC[AGCT]{7}', 127 | '[AGCT]{7}GGAG[AGCT]{5}GT[AGCT]{9}', 128 | ); 129 | $enzyme="BsaXI"; 130 | }elsif( 4 == $site ){#BaeI 131 | @site = ( 132 | '[AGCT]{10}AC[AGCT]{4}GTA[CT]C[AGCT]{7}', 133 | '[AGCT]{7}G[AG]TAC[AGCT]{4}GT[AGCT]{10}', 134 | ); 135 | $enzyme="BaeI"; 136 | 
}elsif( 5 == $site ){#BcgI 137 | @site = ( 138 | '[AGCT]{10}CGA[AGCT]{6}TGC[AGCT]{10}', 139 | '[AGCT]{10}GCA[AGCT]{6}TCG[AGCT]{10}', 140 | ); 141 | $enzyme="BcgI"; 142 | }elsif( 6 == $site ){#CjeI 143 | @site = ( 144 | '[AGCT]{8}CCA[AGCT]{6}GT[AGCT]{9}', 145 | '[AGCT]{9}AC[AGCT]{6}TGG[AGCT]{8}', 146 | ); 147 | $enzyme="CjeI"; 148 | }elsif( 7 == $site ){#PpiI 149 | @site = ( 150 | '[AGCT]{7}GAAC[AGCT]{5}CTC[AGCT]{8}', 151 | '[AGCT]{8}GAG[AGCT]{5}GTTC[AGCT]{7}', 152 | ); 153 | $enzyme="PpiI"; 154 | }elsif( 8 == $site ){#PsrI 155 | @site = ( 156 | '[AGCT]{7}GAAC[AGCT]{6}TAC[AGCT]{7}', 157 | '[AGCT]{7}GTA[AGCT]{6}GTTC[AGCT]{7}', 158 | ); 159 | $enzyme="PsrI"; 160 | }elsif( 9 == $site ){#BplI 161 | @site = ( 162 | '[AGCT]{8}GAG[AGCT]{5}CTC[AGCT]{8}', #palindromes 163 | ); 164 | $enzyme="BplI"; 165 | }elsif( 10 == $site ){#FalI 166 | @site = ( 167 | '[AGCT]{8}AAG[AGCT]{5}CTT[AGCT]{8}', #palindromes 168 | ); 169 | $enzyme="FalI"; 170 | }elsif( 11 == $site ){#Bsp24I 171 | @site = ( 172 | '[AGCT]{8}GAC[AGCT]{6}TGG[AGCT]{7}', 173 | '[AGCT]{7}CCA[AGCT]{6}GTC[AGCT]{8}', 174 | ); 175 | $enzyme="Bsp24I"; 176 | }elsif( 12 == $site ){#HaeIV 177 | @site = ( 178 | '[AGCT]{7}GA[CT][AGCT]{5}[AG]TC[AGCT]{9}', 179 | '[AGCT]{9}GA[CT][AGCT]{5}[AG]TC[AGCT]{7}', 180 | ); 181 | $enzyme="HaeIV"; 182 | }elsif( 13 == $site ){#CjePI 183 | @site = ( 184 | '[AGCT]{7}CCA[AGCT]{7}TC[AGCT]{8}', 185 | '[AGCT]{8}GA[AGCT]{7}TGG[AGCT]{7}', 186 | ); 187 | $enzyme="CjePI"; 188 | }elsif( 14 == $site ){#Hin4I 189 | @site = ( 190 | '[AGCT]{8}GA[CT][AGCT]{5}[GAC]TC[AGCT]{8}', 191 | '[AGCT]{8}GA[CTG][AGCT]{5}[AG]TC[AGCT]{8}', 192 | ); 193 | $enzyme="Hin4I"; 194 | }elsif( 15 == $site ){#AlfI 195 | @site = ( 196 | '[AGCT]{10}GCA[AGCT]{6}TGC[AGCT]{10}', #palindromes 197 | ); 198 | $enzyme="AlfI"; 199 | }elsif( 16 == $site ){#BslFI ??some question?? single enzyme 200 | @site = ( 201 | '[AGCT]{6}GGGAC[AGCT]{14}', 202 | '[AGCT]{14}GTCCC[AGCT]{6}', 203 | ); 204 | $enzyme="BslFI"; 205 | }else{ 206 | &usage; 207 | print STDERR "The parameter -s is wrong\n"; 208 | exit 1; 209 | } 210 | 211 | #提供酶切文件,检查文件是否存在 212 | if(defined($enzyme_file)){ 213 | unless(-e $enzyme_file){ 214 | print STDERR "[ERROR] $enzyme_file does not exist,please check.\n"; 215 | exit 1; 216 | } 217 | } 218 | 219 | 220 | #统计总的基因组个数 221 | my $genome_total_num=0; 222 | #my (%hash_gcf2class,%hash_gcf_rank); 223 | my %hash_gcf2class; 224 | if($list=~/\.gz/){ 225 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 226 | }else{ 227 | open LI,"$list" or die "cannot open $list\n"; 228 | } 229 | while(
  • ){ 230 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 231 | chomp; 232 | my @tmp=split /\t/; 233 | $genome_total_num++;#总基因组个数 234 | $hash_gcf2class{$tmp[0]}=$_;#gcf对应的分类信息 235 | # $hash_gcf_rank{$tmp[0]}=$genome_total_num;#记录基因组在列表中的排序,便于打印日志 236 | if(defined($enzyme_file)){#提供酶切文件 237 | ; 238 | }else{#不提供酶切文件 239 | $tmp[-1]=abs_path($tmp[-1]); 240 | unless(-e $tmp[-1]){#check the availability of a genome fasta file 241 | print STDERR "[ERROR] $tmp[-1] does not exist,please check your genome file\n"; 242 | exit 1; 243 | } 244 | } 245 | } 246 | close LI; 247 | 248 | if($genome_total_num==0){ 249 | print STDERR "[warning] There is no genome in the List file.\n"; 250 | exit 0; 251 | } 252 | 253 | 254 | &CheckDir("$outdir");# create the output directory 255 | #&CheckDir("$outdir/database"); 256 | 257 | print STDOUT "###($enzyme) Record the taxonomies of each 2b-RAD tag and identification of taxa-specifc 2b-RAD tags -- start, ",`date`;#STDOUT 258 | for my $level(sort {$hs_type{$a}<=>$hs_type{$b}} keys %hs_type){ #iterate all taxonomic levels of 2b-RAD database 259 | print STDOUT "###($level) Record the taxonomies of each 2b-RAD tag -- start, ",`date`;#STDOUT 260 | my (%hash_ingenome,%hash,%complete,%hash_gcf_rank); 261 | my %hash_seq;#记录基因组酶切所有标签 262 | $/=">"; 263 | if($enzyme_file=~/\.gz$/){#打开酶切文件 264 | open IN,"gzip -dc $enzyme_file|" or die "cannot open $enzyme_file\n"; 265 | }else{ 266 | open IN,"$enzyme_file" or die "cannot open $enzyme_file\n"; 267 | } 268 | ; 269 | while(){ 270 | chomp; 271 | my @tmp=split /\n/; 272 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 273 | my ($gcfid,$ingenome_tag_num,$scaid,$start,$chain,$unique)=split /\|/,$tmp[0]; 274 | my $tag=$tmp[1]; 275 | next unless(exists $hash_gcf2class{$gcfid});#gcfid不在列表中则跳过 276 | $hash_gcf_rank{$gcfid}++; 277 | $hash_seq{$gcfid}{$ingenome_tag_num}=join("\n",@tmp[0..1]);#记录列表中,基因组酶切的所有标签 278 | 279 | my @a=split /\t/,$hash_gcf2class{$gcfid};#分类 280 | my $class=join("\t",@a[1..$hs_type{$level}]);#concatenate the full taxonomic annotation 281 | if(exists $hash{$tag}){#判断在哈希中是否存在 282 | $hash{$tag}{$class}++;#记录标签分类信息 283 | $hash_ingenome{$gcfid}{$tag}++ if($remove_redundant eq "yes");#如果需要去除基因组内部冗余,则记录标签在基因组内部是否冗余 284 | }else{#反向互补处理 285 | $tag=~tr/ATCG/TAGC/; 286 | $tag=reverse($tag); 287 | $hash{$tag}{$class}++;#记录标签分类信息 288 | $hash_ingenome{$gcfid}{$tag}++ if($remove_redundant eq "yes");#如果需要去除基因组内部冗余,则记录标签在基因组内部是否冗余 289 | } 290 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 291 | if((keys %hash_gcf_rank)/$genome_total_num*100>=$i){ 292 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 293 | $complete{$i}++; 294 | last; 295 | } 296 | } 297 | } 298 | close IN; 299 | $/="\n"; 300 | undef %hash_gcf_rank; 301 | print STDOUT "\n###($level) Record the taxonomies of each 2b-RAD tag -- complete, ",`date`;# STDOUT 302 | 303 | print STDOUT "###($level) Identification of taxa-specifc 2b-RAD tags -- start, ",`date`;# STDOUT 304 | undef %complete;#完成进度清空 305 | my (%hash_genome_tag_num,%hash_genome_unique_tag_num); 306 | my $complete=0; 307 | if($list=~/\.gz/){ 308 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 309 | }else{ 310 | open LI,"$list" or die "cannot open $list\n"; 311 | } 312 | open OU,"|gzip > $outdir/$enzyme.$level.fa.gz" or die "cannot open $outdir/$enzyme.$level.fa.gz\n"; 313 | while(
  • ){ 314 | next if(/^#/ || /^#/);# remove blank lines or lines starting with # 315 | my $line=$_; 316 | chomp($line); 317 | my $gcfid=(split /\t/,$line)[0]; 318 | next unless(exists $hash_seq{$gcfid}); 319 | for my $i(sort {$a<=>$b} keys %{$hash_seq{$gcfid}}){#循环标签 320 | my @tmp=split /\n/,$hash_seq{$gcfid}{$i};#id\nseq 321 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 322 | my ($gcfid,$ingenome_tag_num,$scaid,$start,$chain,$unique)=split /\|/,$tmp[0]; 323 | my $tag=$tmp[1]; 324 | $gcfid=~s/^>//; 325 | unless(exists $hash{$tag}){#如果不存在,则进行反向互补 326 | $tag=~tr/ATCG/TAGC/; 327 | $tag=reverse($tag); 328 | #反向互补后,改变链的方向 329 | if($chain==0){ 330 | $chain=1; 331 | }elsif($chain==1){ 332 | $chain=0; 333 | } 334 | } 335 | if(keys %{$hash{$tag}}==1){#指定水平下为unique 336 | if($remove_redundant eq "yes"){#基因组内部需要去冗余 337 | if($hash_ingenome{$gcfid}{$tag}==1){#在基因组内部只出现过一次(noredundancy) 338 | $unique=1; 339 | $hash_genome_unique_tag_num{$gcfid}++;#基因组电子酶切unique标签数 340 | }else{ 341 | $unique=0; 342 | } 343 | }else{#基因组内部不需要去冗余 344 | $unique=1; 345 | $hash_genome_unique_tag_num{$gcfid}++;#基因组电子酶切unique标签数 346 | } 347 | }else{ 348 | $unique=0; 349 | } 350 | print OU ">$gcfid|$ingenome_tag_num|$scaid|$start|$chain|$unique\n$tag\n" if($unique==1); 351 | } 352 | $complete++; 353 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 354 | if($complete/$genome_total_num*100>=$i){ 355 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 356 | $complete{$i}++; 357 | last; 358 | } 359 | } 360 | 361 | } 362 | close LI; 363 | close OU; 364 | #统计输出 365 | open STAT,"> $outdir/$enzyme.$level.stat.xls" or die "cannot open $outdir/$enzyme.$level.stat.xls\n"; 366 | print STAT "#Unique_Name\tAll_Tag_Num\tUnique_Tag_Num\n"; 367 | if($list=~/\.gz/){ 368 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 369 | }else{ 370 | open LI,"$list" or die "cannot open $list\n"; 371 | } 372 | while(
  • ){ 373 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 374 | my $line=$_; 375 | chomp($line); 376 | my @tmp=split /\t/,$line; 377 | print STAT "$tmp[0]"; 378 | if(exists $hash_seq{$tmp[0]}){#酶切标签数 379 | my $genome_tag_num=keys %{$hash_seq{$tmp[0]}}; 380 | print STAT "\t$genome_tag_num"; 381 | }else{ 382 | print STAT "\t0"; 383 | } 384 | if(exists $hash_genome_unique_tag_num{$tmp[0]}){#unique标签数 385 | print STAT "\t$hash_genome_unique_tag_num{$tmp[0]}\n"; 386 | }else{ 387 | print STAT "\t0\n"; 388 | } 389 | # print STAT "$tmp[0]\t$hash_genome_tag_num{$tmp[0]}\t$hash_genome_unique_tag_num{$tmp[0]}\n"; 390 | } 391 | close LI; 392 | close STAT; 393 | print STDOUT "\n###($level) Identification of taxa-specifc 2b-RAD tags -- complete, ",`date`; #STDOUT 394 | } 395 | 396 | print STDOUT "###($enzyme) Record the taxonomies of each 2b-RAD tag and identification of taxa-specifc 2b-RAD tags -- complete, ",`date`;#STDOUT 397 | 398 | 399 | 400 | 401 | sub CheckDir{# create the directory 402 | my $file = shift; 403 | unless( -d $file ){ 404 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 405 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 406 | } 407 | return 1; 408 | } 409 | -------------------------------------------------------------------------------- /scripts/CreateQualDatabase_2bRAD.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Authors: Zheng Sun, Rongchao Zhang, Shi Huang 3 | use warnings; 4 | use strict; 5 | use Getopt::Long; 6 | use FindBin qw($Bin); 7 | use File::Basename qw(dirname basename); 8 | use Cwd 'abs_path'; 9 | 10 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 11 | my $time="2020.12.15"; 12 | 13 | 14 | select STDOUT;$|=1;# cache cleaning 15 | 16 | my $remove_redundant ||="yes";# 基因组内部是否去冗余 yes or no, default value is "no" 17 | 18 | my($list,$site,$type,$outdir,$enzyme_file,$help); 19 | GetOptions( 20 | "l:s" => \$list, 21 | "s:i" => \$site, 22 | "t:s" => \$type, 23 | "o:s" => \$outdir, 24 | 25 | "e:s" => \$enzyme_file,#酶切结果文件,或库文件 26 | "r:s" => \$remove_redundant, #基因组内部是否去冗余 27 | "h|help:s" => \$help, 28 | ); 29 | 30 | sub usage{# help 31 | print STDERR "\e[;33;1m 32 | DESCRIPTION 33 | It constructs the taxa-specific 2b-RAD reference genome database from a whole-genome reference database. 34 | USAGE 35 | perl $0 36 | PARAMETERS 37 | -l genome classification list (the line which begins with # will be ignored) 38 | eg:GCFidkingdomphylumclassorderfamilygenusspeciesstrain(genome_path) 39 | -s One or multiple type 2b restriction enzymes (sites). 40 | [1]CspCI [9]BplI 41 | [2]AloI [10]FalI 42 | [3]BsaXI [11]Bsp24I 43 | [4]BaeI [12]HaeIV 44 | [5]BcgI [13]CjePI 45 | [6]CjeI [14]Hin4I 46 | [7]PpiI [15]AlfI 47 | [8]PsrI [16]BslFI 48 | -t The database level. One or more taxonomy level of the 2b-RAD reference database can be specified: kingdom,phylum,class,order,family,genus,species,strain. Use 'all' for any levels. (comma separated). 
49 | -o outdir (if not exists,it will be created) 50 | OPTION 51 | -e enzyme file or database file 52 | -r whether to delete redundant tags within the genome (yes or no) [default: $remove_redundant] 53 | -h|help print this help 54 | Author: $author 55 | Last update: $time\e[0m\n"; 56 | } 57 | 58 | if(defined($help)){ 59 | &usage; 60 | exit 0; 61 | } 62 | 63 | unless($list && $site && $type && $outdir){ 64 | &usage; 65 | print STDERR "para -l -s -t or -o error.\n"; 66 | exit 1; 67 | } 68 | 69 | #转化为绝对路径 70 | $list=abs_path($list); 71 | $outdir=abs_path($outdir); 72 | 73 | #check the parameter -r: using default value "no" 74 | unless($remove_redundant eq "yes" || $remove_redundant eq "no"){ 75 | &usage; 76 | print STDERR "-r parameter error: $remove_redundant\n"; 77 | exit 1; 78 | } 79 | 80 | 81 | #所有分类水平 82 | my %hs_type_database=( 83 | 'kingdom' => '1', 84 | 'phylum' => '2', 85 | 'class' => '3', 86 | 'order' => '4', 87 | 'family' => '5', 88 | 'genus' => '6', 89 | 'species' => '7', 90 | 'strain' => '8', 91 | ); 92 | # check the parameter -t: specify the taxonomic level of 2b-RAD genome database 93 | my %hs_type; 94 | if($type eq "all"){ 95 | %hs_type=%hs_type_database; 96 | }else{ 97 | my @tmp=split /,/,$type; 98 | for my $i(@tmp){ 99 | if(exists $hs_type_database{$i}){ 100 | $hs_type{$i}=$hs_type_database{$i}; 101 | }else{ 102 | &usage; 103 | print STDERR "-t parameter error: cannot find '$i'\n"; 104 | exit 1; 105 | } 106 | } 107 | } 108 | 109 | # check the parameter -s: 110 | my (@site,$enzyme); 111 | if( 1 == $site ){#CspCI 112 | @site = ( 113 | '[AGCT]{11}CAA[AGCT]{5}GTGG[AGCT]{10}', 114 | '[AGCT]{10}CCAC[AGCT]{5}TTG[AGCT]{11}', 115 | ); 116 | $enzyme="CspCI"; 117 | }elsif( 2 == $site ){#AloI 118 | @site = ( 119 | '[AGCT]{7}GAAC[AGCT]{6}TCC[AGCT]{7}', 120 | '[AGCT]{7}GGA[AGCT]{6}GTTC[AGCT]{7}', 121 | ); 122 | $enzyme="AloI"; 123 | }elsif( 3 == $site ){#BsaXI 124 | @site = ( 125 | '[AGCT]{9}AC[AGCT]{5}CTCC[AGCT]{7}', 126 | '[AGCT]{7}GGAG[AGCT]{5}GT[AGCT]{9}', 127 | ); 128 | $enzyme="BsaXI"; 129 | }elsif( 4 == $site ){#BaeI 130 | @site = ( 131 | '[AGCT]{10}AC[AGCT]{4}GTA[CT]C[AGCT]{7}', 132 | '[AGCT]{7}G[AG]TAC[AGCT]{4}GT[AGCT]{10}', 133 | ); 134 | $enzyme="BaeI"; 135 | }elsif( 5 == $site ){#BcgI 136 | @site = ( 137 | '[AGCT]{10}CGA[AGCT]{6}TGC[AGCT]{10}', 138 | '[AGCT]{10}GCA[AGCT]{6}TCG[AGCT]{10}', 139 | ); 140 | $enzyme="BcgI"; 141 | }elsif( 6 == $site ){#CjeI 142 | @site = ( 143 | '[AGCT]{8}CCA[AGCT]{6}GT[AGCT]{9}', 144 | '[AGCT]{9}AC[AGCT]{6}TGG[AGCT]{8}', 145 | ); 146 | $enzyme="CjeI"; 147 | }elsif( 7 == $site ){#PpiI 148 | @site = ( 149 | '[AGCT]{7}GAAC[AGCT]{5}CTC[AGCT]{8}', 150 | '[AGCT]{8}GAG[AGCT]{5}GTTC[AGCT]{7}', 151 | ); 152 | $enzyme="PpiI"; 153 | }elsif( 8 == $site ){#PsrI 154 | @site = ( 155 | '[AGCT]{7}GAAC[AGCT]{6}TAC[AGCT]{7}', 156 | '[AGCT]{7}GTA[AGCT]{6}GTTC[AGCT]{7}', 157 | ); 158 | $enzyme="PsrI"; 159 | }elsif( 9 == $site ){#BplI 160 | @site = ( 161 | '[AGCT]{8}GAG[AGCT]{5}CTC[AGCT]{8}', #palindromes 162 | ); 163 | $enzyme="BplI"; 164 | }elsif( 10 == $site ){#FalI 165 | @site = ( 166 | '[AGCT]{8}AAG[AGCT]{5}CTT[AGCT]{8}', #palindromes 167 | ); 168 | $enzyme="FalI"; 169 | }elsif( 11 == $site ){#Bsp24I 170 | @site = ( 171 | '[AGCT]{8}GAC[AGCT]{6}TGG[AGCT]{7}', 172 | '[AGCT]{7}CCA[AGCT]{6}GTC[AGCT]{8}', 173 | ); 174 | $enzyme="Bsp24I"; 175 | }elsif( 12 == $site ){#HaeIV 176 | @site = ( 177 | '[AGCT]{7}GA[CT][AGCT]{5}[AG]TC[AGCT]{9}', 178 | '[AGCT]{9}GA[CT][AGCT]{5}[AG]TC[AGCT]{7}', 179 | ); 180 | $enzyme="HaeIV"; 181 | }elsif( 13 == $site ){#CjePI 182 | @site = ( 183 
| '[AGCT]{7}CCA[AGCT]{7}TC[AGCT]{8}', 184 | '[AGCT]{8}GA[AGCT]{7}TGG[AGCT]{7}', 185 | ); 186 | $enzyme="CjePI"; 187 | }elsif( 14 == $site ){#Hin4I 188 | @site = ( 189 | '[AGCT]{8}GA[CT][AGCT]{5}[GAC]TC[AGCT]{8}', 190 | '[AGCT]{8}GA[CTG][AGCT]{5}[AG]TC[AGCT]{8}', 191 | ); 192 | $enzyme="Hin4I"; 193 | }elsif( 15 == $site ){#AlfI 194 | @site = ( 195 | '[AGCT]{10}GCA[AGCT]{6}TGC[AGCT]{10}', #palindromes 196 | ); 197 | $enzyme="AlfI"; 198 | }elsif( 16 == $site ){#BslFI ??some question?? single enzyme 199 | @site = ( 200 | '[AGCT]{6}GGGAC[AGCT]{14}', 201 | '[AGCT]{14}GTCCC[AGCT]{6}', 202 | ); 203 | $enzyme="BslFI"; 204 | }else{ 205 | &usage; 206 | print STDERR "The parameter -s is wrong\n"; 207 | exit 1; 208 | } 209 | 210 | #提供酶切文件,检查文件是否存在 211 | if(defined($enzyme_file)){ 212 | $enzyme_file=abs_path($enzyme_file); 213 | unless(-e $enzyme_file){ 214 | print STDERR "[ERROR] $enzyme_file does not exist,please check.\n"; 215 | exit 1; 216 | } 217 | } 218 | 219 | 220 | #统计总的基因组个数和不提供酶切文件时检测基因组路径是否存在 221 | my $genome_total_num=0; 222 | #my (%hash_gcf2class,%hash_gcf_rank); 223 | my %hash_gcf2class; 224 | if($list=~/\.gz/){ 225 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 226 | }else{ 227 | open LI,"$list" or die "cannot open $list\n"; 228 | } 229 | while(
  • ){ 230 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 231 | chomp; 232 | my @tmp=split /\t/; 233 | $genome_total_num++; 234 | $hash_gcf2class{$tmp[0]}=$_;#gcf对应的分类信息 235 | # $hash_gcf_rank{$tmp[0]}=$genome_total_num;#记录基因组在列表中的排序,便于打印日志 236 | if(defined($enzyme_file)){#提供酶切文件 237 | ; 238 | }else{#不提供酶切文件 239 | $tmp[-1]=abs_path($tmp[-1]); 240 | unless(-e $tmp[-1]){#check the availability of a genome fasta file 241 | print STDERR "[ERROR] $tmp[-1] does not exist,please check your genome file\n"; 242 | exit 1; 243 | } 244 | } 245 | } 246 | close LI; 247 | 248 | &CheckDir("$outdir");# create the output directory 249 | #&CheckDir("$outdir/database"); 250 | unless(defined($enzyme_file)){#未提供酶切文件则进行酶切 251 | print STDOUT "###($enzyme) Electron digestion -- start, ",`date`;# output the log file 252 | my $complete=0; 253 | my %complete; 254 | $enzyme_file="$outdir/$enzyme.enzyme.fa.gz"; 255 | open OU,"|gzip > $outdir/$enzyme.enzyme.fa.gz" or die "cannot open $outdir/$enzyme.enzyme.fa.gz\n"; 256 | if($list=~/\.gz/){ 257 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 258 | }else{ 259 | open LI,"$list" or die "cannot open $list\n"; 260 | } 261 | while(
  • ){ 262 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 263 | my $line=$_; 264 | chomp($line); 265 | my @tmp=split /\t/,$line; 266 | $tmp[-1]=abs_path($tmp[-1]); 267 | $/=">"; 268 | if($tmp[-1]=~/\.gz$/){ 269 | open IN,"gzip -dc $tmp[-1]|" or die "cannot open $tmp[-1]\n"; 270 | }else{ 271 | open IN,"$tmp[-1]" or die "cannot open $tmp[-1]\n"; 272 | } 273 | ; 274 | my $ingenome_tag_num=0; 275 | while(){ 276 | chomp; 277 | my @a=split /\n/; 278 | my $id=(split /\s+/,$a[0])[0];#scaffold id 279 | my $seq=join("",@a[1..$#a]);#scaffold seq 280 | $seq=uc($seq); # convert the lowercase to uppercase bases 小写碱基转换为大写 281 | my %hash_genome; 282 | for my $i(0..$#site){# iterate all restriction sites 循环酶切位点 283 | while($seq=~/($site[$i])/g){# digital digestion 284 | my $tag=$1;#序列 285 | my $len=length($tag);#标签长度 286 | my $pos=pos($seq); 287 | my $start=$pos-$len+1;#标签起始位置 288 | pos($seq)=$start;#调整位置 289 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 290 | $hash_genome{$start}=">$tmp[0]|0|$id|$start|0|-\n$tag\n"; 291 | } 292 | } 293 | # sort the 2b-RAD tags by the genome positions排序后输出 294 | for my $pos(sort {$a <=> $b} keys %hash_genome){ 295 | $ingenome_tag_num++; 296 | my @a=split /\|/,$hash_genome{$pos}; 297 | print OU "$a[0]|$ingenome_tag_num|",join("|",@a[2..$#a]); 298 | } 299 | undef %hash_genome; 300 | } 301 | close IN; 302 | $/="\n"; 303 | $complete++;#完成处理的基因组个数 304 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 305 | if($complete/$genome_total_num*100>=$i){ 306 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 307 | $complete{$i}++; 308 | last; 309 | } 310 | } 311 | } 312 | close LI; 313 | close OU; 314 | print STDOUT "\n###($enzyme) Electron digestion -- complete, ",`date`;# output the log file 315 | }else{#提供酶切文件,跳过电子酶切 316 | print STDOUT "An enzyme digestion file has been provided, skipping electron digestion.\n"; 317 | } 318 | 319 | 320 | print STDOUT "###($enzyme) Record the taxonomies of each 2b-RAD tag and identification of taxa-specifc 2b-RAD tags -- start, ",`date`;#STDOUT 321 | for my $level(sort {$hs_type{$a}<=>$hs_type{$b}} keys %hs_type){ #iterate all taxonomic levels of 2b-RAD database 322 | print STDOUT "###($level) Record the taxonomies of each 2b-RAD tag -- start, ",`date`;#STDOUT 323 | my (%hash_ingenome,%hash,%complete,%hash_gcf_rank); 324 | $/=">"; 325 | if($enzyme_file=~/\.gz$/){#打开酶切文件 326 | open IN,"gzip -dc $enzyme_file|" or die "cannot open $enzyme_file\n"; 327 | }else{ 328 | open IN,"$enzyme_file" or die "cannot open $enzyme_file\n"; 329 | } 330 | ; 331 | while(){ 332 | chomp; 333 | my @tmp=split /\n/; 334 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 335 | my ($gcfid,$ingenome_tag_num,$scaid,$start,$chain,$unique)=split /\|/,$tmp[0]; 336 | my $tag=$tmp[1]; 337 | $gcfid=~s/^>//; 338 | next unless(exists $hash_gcf2class{$gcfid});#gcfid不在列表中则跳过 339 | $hash_gcf_rank{$gcfid}++; 340 | my @a=split /\t/,$hash_gcf2class{$gcfid}; 341 | my $class=join("\t",@a[1..$hs_type{$level}]);#concatenate the full taxonomic annotation 342 | if(exists $hash{$tag}){#判断在哈希中是否存在 343 | $hash{$tag}{$class}++;#记录标签分类信息 344 | $hash_ingenome{$gcfid}{$tag}++ if($remove_redundant eq "yes");#如果需要去除基因组内部冗余,则记录标签在基因组内部是否冗余 345 | }else{#反向互补处理 346 | $tag=~tr/ATCG/TAGC/; 347 | $tag=reverse($tag); 348 | $hash{$tag}{$class}++;#记录标签分类信息 349 | $hash_ingenome{$gcfid}{$tag}++ if($remove_redundant eq "yes");#如果需要去除基因组内部冗余,则记录标签在基因组内部是否冗余 350 | } 351 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 352 | if((keys 
%hash_gcf_rank)/$genome_total_num*100>=$i){ 353 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 354 | $complete{$i}++; 355 | last; 356 | } 357 | } 358 | } 359 | close IN; 360 | $/="\n"; 361 | undef %hash_gcf_rank; 362 | print STDOUT "\n###($level) Record the taxonomies of each 2b-RAD tag -- complete, ",`date`;# STDOUT 363 | 364 | print STDOUT "###($level) Identification of taxa-specifc 2b-RAD tags -- start, ",`date`;# STDOUT 365 | undef %complete;#完成进度清空 366 | my (%hash_genome_tag_num,%hash_genome_unique_tag_num); 367 | $/=">"; 368 | if($enzyme_file=~/\.gz$/){#打开酶切文件 369 | open IN,"gzip -dc $enzyme_file|" or die "cannot open $enzyme_file\n"; 370 | }else{ 371 | open IN,"$enzyme_file" or die "cannot open $enzyme_file\n"; 372 | } 373 | open OU,"|gzip > $outdir/$enzyme.$level.fa.gz" or die "cannot open $outdir/$enzyme.$level.fa.gz\n"; 374 | ; 375 | while(){ 376 | chomp; 377 | my @tmp=split /\n/; 378 | #GCF号|基因组内部标签排序|scaffoldid|startpos|正反向酶切|是否为指定水平下unique&&noredundancy标签 379 | my ($gcfid,$ingenome_tag_num,$scaid,$start,$chain,$unique)=split /\|/,$tmp[0]; 380 | my $tag=$tmp[1]; 381 | $gcfid=~s/^>//; 382 | next unless(exists $hash_gcf2class{$gcfid});#gcfid不在列表中则跳过 383 | $hash_genome_tag_num{$gcfid}++;#基因组电子酶切标签数 384 | unless(exists $hash{$tag}){#如果不存在,则进行反向互补 385 | $tag=~tr/ATCG/TAGC/; 386 | $tag=reverse($tag); 387 | #反向互补后,改变链的方向 388 | if($chain==0){ 389 | $chain=1; 390 | }elsif($chain==1){ 391 | $chain=0; 392 | } 393 | } 394 | if(keys %{$hash{$tag}}==1){#指定水平下为unique 395 | if($remove_redundant eq "yes"){#基因组内部需要去冗余 396 | if($hash_ingenome{$gcfid}{$tag}==1){#在基因组内部只出现过一次(noredundancy) 397 | $unique=1; 398 | $hash_genome_unique_tag_num{$gcfid}++;#基因组电子酶切unique标签数 399 | }else{ 400 | $unique=0; 401 | } 402 | }else{#基因组内部不需要去冗余 403 | $unique=1; 404 | $hash_genome_unique_tag_num{$gcfid}++;#基因组电子酶切unique标签数 405 | } 406 | }else{ 407 | $unique=0; 408 | } 409 | print OU ">$gcfid|$ingenome_tag_num|$scaid|$start|$chain|$unique\n$tag\n"; 410 | for (my $i=100;$i>0;$i=$i-1){ #每完成1%则输出进度 411 | if((keys %hash_genome_tag_num)/$genome_total_num*100>=$i){ 412 | print STDOUT "$i% " unless(exists $complete{$i});#仅没输出过的才会输出日志 413 | $complete{$i}++; 414 | last; 415 | } 416 | } 417 | } 418 | $/="\n"; 419 | close IN; 420 | close OU; 421 | open STAT,"> $outdir/$enzyme.$level.stat.xls" or die "cannot open $outdir/$enzyme.$level.stat.xls\n"; 422 | print STAT "#Unique_Name\tAll_Tag_Num\tUnique_Tag_Num\n"; 423 | if($list=~/\.gz/){ 424 | open LI,"gzip -dc $list|" or die "cannot open $list\n"; 425 | }else{ 426 | open LI,"$list" or die "cannot open $list\n"; 427 | } 428 | while(
  • ){ 429 | next if(/^#/ || /^$/);# remove blank lines or lines starting with # 430 | my $line=$_; 431 | chomp($line); 432 | my @tmp=split /\t/,$line; 433 | print STAT "$tmp[0]"; 434 | if(exists $hash_genome_tag_num{$tmp[0]}){#酶切标签数 435 | print STAT "\t$hash_genome_tag_num{$tmp[0]}"; 436 | }else{ 437 | print STAT "\t0"; 438 | } 439 | if(exists $hash_genome_unique_tag_num{$tmp[0]}){#unique标签数 440 | print STAT "\t$hash_genome_unique_tag_num{$tmp[0]}\n"; 441 | }else{ 442 | print STAT "\t0\n"; 443 | } 444 | # print STAT "$tmp[0]\t$hash_genome_tag_num{$tmp[0]}\t$hash_genome_unique_tag_num{$tmp[0]}\n"; 445 | } 446 | close LI; 447 | close STAT; 448 | print STDOUT "\n###($level) Identification of taxa-specifc 2b-RAD tags -- complete, ",`date`; #STDOUT 449 | } 450 | 451 | print STDOUT "###($enzyme) Record the taxonomies of each 2b-RAD tag and identification of taxa-specifc 2b-RAD tags -- complete, ",`date`;#STDOUT 452 | 453 | 454 | 455 | 456 | sub CheckDir{# create the directory 457 | my $file = shift; 458 | unless( -d $file ){ 459 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 460 | else{print STDERR "$file not exists and cannot be built\n";exit 1;} 461 | } 462 | return 1; 463 | } 464 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2bRAD-M 2 | ---------------------------- 3 | This repository provides the 2bRAD-M computational pipeline for microbiome analysis, which has been formally published on Genome Biology: 4 | 5 | [Species-resolved sequencing of low-biomass or degraded microbiomes using 2bRAD-M](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02576-9 6 | ) by Zheng Sun, Shi Huang, Pengfei Zhu, Lam Tzehau, Helen Zhao, Jia Lv, Rongchao Zhang, Lisha Zhou, Qianya Niu, Xiuping Wang, Meng Zhang, Gongchao Jing, Zhenmin Bao, Jiquan Liu, Shi Wang, Jian Xu. Genome Biology, doi: https://doi.org/10.1186/s13059-021-02576-9 7 | 8 | ## How it works 9 | The principle of 2bRAD-M on microbiome analyses: 10 | (1) reliable enzyme-digested sequence tags can be derived that are specific to high-resolution taxa (e.g., species or strain) yet universally applicable for a broad range or all of bacterial, archaeal and fungal genomes; 11 | (2) these taxa-specific, iso-length sequence tags can be evenly amplified and sequenced; 12 | (3) the tag sequences can be mapped to reference genomes to reconstruct faithfully the taxonomic composition. 13 | 14 | You can also find more details for the 2bRAD-M workflow below. 15 | 16 | ![workflow](2bRAD-M_workflow.png) 17 | 18 | * The experimental workflow has two steps: 19 | 20 | (1) BcgI (a commercially available Type IIB restriction enzymes) is used, as an example, to digest total genomic DNA extracted from microbiome samples. BcgI recognizes the sequence of CGA-N6-TGC in the genomic DNA and cleaves on both upstream (12-10 bp) and downstream (10-12 bp) of this signature, producing short and iso-length DNA (32bp without sticky ends) across all loci. 21 | 22 | (2) These so-called “2bRAD fragments” are ligated to adaptors, amplified and then sequenced. 23 | 24 | * The computational workflow. The foundation here is a unique 2bRAD tag database (“2b-Tag-DB”), which contains taxa-specific 2bRAD tags identified from all the sequenced bacteria, fungi and archaea genomes. Mapping the 2bRAD reads against 2b-Tag-DB thus identifies the presence of species in a sample. 
Subsequently, to estimate the relative abundance of the identified taxa, the mean read coverage of all 2bRAD tags specific to each taxon is derived. To improve the utilization rate of reads and classification accuracy, a secondary, sample-specific 2b-Tag-DB is dynamically derived from only those candidate taxa identified in a particular sample, which produces more species-specific 2bRAD tags than the original 2b-Tag-DB and results in more accurate modeling of relative abundance of taxa. 25 | 26 | ## Installation 27 | 28 | ### System requirements 29 | 30 | #### Dependencies 31 | All scripts in 2bRAD-M are written in Perl and are recommended to run in a conda environment. This program should work properly on Unix systems or Mac OS X, as all required packages can be appropriately downloaded and installed. 32 | 33 | #### Disk space 34 | Construction of a 2bRAD-M standard database (i.e., 2b-Tag-DB) requires approximately 10 GB of disk space. 35 | 36 | #### Memory usage 37 | Running the standard pipeline requires < 30 GB of RAM, and it is also compatible with multithreading. For example, the BcgI-derived (default) database size is 9.32 GB, and you will need more than that in RAM if you want to build the default database. In an early test, peak memory usage reached up to 29 GB. 38 | 39 | #### Speed 40 | About 20 minutes are required for loading the 2b-Tag-DB. For a typical gut metagenome, ~40 minutes are required for species profiling. 41 | 42 | ### Download the pipeline 43 | * Clone the latest version from GitHub (recommended): 44 | 45 | `git clone https://github.com/shihuang047/2bRAD-M/` 46 | `cd 2bRAD-M` 47 | 48 | This makes it easy to update the software in the future using `git pull` as bugs are fixed and features are added. 49 | * Alternatively, directly download the whole GitHub repo without installing git: 50 | 51 | `wget https://github.com/shihuang047/2bRAD-M/archive/master.zip` 52 | `unzip master.zip` 53 | `cd 2bRAD-M-master` 54 | 55 | ### Install the 2bRAD-M pipeline in a conda environment 56 | * Conda installation 57 | [Miniconda](https://docs.conda.io/en/latest/miniconda.html) provides the conda environment and package manager, and is the recommended way to install 2bRAD-M. 58 | * Create a conda environment for the 2bRAD-M pipeline: 59 | After installing Miniconda and opening a new terminal, make sure you’re running the latest version of conda: 60 | 61 | `conda update conda` 62 | 63 | Once you have Miniconda installed, create a conda environment with the yml file `tools/2bRAD-M-20201225-conda.yml`. 
64 | 65 | `conda env create -n 2bRAD-M-20201225 --file tools/2bRAD-M-20201225-conda.yml` 66 | 67 | * Activate the 2bRAD-M conda environment by running the following command: 68 | 69 | `source activate 2bRAD-M-20201225` 70 | 71 | ### Construct the reference 2B-Tag database (required) 72 | 73 | The script `tools/Download_2bRADTagDB_NCBI.pl` in this repo can be used to: 74 | 75 | * download the prebuilt 2b-Tag-DB from Figshare based on the NCBI Refseq (Oct., 2019) or GTDB 76 | * download the example datasets for the pipeline tutorial 77 | 78 | You can specify $your_database_path locally (`$your_database_path=./2B-RAD-M-ref_db_NCBI/` or `$your_database_path=./2B-RAD-M-ref_db_GTDB/`) and run the script as follows: 79 | 80 | `perl tools/Download_2bRADTagDB_NCBI.pl $your_database_path` or 81 | 82 | `perl tools/Download_2bRADTagDB_GTDB.pl $your_database_path` 83 | 84 | It usually takes around 30 minutes to save all files in `$your_database_path`, but this depends on your internet connection speed and stability. 85 | 86 | ## 2bRAD-M pipeline tutorial 87 | 88 | ### **Overview** 89 | 90 | The 2bRAD-M analysis pipeline comprises a combination of 2bRAD-M scripts and optimized parameters for analyzing 2bRAD or shotgun metagenomic sequencing data, and it produces the most comprehensive output for each sample. The pipeline includes: 91 | 92 | (1) **Digital restriction digestion** This step is required when the input DNA sequences are longer than 31 bp or 33 bp (e.g., 150 bp) or derived from common shotgun sequencing protocols. If the input DNA sequences were produced by the 2bRAD sequencing protocol, this step is skipped. 93 | 94 | (2) **Qualitative analysis** Identify the microbes and preliminarily estimate their abundances based on the 2bRAD (e.g., BcgI-derived) species-specific markers of a prebuilt 2b-Tag-DB based on the NCBI Refseq (Oct., 2019). 95 | 96 | (3) **Quantitative analysis** Estimate the microbial abundances more precisely based on the 2bRAD species-specific markers in a sample-specific 2b-Tag-DB. First, we fetch the candidate genomes identified in a particular biological sample in step (2) from NCBI Refseq to construct a sample-targeted 2b-Tag-DB. Next, we remap the sequencing reads to this more concise 2b-Tag-DB to estimate the abundance of all detected taxa and use the G score to filter potential false-positive microbial features. 97 | 98 | (4) **Merging results from multiple samples** The sample-wise results will be automatically merged into a feature table. 99 | 100 | ### **Usage** 101 | 102 | The main script for implementing these analyses is `bin/2bRADM_Pipline.pl` in this repo. You can check out the usage by printing the help information via `perl bin/2bRADM_Pipline.pl -h`. 103 | 104 | ``` 105 | DESCRIPTION 106 | We here provided a streamlined 2bRAD pipeline for analyzing microbial compositions from the 2bRAD/shotgun metagenomics data based on the species-specific 2bRAD markers. 107 | USAGE 108 | perl bin/2bRADM_Pipline.pl 109 | PARAMETERS 110 | -t The acceptable types of an input sequencing data file. The file path should be also listed in the sample list file (para -l). 111 | [1] generic genome data in a fasta format 112 | [2] shotgun metagenomic data in a fastq format(either SE or PE platform is accepted) 113 | [3] 2bRAD data from a SE sequencing platform in a fastq format 114 | [4] 2bRAD data from a PE sequencing platform in a fastq format 115 | -l The filepath of the sample list. 
Each line includes an input sample ID and the file path of the corresponding DNA sequence data, where each field should be separated by <Tab>. A line in this file that begins with # will be ignored. Only four formats of a sample list file are accepted and should match with parameter -t: 116 | [1] sample<Tab>sample.fa(.gz) 117 | [2] sample<Tab>shotgun.1.fq(.gz)<Tab>(shotgun.2.fq.gz) 118 | [3] sample<Tab>2bsingle.fq(.gz or 2bsingle.1.fq.gz) 119 | [4] sample1<Tab>sample2<Tab>sample3<Tab>sample4<Tab>sample5<Tab>R1.fq(.gz)<Tab>R2.fq(.gz) 120 | -d The working path of 2B-Tag-DB. 121 | -o The output directory (if it doesn't exist, will be created automatically as 'outdir'). 122 | OPTIONS of Qualitative Analysis 123 | -p If qualitative analysis applies or not [default: yes] (yes or no) 124 | -s1 The enzymatic site(s) for the qualitative analysis. One or more sites can be specified (comma separated) [default: 5] 125 | It represents which enzymatic recognition site(s) will be used for digital restriction digestion, and constructing 2b-Tag-DB for the following qualitative analysis and quantitative analysis. 126 | [1]CspCI [5]BcgI [9]BplI [13]CjePI [17]AllEnzyme 127 | [2]AloI [6]CjeI [10]FalI [14]Hin4I 128 | [3]BsaXI [7]PpiI [11]Bsp24I [15]AlfI 129 | [4]BaeI [8]PsrI [12]HaeIV [16]BslFI 130 | -t1 The taxonomic rank for 2bRAD markers in the qualitative database, which should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. [default: species] 131 | OPTIONS of Quantitative Analysis 132 | -q If quantitative analysis applies or not [default: yes] (yes or no) 133 | -gsc G score threshold for identifying the candidate microbes present in a sample in qualitative analysis, which also determines the membership of sample-specific 2B-Tag-DB in the quantitative analysis step. [default: 5, it means >5] 134 | -gcf The threshold of the 2bRAD tag number for the presence of a microbial genome (i.e., GCF) in the qualitative analysis, which also determines the membership of sample-specific 2B-Tag-DB in the quantitative analysis step. [default: 1, it means >1] 135 | -s2 The enzyme site for the quantitative analysis. (refer to -s1) [default: 5, must be included in para -s1] 136 | -t2 The taxonomic rank for 2bRAD markers in the quantitative analysis, which should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. [default: species] 137 | OPTIONS of CPU 138 | -c1 The number of CPUs used for parallelizing the digital digestion step for multiple samples. [default: 10] 139 | -c2 The number of CPUs used for parallelizing abundance profiling for multiple samples based on a single enzyme and combining results from the multiple enzymes set via -s1. [default: 8] (each CPU needs about 15~65G of memory) 140 | OPTIONS of Quality Control 141 | -qc If quality control applies or not. [default: yes] (yes or no) 142 | -qcn The maximum ratio of base "N". [default: 0.08] 143 | -qcq The minimum quality score to keep. [default: 30] 144 | -qcp The minimum percentage of bases that must have [-qcq] quality. [default: 80] 145 | -qcb ASCII+33 or ASCII+64 quality scores as Phred scores [default: 33] 146 | OPTIONS of Merging profiles 147 | -ms The mock-community sample name(s) (separated by commas). The specified samples will be removed from the merged output table. 148 | -ncs The sample name(s) (separated by commas) of negative control that can be used for filtering potential contaminations. 149 | -h|help Print this help information. 150 | 151 | ```
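For example, a minimal sample list for `-t 3` (one 2bRAD fastq file per sample) could look like the following; the sample IDs and file paths here are hypothetical placeholders, and the two columns are separated by a single tab:

```
#sample_id	sequence_file
sampleA	/path/to/sampleA.fq.gz
sampleB	/path/to/sampleB.fq.gz
```

Because lines starting with `#` are ignored, the header line above is optional. The `list_simulation` and `list_mock` files downloaded along with the 2b-Tag-DB follow this same two-column layout.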
152 | 153 | ### **Example data** 154 | 155 | * **Analyze an in silico mock community** (synthetic data: `simulate_50.BcgI.fq.gz`) To test the generalizability of our 2bRAD markers for microbial profiling, we designed a mock microbiome structure containing 50 microbial species from a wide range of habitats such as oral, gut and soil environments. Given a specified abundance profile, we simulated the sequencing data based on all related genomes using [wgsim](https://github.com/lh3/wgsim). The sequence data file `simulate_50.BcgI.fq.gz` and its corresponding list file `list_simulation` will be automatically downloaded to `$your_database_path` via the download script (e.g., `tools/Download_2bRADTagDB_NCBI.pl`) as described above. Once all of these are downloaded, you can run the following command, which will output the estimated microbial profile. 156 | 157 | ``` 158 | perl bin/2bRADM_Pipline.pl \ 159 | -t 3 \ 160 | -l $your_database_path/list_simulation \ 161 | -d $your_database_path/ \ 162 | -o outdir \ 163 | -gsc 60 \ 164 | -qc no 165 | ``` 166 | 167 | * **Analyze a mock microbial community: MSA1002** (sequencing data: `MSA1002_R1.fq.gz`) 168 | [MSA1002](https://www.atcc.org/en/Global/Products/MSA-1002.aspx) comprises the genomic material from 20 microbial strains that are evenly mixed. We sequenced this DNA sample using our 2bRAD protocol for optimizing and testing the bioinformatic pipeline. The sequencing data file `MSA1002_R1.fq.gz` and its corresponding list file `list_mock` will be automatically downloaded to `$your_database_path` via the download script (e.g., `tools/Download_2bRADTagDB_NCBI.pl`) as described above. Once all of these are downloaded, you can run the following command, which will output the estimated microbial profile. 169 | 170 | ``` 171 | perl bin/2bRADM_Pipline.pl \ 172 | -t 3 \ 173 | -l $your_database_path/list_mock \ 174 | -d $your_database_path/ \ 175 | -o outdir 176 | ``` 177 | 178 | ### **Output formats** 179 | 180 | 2bRAD-M offers a standard format for sample-wise results. You can find this standard profiling result of a single sample at `$outdir/quantitative/$sample_id.combine.xls`. Taking the `MSA1002` analysis as an example, the output is located at `outdir/quantitative/MSA1002.combine.xls`. The 2bRAD-M standard sample report format is tab-delimited with one line per taxon. The fields of the output, from left to right, are as follows: 181 | 182 | 1 to 7 - The taxonomic ranks for a microbial taxon identified: 1 - "Kingdom"; 2 - "Phylum"; 3 - "Class"; 4 - "Order"; 5 - "Family"; 6 - "Genus"; 7 - "Species" 183 | 8 - "Theoretical_Tag_Num": Average number of all 2bRAD marker tags of genomes under this taxon in theory 184 | 9 - "Sequenced_Tag_Num": Number of 2bRAD marker tags detected in the sequencing data under this taxon 185 | 10 - "Percent": The percent of sequenced 2bRAD marker tags under this taxon 186 | 11 - "Sequenced_Reads_Num": Total number of sequenced reads 187 | 12 - "Sequenced_Reads_Num/Theoretical_Tag_Num": The ratio of "Sequenced_Reads_Num" and "Theoretical_Tag_Num", which is further used for calculating the "relative abundance" of this taxon within a sample via a normalization by the column-wise sum 188 | 13 - "Sequenced_Reads_Num/Sequenced_Tag_Num": The ratio of "Sequenced_Reads_Num" and "Sequenced_Tag_Num" 189 | 14 - "Sequenced_Tag_Num(depth>1)": Number of sequenced tags that have >1 sequencing coverage 190 | 15 - "G_Score": the geometric mean of "Sequenced_Reads_Num" and "Sequenced_Tag_Num" (i.e., the square root of their product), which is used for controlling false positive discovery 191 | 192 | 193 | 2bRAD-M also offers a standard format for the study-wise result. If you provide multiple sample IDs and corresponding fasta/fastq file names in the list file, this pipeline can automatically merge the abundance profiling results from multiple samples into one feature table, which is located at `$outdir/quantitative/Abundance_Stat.all.xls`. If you set up negative-control samples for filtering potential contaminations in biological samples, you can find the filtered abundance profiles in `$outdir/quantitative/Abundance_Stat.filtered.xls`. Otherwise, these two files should be identical. The standard study report format is also tab-delimited with one line per taxon. The fields of the output, from left to right, are as follows: 194 | 195 | 1 to 7 - The taxonomic ranks for a microbial taxon identified: 1 - "Kingdom"; 2 - "Phylum"; 3 - "Class"; 4 - "Order"; 5 - "Family"; 6 - "Genus"; 7 - "Species" 196 | 8 to N - The column name indicates a sample ID in this study, where you can find the relative abundances of taxa within this sample. `N = (the number of samples) + 7` 197 | 198 | 
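To illustrate how these fields can be used, here is a minimal Perl sketch (not part of the pipeline; the script name and the assumption that header or comment lines start with `#` are ours) that reads a per-sample `*.combine.xls` report and recomputes relative abundances from column 12 by normalizing with the column-wise sum, as described above:

```perl
#!/usr/bin/env perl
# Minimal sketch: recompute per-sample relative abundances from a *.combine.xls report.
# Assumes the tab-delimited field order documented above (column 7 = Species,
# column 12 = Sequenced_Reads_Num/Theoretical_Tag_Num) and '#'-prefixed header lines.
use strict;
use warnings;

my $report = shift @ARGV or die "Usage: perl summarize_combine.pl sample.combine.xls\n";
open my $fh, '<', $report or die "cannot open $report\n";

my (%ratio, $sum);
while (<$fh>) {
    next if /^#/ || /^$/;                 # skip header/comment and blank lines
    chomp;
    my @f = split /\t/;
    my ($species, $r) = ($f[6], $f[11]);  # 0-based indices of columns 7 and 12
    $ratio{$species} += $r;
    $sum            += $r;
}
close $fh;
die "no taxa parsed from $report\n" unless $sum;

# Relative abundance = column 12 normalized by its column-wise sum
for my $species (sort { $ratio{$b} <=> $ratio{$a} } keys %ratio) {
    printf "%s\t%.6f\n", $species, $ratio{$species} / $sum;
}
```

The values printed this way should correspond, up to rounding, to the relative abundances reported for that sample in the merged `Abundance_Stat.all.xls` table.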
199 | ## 2bRAD-M scripts for customized analyses 200 | * [Extract 2b tags](scripts/2bRADExtraction.pl) This script conducts digital type-2B-restriction digestion of DNA data generated by a wide range of sequencing protocols by one of 16 restriction enzymes. For a given type 2b restriction enzyme, it can return a Fasta file including the resulting 2b-RAD tags, and a statistical summary including the raw number of input sequences, the restriction enzyme used, the number of restriction fragments produced, and the percentage of restriction fragments over the whole (meta)genome data. 201 | * [Build your own customized 2b-Tag-DB](scripts/CreateQualDatabase_2bRAD.pl) This script constructs the taxa-specific 2b-RAD reference genome database from a whole-genome reference database. 202 | * [Species profiling for a single sample based on 2bRAD markers of one or multiple enzymes](scripts/CalculateRelativeAbundance_Combined2bEnzymes.pl) This script computes the relative abundance of taxa identified from each of the 2b-RAD samples using a precalculated taxa-specific 2b-RAD reference database by one or multiple type 2b restriction enzymes. 
203 | * [Species profiling for a single sample based on 2bRAD markers of multiple enzymes](scripts/CalculateRelativeAbundance_Single2bEnzymes.pl) This script computes the relative abundance of taxa identified from each of 2b-RAD samples using a precalcuated taxa-specific 2b-RAD reference database by a single type 2b restriction enzyme. 204 | * [Merge species profiles for multiple samples](scripts/MergeProfilesFromMultipleSamples.pl) This script can merge the abundance profiles from mulitple samples and filter potential contaminations in each biological sample using negative control samples. 205 | 206 | ## Reference 207 | * Wang S, Meyer E, McKay JK, Matz MV. 2b-RAD: a simple and flexible method for genome-wide genotyping. Nat Methods. 2012 May 20;9(8):808-10. doi: 10.1038/nmeth.2023. PMID: 22609625. 208 | * Wang S, Liu P, Lv J, Li Y, Cheng T, Zhang L, Xia Y, Sun H, Hu X, Bao Z. Serial sequencing of isolength RAD tags for cost-efficient genome-wide profiling of genetic and epigenetic variations. Nat Protoc. 2016 Nov;11(11):2189-2200. doi: 10.1038/nprot.2016.133. Epub 2016 Oct 6. PMID: 27711051. 209 | 210 | ## Acknowledgement 211 | 212 | This work was funded by Grant 31800088 from National Natural Science Foundation and 2019M652501 from China Postdoctoral Science Foundation, and Taishan Scholar Fund of Shandong Province of China. 213 | 214 | -------------------------------------------------------------------------------- /bin/2bRADM_Pipline.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Authors: Zheng Sun , Rongchao Zhang, Shi Huang 3 | 4 | use warnings; 5 | use strict; 6 | use Getopt::Long; 7 | use FindBin qw($Bin); 8 | use File::Basename qw(dirname basename); 9 | use Parallel::ForkManager; 10 | use Cwd 'abs_path'; 11 | 12 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 13 | my $time="20210102"; 14 | 15 | #scripts path 16 | my $Bin="$Bin/../scripts/"; 17 | 18 | #定性参数 19 | my $qual ||="yes";#是否进行定性 20 | my $site1 ||=5;#酶切位点 21 | my $level1 ||="species";#水平 22 | 23 | #定量参数 24 | my $quan ||="yes";#是否进行定量 25 | my $g_score_threshold ||=5;#对定性的合并结果,进行分类筛选 gscore阈值 26 | my $GCF_threshold ||=1;#鉴定到某个基因组几个标签以上,该基因组才会被纳入定量建库 27 | my $site2 ||=5;#酶切位点 28 | my $level2 ||="species";#水平 29 | 30 | #数据酶切,基因组酶切CPU 31 | my $cpu1 ||=10; 32 | #多酶定性cpu 33 | my $cpu2 ||=8; 34 | 35 | #数据质控参数 36 | my $qc ||="yes";#是否进行质控 37 | my $qc_n ||=0.08; 38 | my $qc_q ||=30; 39 | my $qc_p ||=80; 40 | my $qc_b ||=33; 41 | 42 | my $mock_sample=""; 43 | my $negative_control_sample=""; 44 | 45 | select STDOUT;$|=1;#标准输出清楚缓存 46 | 47 | my($list,$type,$database,$outdir,$help); 48 | GetOptions( 49 | "t:i" => \$type, 50 | "l:s" => \$list, 51 | "d:s" => \$database, 52 | "o:s" => \$outdir, 53 | #初步定性 54 | "p:s" => \$qual, 55 | "s1:s" => \$site1, 56 | "t1:s" => \$level1, 57 | 58 | #精细定量 59 | "q:s" => \$quan, 60 | "gsc:i" => \$g_score_threshold,#筛选分类 61 | "gcf:i" => \$GCF_threshold,#筛选分类中的基因组 62 | "s2:s" => \$site2, 63 | "t2:s" => \$level2, 64 | 65 | #cpu 66 | "c1:i" => \$cpu1, 67 | "c2:i" => \$cpu2, 68 | 69 | #质控参数 70 | "qc:s" => \$qc, 71 | "qcn:f" => \$qc_n, 72 | "qcq:i" => \$qc_q, 73 | "qcp:i" => \$qc_p, 74 | "qcb:i" => \$qc_b, 75 | 76 | #丰度结果过滤 77 | "ms:s" => \$mock_sample, 78 | "ncs:s" => \$negative_control_sample, 79 | 80 | "h|help:s" => \$help, 81 | ); 82 | 83 | sub usage{#帮助 84 | print STDERR "\e[;33;1m 85 | DESCRIPTION 86 | We here provided a streamlined 2bRAD pipeline for analyzing microbial compositions from the 2bRAD/shotgun metagenomics data based on 
the species-specific 2bRAD markers. 87 | USAGE 88 | perl $0 89 | PARAMETERS 90 | -t The acceptable types of an input sequencing data file. The file path should be also listed in the sample list file (para -l) 91 | [1] generic genome data in a fasta format 92 | [2] shotgun metagenomic data in a fastq format(either SE or PE platform is accepted) 93 | [3] 2bRAD data from a SE sequencing platform in a fastq format 94 | [4] 2bRAD data from a PE sequencing platform in a fastq format 95 | -l The filepath of the sample list. Each line includes an input sample ID and the file path of corresponding DNA sequence data where each field should be separated by . A line in this file that begins with # will be ignored. Only four formats of a sample list file are accepted and should match with parameter -t: 96 | [1] samplesample.fa(.gz) 97 | [2] sampleshotgun.1.fq(.gz)(shotgun.2.fq.gz) 98 | [3] sample2bsingle.fq(.gz or 2bsingle.1.fq.gz) 99 | [4] sample1sample2sample3sample4sample5R1.fq(.gz)R2.fq(.gz) 100 | -d The working path of 2B-Tag-DB 101 | -o The output directory (if it doesn't exist, will be created automatically as 'outdir') 102 | OPTIONS of Qualitative Analysis 103 | -p If qualitative analysis applies or not [default: $qual] (yes or no) 104 | -s1 The enzymatic site(s) for the qualitative analysis. One or more sites can be specified(comma separated) [default: $site1] 105 | It represents which enzymatic recognition site(s) will be used for digital restriction digestion, and contructing 2b-Tag-DB for the following qualitative analysis and quantitative analysis. 106 | [1]CspCI [5]BcgI [9]BplI [13]CjePI [17]AllEnzyme 107 | [2]AloI [6]CjeI [10]FalI [14]Hin4I 108 | [3]BsaXI [7]PpiI [11]Bsp24I [15]AlfI 109 | [4]BaeI [8]PsrI [12]HaeIV [16]BslFI 110 | -t1 The taxonomic level for 2bRAD markers in the qualitative database, which should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. [default: $level1] 111 | OPTIONS of Quantitative Analysis 112 | -q If quantitative analysis applies or not [default: $quan] (yes or no) 113 | -gsc G score threshold for identifying the condidate microbes present in a sample in qualitative analysis, which also determines the membership of sample-specific 2B-Tag-DB in the quantitative analysis step. [default: $g_score_threshold, it means >$g_score_threshold] 114 | -gcf The threshold of the 2bRAD tag number for the presence of a microbial genome (i.e., GCF) in the qualitative analysis, which also determines the membership of sample-specific 2B-Tag-DB database in the quantitative analysis step. [default: $GCF_threshold, it means >$GCF_threshold] 115 | -s2 The enzymatic site for the quantitative analysis. (refer to -s1) [default: $site2, must be included in para -s1] 116 | -t2 The taxonomic level for 2bRAD markers in the quantitative analysis, which should be one of the following: kingdom,phylum,class,order,family,genus,species,strain. [default: $level2] 117 | OPTIONS of CPU 118 | -c1 The number of CPUs used for parallelizing the digital digestion step for multiple samples. [default: 10] 119 | -c2 The number of CPUs used for parallelizing abundance profiling for multiple samples based on a single enzyme and combining results from multiple enzymes have been set via -s1. 
[default: 8] (each CPU needs about 15~65G of memory) 120 | OPTIONS of Quality Control 121 | -qc If quality control apply or not [default: $qc] (yes or no) 122 | -qcn The maximum ratio of base \"N\" [default: $qc_n] 123 | -qcq The minimum quality score to keep [default: $qc_q] 124 | -qcp The minimum percentage of bases that must have [-qcq] quality [default: $qc_p] 125 | -qcb ASCII+33 or ASCII+64 quality scores as Phred scores [default: $qc_b] 126 | OPTIONS of Merging profiles 127 | -ms The mock-community sample name(s) (separated by commas). The specified samples will be removed from the merged output table. 128 | -ncs The sample name(s) (separated by commas) of negative control that can be used for filtering potential contaminations. 129 | -h | help Print this help information. 130 | 131 | AUTHOR: $author $time\e[0m\n"; 132 | } 133 | 134 | if(defined($help)){ 135 | &usage; 136 | exit 0; 137 | } 138 | 139 | 140 | my %hs_site2enzyme=(#酶切位点对应表 141 | '1' => 'CspCI', '2' => 'AloI', 142 | '3' => 'BsaXI', '4' => 'BaeI', 143 | '5' => 'BcgI', '6' => 'CjeI', 144 | '7' => 'PpiI', '8' => 'PsrI', 145 | '9' => 'BplI', '10' => 'FalI', 146 | '11' => 'Bsp24I', '12' => 'HaeIV', 147 | '13' => 'CjePI', '14' => 'Hin4I', 148 | '15' => 'AlfI', '16' => 'BslFI', 149 | ); 150 | #参数检测 151 | unless($type && $list && $database && $outdir){ 152 | &usage; 153 | print STDERR "Parameter -t -l -d or -o error.\n"; 154 | exit 1; 155 | } 156 | 157 | #转化为绝对路径 158 | $list=abs_path($list); 159 | $database=abs_path($database); 160 | $outdir=abs_path($outdir); 161 | 162 | #输入数据类型检测 163 | unless($type==1 || $type==2 || $type==3 || $type==4){ 164 | &usage; 165 | print STDERR "Parameter -t is wrong."; 166 | exit 1; 167 | } 168 | 169 | #数据库文件检测 170 | unless(-e "$database/abfh_classify_with_speciename.txt.gz"){ 171 | print STDERR "incomplete database, $database/abfh_classify_with_speciename.txt.gz does not exists\n"; 172 | exit 1; 173 | } 174 | 175 | #定性参数检测 176 | unless($qual eq "no" || $qual eq "yes"){ 177 | &usage; 178 | print STDERR "Parameter -p is wrong. Cannot get $qual\n"; 179 | exit 1; 180 | } 181 | #定性鉴定水平检测 182 | unless($level1 eq "kingdom" || $level1 eq "phylum" || $level1 eq "class" || $level1 eq "order" || $level1 eq "family" || $level1 eq "genus" || $level1 eq "species" || $level1 eq "strain"){ 183 | &usage; 184 | print STDERR "Parameter -t1 is wrong. Cannot get $level1\n"; 185 | exit 1; 186 | } 187 | #定性酶切位点检查及数据库检测 188 | my %hs_site1; 189 | if($site1=~/17/){ 190 | $site1="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"; 191 | } 192 | my @site1=split /,/,$site1; 193 | for my $i(@site1){ 194 | $hs_site1{$i}++; 195 | unless(exists $hs_site2enzyme{$i}){ 196 | &usage; 197 | print STDERR "parameter -s1 is wrong, $i does not exists\n"; 198 | exit 1; 199 | } 200 | #检测定性数据库并检测species数据库,用于定量(findgenome脚本) 201 | unless(-e "$database/$hs_site2enzyme{$i}.$level1.fa.gz" && -e "$database/$hs_site2enzyme{$i}.species.fa.gz"){ 202 | &usage; 203 | print STDERR "incomplete database, $database/$hs_site2enzyme{$i}.$level1.fa.gz or $database/$hs_site2enzyme{$i}.species.fa.gz does not exists\n"; 204 | exit 1; 205 | } 206 | } 207 | 208 | #定量参数检测 209 | #定性鉴定水平检测 210 | unless($level2 eq "kingdom" || $level2 eq "phylum" || $level2 eq "class" || $level2 eq "order" || $level2 eq "family" || $level2 eq "genus" || $level2 eq "species" || $level2 eq "strain"){ 211 | &usage; 212 | print STDERR "Parameter -t2 is wrong. 
Cannot get $level2\n"; 213 | exit 1; 214 | } 215 | #定性酶切位点检查 216 | my @site2; 217 | if($quan eq "yes"){ 218 | if($site2=~/17/){ 219 | $site2="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"; 220 | } 221 | @site2=split /,/,$site2; 222 | for my $i(@site2){ 223 | unless(exists $hs_site2enzyme{$i}){#检测site2输入是否正确 224 | &usage; 225 | print STDERR "parameter -s2 is wrong, $i does not exists\n"; 226 | exit 1; 227 | } 228 | unless(exists $hs_site1{$i}){#检测site2是否包含于site1 229 | &usage; 230 | print STDERR "parameter -s2 is wrong, $i is not included in para -s1\n"; 231 | exit 1; 232 | } 233 | } 234 | }elsif($quan eq "no"){ 235 | ; 236 | }else{ 237 | &usage; 238 | print STDERR "parameter -q is wrong\n"; 239 | exit 1; 240 | } 241 | 242 | #质控参数检查 243 | unless($qc eq "yes" || $qc eq "no"){ 244 | &usage; 245 | print STDERR "Parameter -qc is wrong. Cannot get $qc\n"; 246 | } 247 | 248 | print STDOUT "###COMMAND: perl $0 -t $type -l $list -d $database -o $outdir -p $qual -s1 $site1 -t1 $level1 -q $quan -gsc $g_score_threshold -gcf $GCF_threshold -s2 $site2 -t2 $level2 -c1 $cpu1 -c2 $cpu2 -qc $qc -qcn $qc_n -qcq $qc_q -qcp $qc_p -qcb $qc_b -ms $mock_sample -ncs $negative_control_sample\n"; 249 | &CheckDir($outdir); 250 | #数据酶切 251 | &CheckDir("$outdir/enzyme_result"); 252 | print STDOUT "###Electronic digestion started, ",`date`; 253 | open LIST,"$list" or die "cannot open $list\n"; 254 | my $pm=new Parallel::ForkManager($cpu1); #多线程 255 | while(){ 256 | my $line=$_; 257 | if(/^#/ || /^$/){;}else{#去除注释行和空行 258 | chomp($line); 259 | my @tmp=split /\t/,$line; 260 | for my $i(@site1){ 261 | my $pid=$pm->start and next; 262 | if($type!=4){#除2brad五标签之外其他数据处理 263 | unless(-e "$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz"){ 264 | &execute("perl $Bin/2bRADExtraction.pl -i @tmp[1..$#tmp] -t $type -s $i -od $outdir/enzyme_result -op $tmp[0] -gz yes -qc $qc -n $qc_n -q $qc_q -p $qc_p -b $qc_b -fm fa 1>/dev/null"); 265 | `sleep 1s`; 266 | }else{ 267 | print STDOUT "$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz already exists, skip.\n"; 268 | `sleep 1s`; 269 | } 270 | }else{#2brad五标签处理 271 | unless(-e "$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz" && 272 | -e "$outdir/enzyme_result/$tmp[1].$hs_site2enzyme{$i}.fa.gz" && 273 | -e "$outdir/enzyme_result/$tmp[2].$hs_site2enzyme{$i}.fa.gz" && 274 | -e "$outdir/enzyme_result/$tmp[3].$hs_site2enzyme{$i}.fa.gz" && 275 | -e "$outdir/enzyme_result/$tmp[4].$hs_site2enzyme{$i}.fa.gz" ){ 276 | &execute("perl $Bin/2bRADExtraction.pl -i $tmp[-2] $tmp[-1] -t $type -s $i -od $outdir/enzyme_result -op @tmp[0..4] -gz yes -qc $qc -n $qc_n -q $qc_q -p $qc_p -b $qc_b -fm fa 1>/dev/null"); 277 | }else{ 278 | print STDOUT "$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz && "; 279 | print STDOUT "$outdir/enzyme_result/$tmp[1].$hs_site2enzyme{$i}.fa.gz && "; 280 | print STDOUT "$outdir/enzyme_result/$tmp[2].$hs_site2enzyme{$i}.fa.gz && "; 281 | print STDOUT "$outdir/enzyme_result/$tmp[3].$hs_site2enzyme{$i}.fa.gz && "; 282 | print STDOUT "$outdir/enzyme_result/$tmp[4].$hs_site2enzyme{$i}.fa.gz already exist, skip\n"; 283 | `sleep 1s`; 284 | } 285 | } 286 | $pm->finish; 287 | } 288 | } 289 | } 290 | $pm->wait_all_children; 291 | close LIST; 292 | print STDOUT "###Electronic digestion completed, ",`date`; 293 | 294 | 295 | ##整理列表 296 | &CheckDir("$outdir/list"); 297 | for my $i(@site1){ 298 | open LIST,"$list" or die "cannot open $list\n"; 299 | open OU,">$outdir/list/$hs_site2enzyme{$i}.list" or die "cannot open $outdir/list/$hs_site2enzyme{$i}.list\n"; 300 | 
while(){ 301 | next if(/^#/ || /^$/); 302 | chomp; 303 | my @tmp=split /\t/; 304 | if($type!=4){#除2brad五标签之外其他数据处理 305 | print OU "$tmp[0]\t$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz\n"; 306 | }else{#2brad五标签处理 307 | print OU "$tmp[0]\t$outdir/enzyme_result/$tmp[0].$hs_site2enzyme{$i}.fa.gz\n"; 308 | print OU "$tmp[1]\t$outdir/enzyme_result/$tmp[1].$hs_site2enzyme{$i}.fa.gz\n"; 309 | print OU "$tmp[2]\t$outdir/enzyme_result/$tmp[2].$hs_site2enzyme{$i}.fa.gz\n"; 310 | print OU "$tmp[3]\t$outdir/enzyme_result/$tmp[3].$hs_site2enzyme{$i}.fa.gz\n"; 311 | print OU "$tmp[4]\t$outdir/enzyme_result/$tmp[4].$hs_site2enzyme{$i}.fa.gz\n"; 312 | } 313 | } 314 | close LIST; 315 | close OU; 316 | } 317 | if($type==4){#2brad五标签样品名格式行转化成列 318 | open LIST,"$list" or die "cannot open $list\n"; 319 | open OULI,">$outdir/list/2brad_5tag.list" or die "cannot open $outdir/list/2brad_5tag.list\n"; 320 | while(){ 321 | next if(/^#/ || /^$/);#去除注释行和空行 322 | chomp; 323 | my @tmp=split /\t/; 324 | for my $i(0..4){ 325 | print OULI "$tmp[$i]\t$outdir/quantitative/$tmp[$i]/$tmp[$i].combine.xls\n"; 326 | } 327 | } 328 | close LIST; 329 | close OULI; 330 | }else{ 331 | open LIST,"$list" or die "cannot open $list\n"; 332 | open OULI,">$outdir/list/Abundance_Stat.list" or die "cannot open $outdir/list/Abundance_Stat.list\n"; 333 | while(){ 334 | next if(/^#/ || /^$/);#去除注释行和空行 335 | chomp; 336 | my @tmp=split /\t/; 337 | print OULI "$tmp[0]\t$outdir/quantitative/$tmp[0]/$tmp[0].combine.xls\n"; 338 | } 339 | close LIST; 340 | close OULI; 341 | } 342 | 343 | ##多线程初步定性 344 | if($qual eq "yes"){#是否需要定性 345 | print STDOUT "###Qualitative analysis started, ",`date`; 346 | &CheckDir("$outdir/qualitative"); 347 | $pm=new Parallel::ForkManager($cpu2); 348 | for my $i(@site1){ 349 | my $pid=$pm->start and next; 350 | &execute("perl $Bin/CalculateRelativeAbundance_Single2bEnzyme.pl -l $outdir/list/$hs_site2enzyme{$i}.list -d $database/ -t $level1 -s $i -o $outdir/qualitative -g 0 -v yes 1> /dev/null");#未对G_score过滤 351 | `sleep 1s`; 352 | $pm->finish; 353 | } 354 | $pm->wait_all_children; 355 | 356 | ##定性结果合并 357 | if($type!=4){#除2brad五标签之外其他数据处理 358 | &execute("perl $Bin/CalculateRelativeAbundance_Combined2bEnzymes.pl -l $list -s $site1 -io $outdir/qualitative -m combine -g 0");#未对G_score过滤 359 | }else{#2brad五标签处理 360 | &execute("perl $Bin/CalculateRelativeAbundance_Combined2bEnzymes.pl -l $outdir/list/2brad_5tag.list -s $site1 -io $outdir/qualitative -m combine -g 0");#未对G_score过滤 361 | } 362 | print STDOUT "###Qualitative completed, ",`date`; 363 | }else{ 364 | if($quan eq "no"){ 365 | print STDOUT "All Done, ",`date`; 366 | exit 0; 367 | } 368 | } 369 | 370 | 371 | if($quan eq "no"){ 372 | print STDOUT "All Done, ",`date`; 373 | exit 0; 374 | } 375 | #精细定量 376 | print STDOUT "###Quantitative analysis started, ",`date`; 377 | 378 | &CheckDir("$outdir/quantitative_sdb"); 379 | &CheckDir("$outdir/quantitative"); 380 | 381 | #FindGenome_ByQualitative 382 | if($type!=4){#除2brad五标签之外其他数据处理 383 | &execute("perl $Bin/FindGenome_ByQualitative.pl -l $list -d $database -o $outdir/quantitative_sdb -qualdir $outdir/qualitative -gscore $g_score_threshold -gcf $GCF_threshold"); 384 | }else{#2brad五标签处理 385 | &execute("perl $Bin/FindGenome_ByQualitative.pl -l $outdir/list/2brad_5tag.list -d $database -o $outdir/quantitative_sdb -qualdir $outdir/qualitative -gscore $g_score_threshold -gcf $GCF_threshold"); 386 | } 387 | 388 | if($type!=4){#除2brad五标签之外其他数据处理 389 | open LIST,"$list" or die "cannot open $list\n"; 390 | }else{#2brad五标签处理 
391 | open LIST,"$outdir/list/2brad_5tag.list" or die "cannot open $outdir/list/2brad_5tag.list\n"; 392 | } 393 | $pm=new Parallel::ForkManager($cpu2);#样品间酶切位点间多线程并行 394 | my $rm; 395 | while(){ 396 | my $line=$_; 397 | if(/^#/ || /^$/){;}else{#去除注释行和空行 398 | chomp($line); 399 | my $sample_name=(split /\t/,$line)[0]; 400 | $rm .=" $outdir/quantitative_sdb/$sample_name/database "; 401 | print STDOUT "Analyze $sample_name, ",`date`; 402 | &CheckDir("$outdir/quantitative_sdb/$sample_name/database"); 403 | &execute("cp $outdir/quantitative_sdb/$sample_name/sdb.list $outdir/quantitative_sdb/$sample_name/database/abfh_classify_with_speciename.txt && gzip -f $outdir/quantitative_sdb/$sample_name/database/abfh_classify_with_speciename.txt"); 404 | #精细定量开始 405 | for my $i(@site2){ 406 | my $pid=$pm->start and next; 407 | open SA,">$outdir/quantitative_sdb/$sample_name/$hs_site2enzyme{$i}.list" or die "cannot open $outdir/quantitative_sdb/$sample_name/$hs_site2enzyme{$i}.list\n"; 408 | print SA "$sample_name\t$outdir/enzyme_result/$sample_name.$hs_site2enzyme{$i}.fa.gz\n"; 409 | close SA; 410 | if(-e "$outdir/quantitative_sdb/$sample_name/sdb.list"){#检测sdb.list文件 411 | my $file_wc="$outdir/quantitative_sdb/$sample_name/sdb.list"; 412 | my @wc_l=split /\s+/,`wc -l $file_wc`;#检测sdb.list文件 基因组行数 413 | if($wc_l[0]!=0){#list中有基因组 414 | &execute("perl $Bin/CreateQuanDatabase_2bRAD.pl -l $outdir/quantitative_sdb/$sample_name/sdb.list -e $database/$hs_site2enzyme{$i}.species.fa.gz -s $i -t $level2 -o $outdir/quantitative_sdb/$sample_name/database -r no 1> /dev/null");#建库 415 | &execute("perl $Bin/CalculateRelativeAbundance_Single2bEnzyme.pl -l $outdir/quantitative_sdb/$sample_name/$hs_site2enzyme{$i}.list -d $outdir/quantitative_sdb/$sample_name/database -t $level2 -s $i -o $outdir/quantitative -g 0 -v yes 1> /dev/null");#定量 不对gscore进行过滤 416 | }else{#list中无基因组 417 | print STDERR "[ERROR] $outdir/quantitative_sdb/$sample_name/sdb.list does not exist or the content is empty, Sample $sample_name could not be quantitatively analyzed.\n"; 418 | } 419 | } 420 | $pm->finish; 421 | } 422 | } 423 | } 424 | $pm->wait_all_children; 425 | close LIST; 426 | &execute("rm -rf $rm");#删除database 427 | 428 | #精细定量结果合并 429 | if($type!=4){#除2brad五标签之外其他数据处理 430 | &execute("perl $Bin/CalculateRelativeAbundance_Combined2bEnzymes.pl -l $list -s $site2 -io $outdir/quantitative -m combine -g 0 1> /dev/null");#不对gscore过滤 431 | }else{#2brad五标签处理 432 | &execute("perl $Bin/CalculateRelativeAbundance_Combined2bEnzymes.pl -l $outdir/list/2brad_5tag.list -s $site2 -io $outdir/quantitative -m combine -g 0 1> /dev/null");#不对gscore过滤 433 | } 434 | print STDOUT "###Quantitative analysis complete, ",`date`; 435 | 436 | print STDOUT "###Merging abundance profiles from multiple samples started, ",`date`; 437 | if($type!=4){#除2brad五标签之外其他数据处理 438 | &execute("perl $Bin/MergeProfilesFromMultipleSamples.pl -l $outdir/list/Abundance_Stat.list -o $outdir/quantitative -p Abundance_Stat -m $mock_sample -c $negative_control_sample"); 439 | }else{#2brad五标签处理 440 | &execute("perl $Bin/MergeProfilesFromMultipleSamples.pl -l $outdir/list/2brad_5tag.list -o $outdir/quantitative -p Abundance_Stat -m $mock_sample -c $negative_control_sample"); 441 | } 442 | 443 | print STDOUT "###Merging abundance profiles completed, ",`date`; 444 | 445 | print STDOUT "All Done, ",`date`; 446 | 447 | sub CheckDir{#创建目录 448 | my $file = shift; 449 | unless( -d $file ){ 450 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 451 | else{print STDERR "$file not 
exists and cannot be built\n";exit 1;} 452 | } 453 | return 1; 454 | } 455 | 456 | 457 | sub execute{#打出命令并执行 458 | my $cmd = shift; 459 | print STDOUT "$cmd\n"; 460 | my $exit_code=system($cmd); 461 | if($exit_code!=0){ 462 | print STDERR "Command $cmd failed with an exit code of $exit_code.\n"; 463 | exit($exit_code >> 8); 464 | } 465 | } 466 | -------------------------------------------------------------------------------- /scripts/2bRADExtraction.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Authors: Zheng Sun, Rongchao Zhang, Shi Huang 3 | # Last update: 2020.06.03 4 | use strict; 5 | use warnings; 6 | use Getopt::Long; 7 | use FindBin qw($Bin); 8 | use File::Basename qw(dirname basename); 9 | no strict 'refs'; 10 | 11 | my $author="Zheng Sun, Rongchao Zhang, Shi Huang"; 12 | my $time="2020.06.03"; 13 | 14 | #set default parameters 15 | my $ncount ||=0.08; 16 | my $quality ||=30; 17 | my $percent ||=80; 18 | my $qbase ||=33; 19 | my $format ||="fa"; 20 | my $gz ||="yes"; 21 | my $q_control ||="yes"; 22 | my $pear_cpu ||=1; 23 | #set software path 24 | my $pear ||="pear"; 25 | 26 | select STDOUT;$|=1;# Standard output for clearing cache 27 | 28 | my (@input,$type,$site,$outdir,@outprefix); 29 | my $help; 30 | GetOptions( 31 | "i:s{1,2}" => \@input,# single-end or double-end reads 32 | "t:i" => \$type, #fa:reference genome data;fq:shotgun metagenomics data,single 2b-RAD tags,five concatenated 2b-RAD tags 33 | "s:i" => \$site, # restriction sites 34 | "od:s" => \$outdir, # output directory 35 | "op:s{1,5}" => \@outprefix, # output prefix 36 | 37 | "gz:s" => \$gz, # do compression or not for the outputs 38 | 39 | "qc:s" => \$q_control, # QC: Whether QC is required 40 | "n:f" => \$ncount, # QC: The # of ambiguity nucleotides N allowed 41 | "q:i" => \$quality, # QC: Minimum per-base quality score (XX) 42 | "p:i" => \$percent, # Minimum percentage of bases that must have per-base quality score over [-q] 43 | "b:i" => \$qbase, # Quality values of bases 44 | "fm:s" => \$format, # The output file format: fa/fq 45 | 46 | #software 47 | "pe:s" => \$pear,#pear soft 48 | "pc:i" => \$pear_cpu,#cpu of pear soft 49 | 50 | "h|help:s" => \$help, 51 | ); 52 | 53 | sub usage{ 54 | print STDERR "\e[;33;1m 55 | Description 56 | It performs digital type-2B-restriction disgest of DNA data generated by a wide range of sequencing protocols by one of 16 restriction enzymes.For a given type 2b restriction enzyme, it can return a Fasta file including resulting 2b-RAD tags, and a statistical summary including raw number ofinput sequences, restriction enzyme used, number of restriction fragments produced, percentage of restriction fragments over the whole (meta)genomedata. The four application scenarios of this script are listed as below: 57 | 1.To extract 2b-RAD tags from reference genome(s) data, run: perl EeTt.pl -i genome.fa(.gz) -t 1 -s 1 -od . -op sample 58 | 2.To extract 2b-RAD tags from shotgun metagenomic sequencing data, run: perl EeTt.pl -i shotgun.R1.fq(.gz) (or shotgun.R1.fq.gz shotgun.R2.fq.gz) -t 2 -s 1 -od . -op sample 59 | 3.To extract single 2b-RAD tags from SE or PE sequencing data, run: perl EeTt.pl -i 2b-RADsingle.fq(.gz) (or 2b-RADsingle.R1.fq.gz from PE platform) -t 3 -s 1 -od . -op sample 60 | 4.To split five concatenated 2b-RAD tags from PE sequencing data, run: perl EeTt.pl -i 2b-RAD.R1.fq(.gz) 2b-RAD.R2.fq(.gz) -t 4 -s 1 -od . 
-op sample1 sample2 sample3 sample4 sample5 61 | Usage 62 | perl $0 -i -t -s -od -op [options]* 63 | Required: 64 | -i Input fasta/fastq filepath (.gz supported) 65 | -t The type of input fasta/fastq file 66 | [1] Reference genome data in the Fasta format 67 | [2] Shotgun (meta)genome data in the Fastq format 68 | [3] SE/PE sequencing data in the Fastq format 69 | [4] PE sequencing data in the Fastq format 70 | -s One of the type 2b restriction enzymes (sites). 71 | [1]CspCI [9]BplI 72 | [2]AloI [10]FalI 73 | [3]BsaXI [11]Bsp24I 74 | [4]BaeI [12]HaeIV 75 | [5]BcgI [13]CjePI 76 | [6]CjeI [14]Hin4I 77 | [7]PpiI [15]AlfI 78 | [8]PsrI [16]BslFI 79 | -od The output directory (automatically create if it does not exist) 80 | -op The output prefix (recommended: sample name(s)) 81 | Optional: 82 | -gz Whether the output file is compressed (yes or no) [$gz] 83 | -h|help print this help 84 | Optional (only applicable when -t equals 2, or 3, or 4, i.e. taking fastq data as input): 85 | -qc Whether quality control is required (yes or no) [$q_control] 86 | -n Maximum percentage of ambiguity bases \"N\" [default: $ncount] 87 | -q Minimum per-base quality score [default: $quality] 88 | -p Minimum percentage of bases that must have per-base quality score over [-q] [default: $percent] 89 | -b Phred quality score type [default: $qbase] 90 | -fm Output file format (fa or fq) [default: $format] 91 | -pe Path of pear soft [$pear] 92 | -pc Cpu of pear soft [$pear_cpu] 93 | Author $author $time\e[0m\n"; 94 | } 95 | 96 | if(defined($help)){ 97 | &usage; 98 | exit 0; 99 | } 100 | 101 | unless(@input && $type && $site && $outdir && @outprefix){# parameters checking 102 | &usage; 103 | print STDERR "Please check parameter -i -t -s -od -op\n"; 104 | exit 1; 105 | } 106 | 107 | 108 | 109 | # checking input args 110 | unless($gz eq "yes" || $gz eq "no"){ 111 | &usage; 112 | print STDERR "Parameter -gz is wrong\n"; 113 | exit 1; 114 | } 115 | unless($format eq "fa" || $format eq "fq"){ 116 | &usage; 117 | print STDERR "Parameter -fm is wrong\n"; 118 | exit 1; 119 | } 120 | unless($q_control eq "yes" || $q_control eq "no"){ 121 | &usage; 122 | print STDERR "Parameter -qc is wrong\n"; 123 | exit 1; 124 | } 125 | 126 | # Define the DNA sequences at a given restriction enzyme site 127 | my ($enzyme,@site,@start,@end,$minpear,$maxpear); 128 | if( 1 == $site ){#CspCI 129 | @site = ( 130 | '[AGCT]{11}CAA[AGCT]{5}GTGG[AGCT]{10}', 131 | '[AGCT]{10}CCAC[AGCT]{5}TTG[AGCT]{11}', 132 | ); 133 | $enzyme="CspCI"; 134 | @start = (0,37,78,119,160); 135 | @end = (41,82,123,164,205); 136 | }elsif( 2 == $site ){#AloI 137 | @site = ( 138 | '[AGCT]{7}GAAC[AGCT]{6}TCC[AGCT]{7}', 139 | '[AGCT]{7}GGA[AGCT]{6}GTTC[AGCT]{7}', 140 | ); 141 | $enzyme="AloI"; 142 | @start = (0,38,80,122,164); 143 | @end = (42,84,126,168,210); 144 | }elsif( 3 == $site ){#BsaXI 145 | @site = ( 146 | '[AGCT]{9}AC[AGCT]{5}CTCC[AGCT]{7}', 147 | '[AGCT]{7}GGAG[AGCT]{5}GT[AGCT]{9}', 148 | ); 149 | $enzyme="BsaXI"; 150 | @start = (0,33,69,105,141); 151 | @end = (35,71,107,143,180); 152 | $minpear ||=173; 153 | $maxpear ||=181; 154 | }elsif( 4 == $site ){#BaeI 155 | @site = ( 156 | '[AGCT]{10}AC[AGCT]{4}GTA[CT]C[AGCT]{7}', 157 | '[AGCT]{7}G[AG]TAC[AGCT]{4}GT[AGCT]{10}', 158 | ); 159 | $enzyme="BaeI"; 160 | @start = (0,38,79,120,161); 161 | @end = (40,81,122,163,205); 162 | $minpear ||=198; 163 | $maxpear ||=206; 164 | }elsif( 5 == $site ){#BcgI 165 | @site = ( 166 | '[AGCT]{10}CGA[AGCT]{6}TGC[AGCT]{10}', 167 | '[AGCT]{10}GCA[AGCT]{6}TCG[AGCT]{10}', 168 | ); 169 | 
$enzyme="BcgI"; 170 | @start = (0,36,75,114,153); 171 | @end = (38,77,116,155,195); 172 | $minpear ||=188; 173 | $maxpear ||=196; 174 | }elsif( 6 == $site ){#CjeI 175 | @site = ( 176 | '[AGCT]{8}CCA[AGCT]{6}GT[AGCT]{9}', 177 | '[AGCT]{9}AC[AGCT]{6}TGG[AGCT]{8}', 178 | ); 179 | $enzyme="CjeI"; 180 | @start = (0,40,83,126,169); 181 | @end = (42,85,128,171,214); 182 | }elsif( 7 == $site ){#PpiI 183 | @site = ( 184 | '[AGCT]{7}GAAC[AGCT]{5}CTC[AGCT]{8}', 185 | '[AGCT]{8}GAG[AGCT]{5}GTTC[AGCT]{7}', 186 | ); 187 | $enzyme="PpiI"; 188 | @start = (0,37,77,117,157); 189 | @end = (39,79,119,159,199); 190 | }elsif( 8 == $site ){#PsrI 191 | @site = ( 192 | '[AGCT]{7}GAAC[AGCT]{6}TAC[AGCT]{7}', 193 | '[AGCT]{7}GTA[AGCT]{6}GTTC[AGCT]{7}', 194 | ); 195 | $enzyme="PsrI"; 196 | @start = (0,37,77,117,157); 197 | @end = (39,79,119,159,199); 198 | }elsif( 9 == $site ){#BplI 199 | @site = ( 200 | '[AGCT]{8}GAG[AGCT]{5}CTC[AGCT]{8}', #palindromes 201 | ); 202 | $enzyme="BplI"; 203 | @start = (0,37,77,117,157); 204 | @end = (39,79,119,159,199); 205 | }elsif( 10 == $site ){#FalI 206 | @site = ( 207 | '[AGCT]{8}AAG[AGCT]{5}CTT[AGCT]{8}', #palindromes 208 | ); 209 | $enzyme="FalI"; 210 | @start = (0,37,77,117,157); 211 | @end = (39,79,119,159,200); 212 | $minpear ||=193; 213 | $maxpear ||=201; 214 | }elsif( 11 == $site ){#Bsp24I 215 | @site = ( 216 | '[AGCT]{8}GAC[AGCT]{6}TGG[AGCT]{7}', 217 | '[AGCT]{7}CCA[AGCT]{6}GTC[AGCT]{8}', 218 | ); 219 | $enzyme="Bsp24I"; 220 | @start = (0,37,77,117,157); 221 | @end = (39,79,119,159,200); 222 | }elsif( 12 == $site ){#HaeIV 223 | @site = ( 224 | '[AGCT]{7}GA[CT][AGCT]{5}[AG]TC[AGCT]{9}', 225 | '[AGCT]{9}GA[CT][AGCT]{5}[AG]TC[AGCT]{7}', 226 | ); 227 | $enzyme="HaeIV"; 228 | @start = (0,38,79,120,161); 229 | @end = (40,81,122,163,204); 230 | }elsif( 13 == $site ){#CjePI 231 | @site = ( 232 | '[AGCT]{7}CCA[AGCT]{7}TC[AGCT]{8}', 233 | '[AGCT]{8}GA[AGCT]{7}TGG[AGCT]{7}', 234 | ); 235 | $enzyme="CjePI"; 236 | @start = (0,39,81,123,165); 237 | @end = (41,83,125,167,209); 238 | }elsif( 14 == $site ){#Hin4I 239 | @site = ( 240 | '[AGCT]{8}GA[CT][AGCT]{5}[GAC]TC[AGCT]{8}', 241 | '[AGCT]{8}GA[CTG][AGCT]{5}[AG]TC[AGCT]{8}', 242 | ); 243 | $enzyme="Hin4I"; 244 | @start = (0,37,77,117,157); 245 | @end = (39,79,119,159,199); 246 | }elsif( 15 == $site ){#AlfI 247 | @site = ( 248 | '[AGCT]{10}GCA[AGCT]{6}TGC[AGCT]{10}', #palindromes 249 | ); 250 | $enzyme="AlfI"; 251 | @start = (0,36,75,114,153); 252 | @end = (38,77,116,155,194); 253 | }elsif( 16 == $site ){#BslFI ??some question?? 
single enzyme 254 | @site = ( 255 | '[AGCT]{6}GGGAC[AGCT]{14}', 256 | '[AGCT]{14}GTCCC[AGCT]{6}', 257 | ); 258 | $enzyme="BslFI"; 259 | @start = (0,34,72,110,148); 260 | @end = (38,76,114,152,190); 261 | }else{ 262 | &usage; 263 | print STDERR "Parameter -s must be an integer between 1 and 16\n"; 264 | exit 1; 265 | } 266 | 267 | &CheckDir($outdir); 268 | my $raw_reads_num=0;# number of raw reads; for paired-end shotgun data that will be merged, the pre-merge read count must be recorded separately 269 | 270 | if($#input==0 && $type==1 && $#outprefix==0){# reference genome(s) 271 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 1 -s $site -od $outdir -op $outprefix[0] -gz $gz\n"; 272 | print STDOUT "Electronic enzyme digestion of input genome(s) -- Start, ",`date`; 273 | &Electronic_enzyme; 274 | print STDOUT "Electronic enzyme digestion of input genome(s) -- End, ",`date`; 275 | }elsif($#input==0 && $type==2 && $#outprefix==0){# single-end reads from shotgun metagenomics 276 | if($q_control eq "yes"){# need QC 277 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 2 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -n $ncount -q $quality -p $percent -b $qbase -fm $format\n"; 278 | }else{# no QC 279 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 2 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -fm $format\n"; 280 | } 281 | print STDOUT "Tags extraction from shotgun metagenomics data -- Start, ",`date`; 282 | &fastq; 283 | print STDOUT "Tags extraction from shotgun metagenomics data -- End, ",`date`; 284 | }elsif($#input==1 && $type==2 && $#outprefix==0){# paired-end reads from shotgun metagenomics 285 | # unless(-e "$pear"){ 286 | # &usage; 287 | # print STDERR "Can not find software $pear\n"; 288 | # exit 1; 289 | # } 290 | if($q_control eq "yes"){# need QC 291 | print STDOUT "COMMAND: perl $0 -i $input[0] $input[1] -t 2 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -n $ncount -q $quality -p $percent -b $qbase -fm $format -pe $pear -pc $pear_cpu\n"; 292 | }else{# no QC 293 | print STDOUT "COMMAND: perl $0 -i $input[0] $input[1] -t 2 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -fm $format -pe $pear -pc $pear_cpu\n"; 294 | } 295 | print STDOUT "Tags extraction from shotgun metagenomics data -- Start, ",`date`; 296 | &pear; # Merge the paired-end reads using PEAR, a fast and accurate tool for merging paired-end reads from next-generation sequencing experiments.
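# For orientation, a rough shell equivalent of the &pear step below (illustrative only: the sample
# and output names are placeholders, and `pear` is assumed to be on the PATH):
#   pear -f sample.R1.fq.gz -r sample.R2.fq.gz -e -o outdir/sample.BcgI -j 1
#   cat outdir/sample.BcgI.assembled.fastq outdir/sample.BcgI.unassembled.forward.fastq outdir/sample.BcgI.unassembled.reverse.fastq | gzip > outdir/sample.BcgI.pear.fastq.gz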
297 | $input[0]="$outdir/$outprefix[0].$enzyme.pear.fastq.gz"; 298 | &fastq; 299 | &execute("rm -f $outdir/$outprefix[0].$enzyme.pear.fastq.gz"); 300 | print STDOUT "Tags extraction from shotgun metagenomics data -- End, ",`date`; 301 | }elsif($#input==0 && $type==3 && $#outprefix==0){# single 2b-RAD tags 302 | if($q_control eq "yes"){# need QC 303 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 3 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -n $ncount -q $quality -p $percent -b $qbase -fm $format\n"; 304 | }else{# no QC 305 | print STDOUT "COMMAND: perl $0 -i $input[0] -t 3 -s $site -od $outdir -op $outprefix[0] -gz $gz -qc $q_control -fm $format\n"; 306 | } 307 | print STDOUT "Data split for single 2b-RAD tags from SE platform -- Start, ",`date`; 308 | &Single_Lable; 309 | print STDOUT "Data split for single 2b-RAD tags from SE platform -- End, ",`date`; 310 | }elsif($#input==1 && $type==4 && $#outprefix==4){# five concatenated 2b-RAD tags 311 | # unless(-e "$pear"){ 312 | # &usage; 313 | # print STDERR "Can not find $pear\n"; 314 | # exit 1; 315 | # } 316 | if($q_control eq "yes"){# need QC 317 | print STDOUT "COMMAND: perl $0 -i $input[0] $input[1] -t 4 -s $site -od $outdir -op ",join(" ",@outprefix[0..4])," -gz $gz -qc $q_control -n $ncount -q $quality -p $percent -b $qbase -fm $format -pe $pear -pc $pear_cpu\n"; 318 | }else{# no QC 319 | print STDOUT "COMMAND: perl $0 -i $input[0] $input[1] -t 4 -s $site -od $outdir -op ",join(" ",@outprefix[0..4])," -gz $gz -qc $q_control -fm $format -pe $pear -pc $pear_cpu\n"; 320 | } 321 | print STDOUT "Data split for five concatenated 2b-RAD tags from PE platform -- Start, ",`date`; 322 | &Five_Lable; 323 | print STDOUT "Data split for five concatenated 2b-RAD tags from PE platform -- End, ",`date`; 324 | }else{ 325 | &usage; 326 | print STDERR "Please check parameters -i -t -op: the number of input files and output prefixes must match the chosen -t mode\n"; 327 | exit 1; 328 | } 329 | 330 | 331 | sub pear{ 332 | my $r1=$input[0]; 333 | my $r2=$input[1]; 334 | if($r1=~/\.gz$/){# count the # of raw reads 335 | open R,"gzip -dc $r1|" or die "cannot open $r1\n"; 336 | }else{ 337 | open R,"$r1" or die "cannot open $r1\n"; 338 | } 339 | while(<R>){# count the reads of the paired-end shotgun data 340 | $raw_reads_num++; 341 | <R>; 342 | <R>; 343 | <R>;# skip the remaining three lines of each FASTQ record 344 | } 345 | close R; 346 | my $outprefix="$outprefix[0].$enzyme"; 347 | &execute("$pear -f $r1 -r $r2 -e -o $outdir/$outprefix -j $pear_cpu");# merge read pairs with PEAR; pairs whose insert is too short cannot be merged 348 | &execute("cat $outdir/$outprefix.assembled.fastq $outdir/$outprefix.unassembled.forward.fastq $outdir/$outprefix.unassembled.reverse.fastq | gzip > $outdir/$outprefix.pear.fastq.gz");# combine the merged reads with the unmergeable R1/R2 reads 349 | &execute("rm -f $outdir/$outprefix.assembled.fastq $outdir/$outprefix.unassembled.forward.fastq $outdir/$outprefix.unassembled.reverse.fastq"); 350 | &execute("rm -f $outdir/$outprefix.discarded.fastq"); 351 | } 352 | sub fastq{ 353 | my $fastq=$input[0]; 354 | my $outprefix=$outprefix[0]; 355 | if($fastq=~/\.gz$/){ 356 | open IN,"gzip -dc $fastq|" or die "cannot open $fastq\n"; 357 | }else{ 358 | open IN,"$fastq" or die "cannot open $fastq\n"; 359 | } 360 | if($gz eq "yes"){ 361 | open OU,"|gzip >$outdir/$outprefix.$enzyme.$format.gz" or die "cannot open $outdir/$outprefix.$enzyme.$format.gz\n"; 362 | }else{ 363 | open OU,">$outdir/$outprefix.$enzyme.$format" or die "cannot open $outdir/$outprefix.$enzyme.$format\n"; 364 | } 365 | open STAT,">$outdir/$outprefix.$enzyme.stat.xls" or die "cannot open $outdir/$outprefix.$enzyme.stat.xls\n"; 366 | if($raw_reads_num==0){# single-end reads from shotgun metagenomics 367 | print STAT "sample\tenzyme\tinput_reads_num\tenzyme_reads_num\tpercent\n"; 368 | }else{# paired-end shotgun data 369 | print STAT "sample\tenzyme\tinput_reads_num\tcombine_uncombineR1R2_reads_num\tenzyme_reads_num\tpercent\n"; 370 | }
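# Illustrative example of the tab-delimited *.stat.xls summary written by this sub (the header
# above is the real one for single-end input; the numbers below are made up):
#   sample   enzyme  input_reads_num  enzyme_reads_num  percent
#   sampleA  BcgI    1000000          52340             5.23%
# For merged paired-end input, the extra combine_uncombineR1R2_reads_num column holds the number
# of PEAR-merged reads plus unmergeable R1/R2 reads that were actually screened for the site.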
STAT "sample\tenzyme\tinput_reads_num\tenzyme_reads_num\tpercent\n"; 368 | }else{#双端shotgun数据 369 | print STAT "sample\tenzyme\tinput_reads_num\tcombine_uncombineR1R2_reads_num\tenzyme_reads_num\tpercent\n"; 370 | } 371 | my($input_reads_num,$enzyme_reads_num,$percent_sub); 372 | $enzyme_reads_num=0; 373 | while(){ 374 | $input_reads_num++;# When inputting paired-end shotgun metagenomics reads,we only record the # of merged reads 当输入数据是双端shotgun数据时,此时记录的是拼接后的reads数以及未能拼接的R1R2之和 375 | my $line=$_ . . . ; 376 | if($q_control eq "yes"){# QC 377 | next unless(&CheckN($line)); 378 | next unless(&CheckQ($line)); 379 | } 380 | my @tmp=split /\n/,$line; 381 | my %uniq;# deredundancy as some restriction sites will have palindromes 382 | for my $site(@site){ 383 | while($tmp[1]=~/($site)/g){ 384 | my $pos=pos($tmp[1]); 385 | my $seq=$1; 386 | my $len=length($seq); 387 | $pos=$pos-$len+1; 388 | my $qual=substr($tmp[3],$pos-1,$len); 389 | pos($tmp[1])=$pos; 390 | $uniq{$pos}{$len}="$tmp[0]-$pos\n$seq\n+\n$qual\n"; 391 | } 392 | } 393 | for my $pos(sort {$a<=>$b} keys %uniq){ 394 | for my $len(sort {$a<=>$b} keys %{$uniq{$pos}}){ 395 | $enzyme_reads_num++; 396 | my @a=split /\n/,$uniq{$pos}{$len}; 397 | if($format eq "fq"){ 398 | print OU "$a[0]\n$a[1]\n$a[2]\n$a[3]\n"; 399 | }elsif($format eq "fa"){ 400 | $a[0]=~s/^@/>/; 401 | print OU "$a[0]\n$a[1]\n"; 402 | } 403 | } 404 | } 405 | undef %uniq; 406 | } 407 | close IN; 408 | close OU; 409 | if($raw_reads_num==0){#single-end reads from shotgun metagenomics 410 | $percent_sub=sprintf "%.2f",$enzyme_reads_num/$input_reads_num*100; 411 | print STAT "$outprefix\t$enzyme\t$input_reads_num\t$enzyme_reads_num\t$percent_sub%\n"; 412 | }else{#双端shotgun数据 413 | $percent_sub=sprintf "%.2f",$enzyme_reads_num/$raw_reads_num*100; 414 | print STAT "$outprefix\t$enzyme\t$raw_reads_num\t$input_reads_num\t$enzyme_reads_num\t$percent_sub%\n"; 415 | } 416 | close STAT; 417 | } 418 | 419 | 420 | 421 | sub Five_Lable{# Data split for five concanated 2b-RAD tags 422 | my $r1=$input[0]; 423 | my $r2=$input[1]; 424 | my ($output,@fhandle); 425 | my $outprefix=$outprefix[0];# rename intermediate files using the first sample name 426 | my $input_reads_num; 427 | if($r1=~/\.gz$/){# record the # of raw reads 428 | open R,"gzip -dc $r1|" or die "cannot open $r1\n"; 429 | }else{ 430 | open R,"$r1" or die "cannot open $r1\n"; 431 | } 432 | while(){ 433 | $input_reads_num++; 434 | ; 435 | ; 436 | ; 437 | } 438 | close R; 439 | &execute("$pear -f $r1 -r $r2 -e -n $minpear -m $maxpear -o $outdir/$outprefix -j $pear_cpu");# merge data 440 | open IN,"$outdir/$outprefix.assembled.fastq" or die "cannot open $outdir/$outprefix.assembled.fastq\n"; 441 | for my $i(1..$#start+1){# open file handle 442 | my $fh="OU" . 
441 | for my $i(1..$#start+1){# open one output file handle per tag position 442 | my $fh="OU" . $i; 443 | my $j=$i-1; 444 | $output="$outdir/$outprefix[$j].$enzyme.$format"; 445 | if($gz eq "yes"){ 446 | open $fh,"|gzip > $output.gz" or die "cannot open $output.gz\n"; 447 | }elsif($gz eq "no"){ 448 | open $fh,"> $output" or die "cannot open $output\n"; 449 | } 450 | push @fhandle,$fh; 451 | } 452 | my $stat_name=join("-",@outprefix[0..4]); 453 | open STAT,">$outdir/$stat_name.$enzyme.stat.xls" or die "cannot open $outdir/$stat_name.$enzyme.stat.xls\n"; 454 | print STAT "sample\tenzyme\tinput_reads_num\tcombine_reads_num\tenzyme_reads_num\tqc_reads_num\tpercent\n"; 455 | my($combine_reads_num,%enzyme_reads_num,%qc_reads_num); 456 | $combine_reads_num=0; 457 | for my $i(0..$#start){ 458 | $qc_reads_num{$i}=0; 459 | $enzyme_reads_num{$i}=0; 460 | } 461 | while(<IN>){ 462 | $combine_reads_num++;# count PEAR-merged reads 463 | my $line=$_ . <IN> . <IN> . <IN>;# read the remaining three lines of the FASTQ record 464 | my @tmp=split /\n/,$line; 465 | for my $i(0..$#start){ 466 | my $fh=$fhandle[$i]; 467 | my $num=$i+1; 468 | my $id="$tmp[0]:$num";# FASTQ line 1: read ID with the tag index appended 469 | my $seq=substr($tmp[1],$start[$i],$end[$i]-$start[$i]+1);# FASTQ line 2: sequence window for this tag 470 | my $qual=substr($tmp[3],$start[$i],$end[$i]-$start[$i]+1);# FASTQ line 4: matching quality window 471 | for my $j(0..$#site){ 472 | if($seq=~s/^(\S*?)($site[$j])\S*/$2/){ 473 | $enzyme_reads_num{$i}++;# reads containing the recognition site 474 | my $begin=length($1); 475 | my $len=length($2); 476 | $qual=substr($qual,$begin,$len); 477 | my $sub_line="$id\n$seq\n$tmp[2]\n$qual"; 478 | if($q_control eq "yes"){# check if QC needed 479 | next unless(&CheckN($sub_line)); 480 | next unless(&CheckQ($sub_line)); 481 | } 482 | $qc_reads_num{$i}++;# reads passing QC 483 | if($format eq "fa"){ 484 | $id=~s/^@/>/; 485 | print $fh "$id\n$seq\n"; 486 | }elsif($format eq "fq"){ 487 | print $fh "$sub_line\n"; 488 | } 489 | last; 490 | } 491 | } 492 | } 493 | } 494 | close IN; 495 | for(@fhandle){ 496 | close $_; 497 | } 498 | for my $i(0..$#start){ 499 | my $percent_sub=sprintf "%.2f",$qc_reads_num{$i}/$input_reads_num*100; 500 | print STAT "$outprefix[$i]\t$enzyme\t$input_reads_num\t$combine_reads_num\t$enzyme_reads_num{$i}\t$qc_reads_num{$i}\t$percent_sub%\n"; 501 | } 502 | close STAT; 503 | &execute("rm -f $outdir/$outprefix.assembled.fastq"); 504 | &execute("rm -f $outdir/$outprefix.unassembled.forward.fastq $outdir/$outprefix.unassembled.reverse.fastq"); 505 | &execute("rm -f $outdir/$outprefix.discarded.fastq"); 506 | } 507 | 508 | 509 | sub Single_Lable{ 510 | my $r=$input[0]; 511 | my $outprefix=$outprefix[0]; 512 | my $output; 513 | if($r=~/\.gz$/){ 514 | open IN,"gzip -dc $r|" or die "cannot open $r\n"; 515 | }else{ 516 | open IN,"$r" or die "cannot open $r\n"; 517 | } 518 | $output="$outdir/$outprefix.$enzyme.$format"; 519 | if($gz eq "yes"){ 520 | open OU,"|gzip > $output.gz" or die "cannot open $output.gz\n"; 521 | }elsif($gz eq "no"){ 522 | open OU,"> $output" or die "cannot open $output\n"; 523 | } 524 | open STAT,">$outdir/$outprefix.$enzyme.stat.xls" or die "cannot open $outdir/$outprefix.$enzyme.stat.xls\n"; 525 | print STAT "sample\tenzyme\tinput_reads_num\tenzyme_reads_num\tqc_reads_num\tpercent\n"; 526 | my ($input_reads_num,$enzyme_reads_num,$qc_reads_num,$percent_sub); 527 | $qc_reads_num=0;
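# Worked example of the QC thresholds applied in the loop below, using the default settings
# (-n 0.08 -q 30 -p 80 -b 33) and a 32 bp BcgI tag: CheckN passes if the tag contains at most
# int(0.08*32)=2 ambiguous "N" bases, and CheckQ passes if at least 80% of its bases (26 of 32)
# have Phred quality >= 30, i.e. an ASCII value >= 63 (the character '?') under the Phred+33 offset.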
528 | while(<IN>){ 529 | $input_reads_num++;# count raw input reads 530 | my $line=$_ . <IN> . <IN> . <IN>;# read the remaining three lines of the FASTQ record 531 | my @tmp=split /\n/,$line; 532 | if(length($tmp[1])>50){# if the input read length exceeds 50 bp (e.g. reads from a PE150 platform), truncate the sequence to the first 50 bp for the following analysis 533 | $tmp[1]=substr($tmp[1],0,50); 534 | $tmp[3]=substr($tmp[3],0,50); 535 | } 536 | for my $i(0..$#site){ 537 | if($tmp[1]=~s/^(\S*?)($site[$i])\S*/$2/){# keep only the first recognition-site match (non-greedy) 538 | $enzyme_reads_num++;# reads containing the recognition site 539 | my $begin=length($1); 540 | my $len=length($2); 541 | $tmp[3]=substr($tmp[3],$begin,$len);# quality score 542 | my $sub_line=join("\n",@tmp[0..3]); 543 | if($q_control eq "yes"){# check if QC needed 544 | next unless(&CheckN($sub_line)); 545 | next unless(&CheckQ($sub_line)); 546 | } 547 | $qc_reads_num++;# reads passing QC 548 | if($format eq "fa"){ 549 | $tmp[0]=~s/^@/>/; 550 | print OU "$tmp[0]\n$tmp[1]\n"; 551 | }elsif($format eq "fq"){ 552 | print OU "$sub_line\n"; 553 | } 554 | last; 555 | } 556 | } 557 | } 558 | close IN; 559 | close OU; 560 | $percent_sub=sprintf "%.2f",$qc_reads_num/$input_reads_num*100; 561 | print STAT "$outprefix\t$enzyme\t$input_reads_num\t$enzyme_reads_num\t$qc_reads_num\t$percent_sub%\n"; 562 | close STAT; 563 | } 564 | 565 | sub CheckN{ 566 | my $line=shift; 567 | my @tmp=split /\n/,$line; 568 | my $length=length($tmp[1]); 569 | @tmp=split //,$tmp[1]; 570 | my $count=0; 571 | for my $base(@tmp){ 572 | if($base eq "N"){$count++;}; 573 | } 574 | if($ncount>0 && $ncount<1){# -n given as a fraction of the tag length 575 | if($ncount>= $count/$length){ 576 | return 1; 577 | }else{ 578 | return 0; 579 | } 580 | }elsif($ncount==0 || $ncount>=1){# -n given as an absolute count 581 | if($ncount>=$count){ 582 | return 1; 583 | }else{ 584 | return 0; 585 | } 586 | } 587 | } 588 | 589 | sub CheckQ{ 590 | my $line = shift; 591 | my @array = split /\n/,$line; 592 | @array = split //,$array[3]; 593 | my $count = 0; 594 | foreach my $i( @array ){ 595 | next unless( ord($i) >= $quality + $qbase ); 596 | $count ++; 597 | } 598 | if( $count >= scalar(@array) * $percent / 100 ){ 599 | return 1; 600 | }else{ 601 | return 0; 602 | } 603 | } 604 | 605 | sub Electronic_enzyme{ 606 | my $genome=$input[0]; 607 | my $outprefix=$outprefix[0]; 608 | my $cnt=0; 609 | $/=">";# read the Fasta input record by record 610 | if($genome=~/\.gz$/){ 611 | open IN,"gzip -dc $genome|" or die "cannot open $genome\n"; 612 | }else{ 613 | open IN,"$genome" or die "cannot open $genome\n"; 614 | } 615 | if($gz eq "yes"){ 616 | open OU,"|gzip > $outdir/$outprefix.$enzyme.fa.gz" or die "cannot open $outdir/$outprefix.$enzyme.fa.gz\n"; 617 | }else{ 618 | open OU,">$outdir/$outprefix.$enzyme.fa" or die "cannot open $outdir/$outprefix.$enzyme.fa\n"; 619 | } 620 | open STAT,">$outdir/$outprefix.$enzyme.stat.xls" or die "cannot open $outdir/$outprefix.$enzyme.stat.xls\n"; 621 | print STAT "sample\tenzyme\tinput_reads_num\tenzyme_reads_num\tpercent\n"; 622 | my($input_reads_num,$enzyme_reads_num,$percent_sub); 623 | $enzyme_reads_num=0; 624 | <IN>;# discard the leading empty record before the first ">" 625 | while(<IN>){ 626 | chomp; 627 | $input_reads_num++; 628 | my @tmp=split /\n/; 629 | my $id=(split /\s+/,$tmp[0])[0]; 630 | my $seq=join("",@tmp[1..$#tmp]); 631 | $seq=uc($seq); # convert lowercase bases to uppercase 632 | my %hash;my %hash_tag; 633 | for my $i(0..$#site){ # iterate over all recognition-site patterns 634 | while($seq=~/($site[$i])/g){ # digital digestion 635 | my $tag=$1; 636 | my $len=length($tag); 637 | my $pos=pos($seq); 638 | $pos=$pos-$len+1; 639 | my $pos_end=$pos+$len-1; 640 | pos($seq)=$pos; # step the match position back so overlapping sites are not missed 641 | $hash{$pos}{$pos_end}="$id-$pos-$pos_end"; 642 | $hash_tag{$pos}{$pos_end}=$tag; 643 | } 644 | }
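# The loop below writes one FASTA record per tag; headers follow the pattern
# >sequenceID-start-end-counter, e.g. ">NC_000913.3-1524-1555-1" (a hypothetical contig ID and
# coordinates, shown only to illustrate the format).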
645 | # sort the 2b-RAD tags by genome position before output 646 | for my $pos(sort {$a<=>$b} keys %hash){ 647 | for my $pos_end(sort {$a<=>$b} keys %{$hash{$pos}}){ 648 | $cnt++; 649 | $enzyme_reads_num++; 650 | print OU ">$hash{$pos}{$pos_end}-$cnt\n"; # header: sequence ID-start position-end position-nth tag 651 | print OU "$hash_tag{$pos}{$pos_end}\n"; 652 | } 653 | } 654 | undef %hash; 655 | undef %hash_tag; 656 | } 657 | close IN; 658 | close OU; 659 | $percent_sub=sprintf "%.2f",$enzyme_reads_num/$input_reads_num*100; 660 | print STAT "$outprefix\t$enzyme\t$input_reads_num\t$enzyme_reads_num\t$percent_sub%\n"; 661 | close STAT; 662 | $/="\n";# restore the default input record separator 663 | } 664 | 665 | 666 | sub execute{# print the command and then execute it 667 | my $cmd = shift; 668 | print "$cmd\n"; 669 | my $exit_code=system($cmd); 670 | if($exit_code!=0){ 671 | print STDERR "Command $cmd failed with an exit code of $exit_code.\n"; 672 | exit($exit_code >> 8); 673 | } 674 | } 675 | sub CheckDir{ 676 | my $file = shift; 677 | unless( -d $file ){ 678 | if( -d dirname($file) && -w dirname($file) ){system("mkdir $file");} 679 | else{print STDERR "$file does not exist and cannot be created\n";exit 1;} 680 | } 681 | return 1; 682 | } 683 | --------------------------------------------------------------------------------