├── KAKS_SHAIXUAN.pl ├── README.md ├── cal_N50.pl ├── chr.info ├── colline_v2.pl ├── config1.txt ├── config2.txt ├── domain_xulie.pl ├── fq2fa.pl ├── gene_family_analysis_pipeline.sh ├── geneid_to_mRNAid.pl ├── genome.txt ├── get_data_by_id.pl ├── get_fa_by_id.pl ├── get_fa_by_id_from_bed.pl ├── get_fa_by_id_from_gff.pl ├── get_gene_bed.pl ├── get_gene_exon_from_gff.pl ├── get_gene_position.pl ├── get_gene_weizhi.pl ├── get_gtf.pl ├── get_promoter.pl ├── get_tandem_gene.pl ├── link.txt ├── mRNAid_to_geneid.pl ├── mcscan_seqid_染色体编号配置文件 ├── mcscan_图层配置文件 ├── mcscanx物种内共线性分析.sh ├── select_redundant_mRNA.pl ├── stat_protein_fa.pl ├── text.txt ├── upload_code ├── vimrc ├── 下载数据 ├── 共线性图.pdf ├── 添加基因家族物种共线性关系配置 ├── 物种间基因家族查找.pl ├── 物种间基因家族比较分析流程.txt └── 生成基因家族配置simple文件.pl /KAKS_SHAIXUAN.pl: -------------------------------------------------------------------------------- 1 | use Getopt::Long; 2 | my %opts; 3 | use Data::Dumper; 4 | GetOptions (\%opts,"in1=s","in2=s","out=s","h"); 5 | if (! defined($opts{in1}) ||! defined($opts{in2})||! defined($opts{out}) || defined($opts{h})){ 6 | &USAGE; 7 | } 8 | open (IN1,"$opts{in1}") || die "open $opts{in} failed\n"; 9 | open (IN2,"$opts{in2}") || die "open $opts{ina} failed\n"; 10 | open (OUT,">$opts{out}") || die "open $opts{out} failed\n"; 11 | my %cds_length; 12 | while(){ 13 | chomp; 14 | my @line = split("\t",$_); 15 | $cds_length{$line[0]}= $line[1]; 16 | #print "$cds_length{$line[0]}\n"; 17 | } 18 | 19 | while( ){ 20 | 21 | chomp($_); 22 | my @line1 = split ("\t",$_); 23 | #print @line1; 24 | #print "\n"; 25 | my $max_length = $cds_length{$line1[0]} > $cds_length{$line1[1]} ? $cds_length{$line1[0]}:$cds_length{$line1[1]}; 26 | if(($line1[0] ne $line1[1]) && ($line1[2] > 70 )&& ($line1[3] > 0.70*$max_length)){ 27 | print OUT $_."\t$max_length\n"; 28 | 29 | } 30 | #print $cds_length1{$line1[0]}; 31 | 32 | } 33 | 34 | 35 | close(IN1); 36 | close(IN2); 37 | close(OUT); 38 | sub USAGE { 39 | print "usage: perl $0 -in1 cds_length -in2 result.txt -out shaixuan_result.txt"; 40 | exit; 41 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gene family analysis 2 | -------------------------------------------------------------------------------- /cal_N50.pl: -------------------------------------------------------------------------------- 1 | #/usr/bin/perl -w 2 | use strict; 3 | use List::Util qw(sum min max); 4 | use Getopt::Long; 5 | use File::Basename; 6 | 7 | # Parameter variables 8 | my $file; 9 | my $helpAsked; 10 | my $outfile = ""; 11 | 12 | GetOptions( 13 | "i=s" => \$file, 14 | "h|help" => \$helpAsked, 15 | "o|outputFile=s" => \$outfile, 16 | ); 17 | if(defined($helpAsked)) { 18 | prtUsage(); 19 | exit; 20 | } 21 | if(!defined($file)) { 22 | prtError("No input files are provided"); 23 | } 24 | 25 | my ($fileName, $filePath) = fileparse($file); 26 | $outfile = $file . "_n50_stat" if($outfile eq ""); 27 | 28 | 29 | 30 | 31 | open IN, "$file" or die "$!"; 32 | open O,">$outfile" or die "$!"; 33 | my @len; 34 | my $totseq; 35 | 36 | sub median (\@){ 37 | 38 | my @x = sort {$a <=> $b} @{$_[0]}; 39 | my $len = scalar @x; 40 | my $median; 41 | if ($len % 2 == 0) { 42 | $median = ($x[$len/2 - 1] + $x[$len/2]) / 2; 43 | 44 | }else { 45 | 46 | $median = $x[($len + 1) / 2]; 47 | } 48 | return $median; 49 | } 50 | 51 | 52 | my $As = 0; 53 | my $Ts = 0; 54 | my $Gs = 0; 55 | my $Cs = 0; 56 | my $Ns = 0; 57 | 58 | #-------------------------------------- 子程序 ----------------------------------------------- 59 | 60 | sub basecount (\$){ 61 | my $seq = ${$_[0]}; 62 | 63 | my $tAs += $seq =~ s/A/A/gi; 64 | my $tGs += $seq =~ s/G/G/gi; 65 | my $tCs += $seq =~ s/C/C/gi; 66 | my $tTs += $seq =~ s/T/T/gi; 67 | my $Ns = (length $seq) - $tAs - $tTs - $tCs - $tGs; 68 | $As += $tAs; 69 | $Ts += $tTs; 70 | $Gs += $tGs; 71 | $Cs += $tCs; 72 | 73 | } 74 | 75 | ##计算N50 76 | 77 | #### 从大到小排序--> for循环相加,直到>= $ /2 78 | 79 | sub calN50 (\@$){ 80 | my @x = sort {$b <=> $a} @{$_[0]}; 81 | my $n = $_[1]; 82 | my $totlen = sum(@x); 83 | my ($tot,$n50) = (0,0); 84 | for (my $i = 0;$i <@x;$i++) { 85 | $tot += $x[$i]; 86 | if ($tot >= $totlen*$n/100) { 87 | $n50 = $x[$i]; 88 | last; 89 | } 90 | 91 | 92 | } 93 | return $n50; 94 | } 95 | 96 | sub prtHelp { 97 | print "\n$0 options:\n\n"; 98 | print "### Input reads/sequences (FASTA) (Required)\n"; 99 | print " -i \n"; 100 | print " Read/Sequence in fasta format\n"; 101 | print "\n"; 102 | print "### Other options [Optional]\n"; 103 | print " -h | -help\n"; 104 | print " Prints this help\n"; 105 | print " -o | -outputFile \n"; 106 | print " Output will be stored in the given file\n"; 107 | print " default: By default, N50 statistics file will be stored where the input file is\n"; 108 | print "\n"; 109 | } 110 | 111 | sub prtError { 112 | my $msg = $_[0]; 113 | print STDERR "+======================================================================+\n"; 114 | printf STDERR "|%-70s|\n", " Error:"; 115 | printf STDERR "|%-70s|\n", " $msg"; 116 | print STDERR "+======================================================================+\n"; 117 | prtUsage(); 118 | exit; 119 | } 120 | 121 | sub prtUsage { 122 | print "\nUsage: perl $0 \n"; 123 | prtHelp(); 124 | } 125 | 126 | 127 | #----------------------------- 主程序 ----------------------------------------------- 128 | 129 | while (defined(my $line = )) { 130 | chomp $line; 131 | if ($line =~ /^>/) { 132 | my $genenID = $line; 133 | my $seq = ; 134 | chomp $seq; 135 | push @len,length $seq; 136 | $totseq .= $seq; 137 | } 138 | 139 | } 140 | #print "@len\n"; 141 | my $totlen = sum(@len); 142 | my $totreads = scalar @len; 143 | my $min = min(@len); 144 | my $max = max(@len); 145 | my $avg = sprintf "%0.2f", $totlen/$totreads; 146 | my $median = median(@len); 147 | #print "$totseq\n"; 148 | #print "$median\n"; 149 | 150 | basecount($totseq); 151 | my $n25 = calN50(@len,25); 152 | my $n50 = calN50(@len,50); 153 | my $n75 = calN50(@len,75); 154 | my $n90 = calN50(@len,90); 155 | my $n95 = calN50(@len,95); 156 | 157 | 158 | printf O "%-25s %d\n", "Total sequences",$totreads; 159 | printf O "%-25s %d\n", "total base",$totlen; 160 | printf O "%-25s %d\n" , "Min sequence length", $min; 161 | printf O "%-25s %d\n" , "Max sequence length", $max; 162 | printf O "%-25s %0.2f\n", "Average sequence length", $avg; 163 | printf O "%-25s %0.2f\n", "Median sequence length", $median; 164 | printf O "%-25s %d\n", "N25 length", $n25; 165 | printf O "%-25s %d\n", "N50 length", $n50; 166 | printf O "%-25s %d\n", "N75 length", $n75; 167 | printf O "%-25s %d\n", "N90 length", $n90; 168 | printf O "%-25s %d\n", "N95 length", $n95; 169 | printf O "%-25s %0.2f %s\n", "As", $As/$totlen*100, "%"; 170 | printf O "%-25s %0.2f %s\n", "Ts", $Ts/$totlen*100, "%"; 171 | printf O "%-25s %0.2f %s\n", "Gs", $Gs/$totlen*100, "%"; 172 | printf O "%-25s %0.2f %s\n", "Cs", $Cs/$totlen*100, "%"; 173 | printf O "%-25s %0.2f %s\n", "(A + T)s", ($As+$Ts)/$totlen*100, "%"; 174 | printf O "%-25s %0.2f %s\n", "(G + C)s", ($Gs+$Cs)/$totlen*100, "%"; 175 | printf O "%-25s %0.2f %s\n", "Ns", $Ns/$totlen*100, "%"; 176 | 177 | print "N50 Statistics file: $outfile\n"; 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /chr.info: -------------------------------------------------------------------------------- 1 | 2 | # 染色体配置文件 3 | 4 | 5 | chr - chr01 Chr1 0 302956453 chr1 6 | chr - chr02 Chr2 0 241307389 chr2 7 | chr - chr03 Chr3 0 232953155 chr3 8 | chr - chr04 Chr4 0 242006640 chr4 9 | chr - chr05 Chr5 0 215148664 chr5 10 | chr - chr06 Chr6 0 165010417 chr6 11 | 12 | # 设置:前两列规定,表明时染色体配置文件 第三列:gff 文件实际编号 第四列:你希望在图片上展示的染色体名字 第五列+第六列:染色体长度 第六列:官网上规定每一个 chr 编号对应一种颜色 13 | 14 | chr - chr01 Chr1 0 302956453 chr1 15 | chr - chr02 Chr2 0 241307389 chr1 16 | chr - chr03 Chr3 0 232953155 chr1 17 | chr - chr04 Chr4 0 242006640 chr1 18 | chr - chr05 Chr5 0 215148664 chr1 19 | chr - chr06 Chr6 0 165010417 chr1 20 | chr - chr06 Chr6 0 302956453 chr2 21 | chr - chr05 Chr5 0 241307389 chr2 22 | chr - chr04 Chr4 0 232953155 chr2 23 | chr - chr03 Chr3 0 242006640 chr2 24 | chr - chr02 Chr2 0 215148664 chr2 25 | chr - chr01 Chr1 0 165010417 chr2 26 | 27 | # 上面这个展示物种间的共线性图 染色体的配置文件,两个物种染色体编号顺序要相反 28 | -------------------------------------------------------------------------------- /colline_v2.pl: -------------------------------------------------------------------------------- 1 | use Getopt::Long; 2 | use strict; 3 | use Cwd qw(abs_path getcwd); 4 | 5 | 6 | 7 | 8 | my %opts; 9 | 10 | GetOptions (\%opts,"list=s","od=s","colline=s","gff=s","name=s"); 11 | 12 | 13 | 14 | my $od=$opts{od}; 15 | $od||=getcwd; 16 | $od=abs_path($od); 17 | unless(-d $od){ mkdir $od;} 18 | 19 | 20 | 21 | #############gff for cir text###########3 22 | 23 | open (IN,"$opts{gff}") || die "open $opts{gff} failed\n"; 24 | my %gff; 25 | my @info; 26 | my $chr; 27 | my $start; 28 | my $end; 29 | my $gene; 30 | while(){ 31 | chomp; 32 | next if /^#/; 33 | 34 | @info=split(/\t/,$_); 35 | 36 | next unless($info[2]=~/gene/); 37 | ($gene)=($info[8]=~/ID=([^;]+)/); 38 | 39 | $chr=$info[0]; 40 | $start=$info[3]; 41 | $end=$info[4]; 42 | $gff{$gene}=$chr."\t".$start."\t".$end; 43 | } 44 | 45 | close(IN); 46 | 47 | 48 | 49 | ####################### list ############## 50 | 51 | 52 | my %list; 53 | my $pair; 54 | my $Len; 55 | my $Agene; 56 | my $Bgene; 57 | my %text; 58 | 59 | open (IN,"$opts{list}") || die "open $opts{list} failed\n"; 60 | open (OUT,">$od/$opts{name}.txt") || die "open $od/$opts{name}.txt failed\n"; 61 | open (OUTL,">$od/$opts{name}.link.txt") || die "open $od/$opts{name}.link.txt failed\n"; 62 | 63 | while(){ 64 | chomp; 65 | @info=split(/\t/,$_); 66 | $Len = @info; 67 | print $Len; 68 | 69 | my $len=1; 70 | while($len<$Len){ 71 | $pair=$info[$len]; ####�ų�0��Ҳ���ǵ�һ��λ�ã��˴�Ϊ���� 72 | ($Agene,$Bgene)=split(/:/,$pair,2); 73 | print OUT $Agene."\t".$Bgene."\n"; 74 | 75 | if(exists $gff{$Agene} && exists $gff{$Bgene}){ #######ʵ���Ͽ϶����ڣ� 76 | 77 | print OUTL $gff{$Agene}."\t".$gff{$Bgene}."\n"; 78 | 79 | $text{$Agene}=$gff{$Agene}."\t".$Agene; ##��ȥ�ؿ���ֱ�ӿ�ʼ��� 80 | $text{$Bgene}=$gff{$Bgene}."\t".$Bgene; 81 | 82 | } 83 | $len=$len+1; 84 | } 85 | 86 | } 87 | close(IN); 88 | close(OUT); 89 | 90 | 91 | ##ȥ���ظ�ID��text 92 | open (OUT,">$od/$opts{name}.text.txt") || die "open $od/$opts{name}.text.txt failed\n"; 93 | my $loc; 94 | while(($gene,$loc)=each %text){ 95 | print OUT $loc."\n"; 96 | } 97 | close(OUT); 98 | 99 | ######### collinearity for genome block colline ##### 100 | 101 | open (IN,"$opts{colline}") || die "open $opts{colline} failed\n"; 102 | open (OUT,">$od/genome.blocklink.txt") || die "open $od/genome.blocklink.txt failed\n"; 103 | open (OUTA,">$od/genome.align.blocklink.txt") || die "open $od/genome.align.blocklink.txt failed\n"; 104 | my $n; 105 | my $align; 106 | my $colline; 107 | my %block; 108 | my $Agene1S; 109 | my $AgeneNE; 110 | my $Bgene1S; 111 | my $BgeneNE; 112 | my $Achr; 113 | my $Bchr; 114 | while(){ 115 | chomp; 116 | if(/^#/){ 117 | if(/Alignment/){ 118 | $n=1; 119 | $_=~/Alignment ([^:]*)/; 120 | $align="Alignment".$1; 121 | } 122 | next; 123 | } 124 | 125 | $colline=$_; 126 | @info=split("\t",$colline); 127 | $Agene=$info[1]; 128 | $Bgene=$info[2]; 129 | 130 | if(exists $gff{$Agene} && exists $gff{$Bgene} ){ 131 | 132 | if($n ==1 ){ 133 | 134 | ($chr,$start,$end)=split(/\t/,$gff{$Agene}); 135 | $Agene1S=$start; 136 | $Achr=$chr; 137 | 138 | ($chr,$start,$end)=split(/\t/,$gff{$Bgene}); 139 | $Bgene1S=$start; 140 | $Bchr=$chr; 141 | 142 | }else{ 143 | 144 | 145 | ($chr,$start,$end)=split(/\t/,$gff{$Agene}); 146 | $AgeneNE=$end; 147 | 148 | ($chr,$start,$end)=split(/\t/,$gff{$Bgene}); 149 | $BgeneNE=$end; 150 | 151 | } 152 | } 153 | $n=$n+1; 154 | $block{$align}=$Achr."\t".$Agene1S."\t".$AgeneNE."\t".$Bchr."\t".$Bgene1S."\t".$BgeneNE; 155 | 156 | 157 | } 158 | 159 | close(IN); 160 | 161 | my $block_info; 162 | 163 | while(($align,$block_info)=each %block){ 164 | print OUT $block_info."\n"; 165 | print OUTA $align."\t".$block_info."\n"; 166 | } 167 | close(OUT); 168 | close(OUTA); 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /config1.txt: -------------------------------------------------------------------------------- 1 | chromosomes_units=1000000 # 染色体显示单元,一般以 兆b 显示 2 | chromosomes_reverse=/chr[01]/ 3 | 4 | # 染色体配置 5 | 6 | 7 | fill=yes 8 | label_font=default 9 | label_parallel=yes 10 | label_radius=dims(image,radius)-60p 11 | label_size=45 12 | radius=0.90r # 染色体离圆心的位置 0.9 表示离圆心 90% 的位置 13 | show_label=yes 14 | 15 | default=0.005r # 两个染色体间的间隙 16 | 17 | stroke_color=dgrey 18 | stroke_thickness=2p 19 | thickness=0.03r 20 | 21 | karyotype=/home/manager/share/cir/chr.info # 染色体配置文件路径 22 | 23 | 24 | # 共线性文件配置(基因共线性和区块共线性) 25 | 26 | # 两个共线性文件主配置 27 | bezier_radius=0r 28 | bezier_radius_purity=0.75 29 | color=black 30 | crest=0.5 31 | # 基因共线性 32 | bezier_radius=0r 33 | bezier_radius_purity=0.75 34 | color=set2-8-qual-1 35 | crest=0.5 36 | file=/home/manager/share/cir/link.txt # 基因共线性配置文件路径 37 | radius=0.88r 38 | 39 | 40 | color=red 41 | condition=var(intrachr) # 染色体内部共线性颜色配置 42 | 43 | 44 | color=red 45 | condition=var(interchr) # 染色体间共线性颜色配置 46 | 47 | 48 | thickness=6 49 | z=20 50 | 51 | # 区块共线性 52 | bezier_radius=0r 53 | bezier_radius_purity=0.75 54 | color=230,230,230,0.2 #背景区块颜色,此处设置的是灰色,透明度为 0.2 55 | crest=0.5 56 | ribbon=yes # 区块连线设置为条带状 57 | file=/home/manager/share/cir/genome.txt # 区块共线性配置文件路径 58 | radius=0.88r 59 | 60 | 61 | condition=var(intrachr) 62 | 63 | 64 | condition=var(interchr) 65 | 66 | 67 | thickness=1 68 | z=15 69 | 70 | radius=0.40r 71 | thickness=1 72 | 73 | 74 | # 基因名注释配置 75 | 76 | 77 | color=set2-8-qual-2 78 | file=/home/manager/share/cir/text.txt #基因名注释文件路径 79 | label_font=light 80 | link_color=black 81 | link_dims=0p,2p,5p,2p,2p 82 | link_thickness=2p 83 | r0=0.88r 84 | r1=0.99r 85 | rpadding=5p 86 | show_links=no 87 | type=text 88 | 89 | type=histogram 90 | 91 | show_tick_labels=yes 92 | show_ticks=yes 93 | spacing=10u 94 | 95 | # 染色体刻度的设置 96 | 97 | 98 | color=black 99 | format=%d 100 | multiplier=1e-6 101 | radius=1r 102 | thickness=2p 103 | 104 | size=10p 105 | spacing=5u 106 | 107 | 108 | color=black 109 | format=%d 110 | label_offset=10p 111 | label_size=25p 112 | show_label=yes 113 | size=15p 114 | spacing=25u 115 | thickness=4p 116 | 117 | 118 | 119 | <> 120 | <> 121 | #<> 122 | #<> 123 | #<> 124 | 125 | 126 | 127 | <> 128 | 129 | 130 | 131 | <> 132 | 133 | <> 134 | 135 | -------------------------------------------------------------------------------- /config2.txt: -------------------------------------------------------------------------------- 1 | chromosomes_units=1000000 2 | chromosomes_reverse=/chr[01]/ 3 | 4 | fill=yes 5 | label_font=default 6 | label_parallel=yes 7 | label_radius=dims(image,radius)-60p 8 | label_size=45 9 | radius=0.90r 10 | show_label=yes 11 | 12 | default=0.005r 13 | 14 | stroke_color=dgrey 15 | stroke_thickness=2p 16 | thickness=0.03r 17 | 18 | karyotype=/home/manager/share/cir/chr.info 19 | 20 | bezier_radius=0r 21 | bezier_radius_purity=0.75 22 | color=black 23 | crest=0.5 24 | 25 | bezier_radius=0r 26 | bezier_radius_purity=0.75 27 | color=set2-8-qual-1 28 | crest=0.5 29 | file=/home/manager/share/cir/link.txt 30 | radius=0.88r 31 | 32 | 33 | color=green 34 | condition=var(intrachr) 35 | 36 | 37 | color=green 38 | condition=var(interchr) 39 | 40 | 41 | thickness=8 42 | z=20 43 | 44 | 45 | bezier_radius=0r 46 | bezier_radius_purity=0.75 47 | color=230,230,230,0.2 48 | crest=0.5 49 | ribbon=yes 50 | file=/home/manager/share/cir/genome.txt 51 | radius=0.88r 52 | # 背景区块个性化配置,不同染色体间背景区块的颜色 53 | 54 | color=255,225,255,0.2 55 | condition=between(chr01,chr02) 56 | 57 | 58 | color=102,205,170,0.2 59 | condition=between(chr04,chr05) 60 | 61 | 62 | color=208,32,114,0.2 63 | condition=between(chr01,chr05) 64 | 65 | 66 | color=150,0,0,0.2 67 | condition=between(chr02,chr03) 68 | 69 | 70 | 71 | condition=var(interchr) 72 | 73 | 74 | thickness=1 75 | z=15 76 | 77 | radius=0.40r 78 | thickness=1 79 | 80 | 81 | 82 | color=set2-8-qual-2 83 | file=/home/manager/share/cir/text.txt 84 | label_font=light 85 | link_color=black 86 | link_dims=0p,2p,5p,2p,2p 87 | link_thickness=2p 88 | r0=0.88r 89 | r1=0.99r 90 | rpadding=5p 91 | show_links=no 92 | type=text 93 | 94 | type=histogram 95 | 96 | show_tick_labels=yes 97 | show_ticks=yes 98 | spacing=10u 99 | 100 | color=black 101 | format=%d 102 | multiplier=1e-6 103 | radius=1r 104 | thickness=2p 105 | 106 | size=10p 107 | spacing=5u 108 | 109 | 110 | color=black 111 | format=%d 112 | label_offset=10p 113 | label_size=25p 114 | show_label=yes 115 | size=15p 116 | spacing=25u 117 | thickness=4p 118 | 119 | 120 | 121 | <> 122 | <> 123 | #<> 124 | #<> 125 | #<> 126 | 127 | 128 | 129 | <> 130 | 131 | 132 | 133 | <> 134 | 135 | <> 136 | 137 | -------------------------------------------------------------------------------- /domain_xulie.pl: -------------------------------------------------------------------------------- 1 | #北京组学生物科技有限公司 2 | #email: huangls@biomics.com.cn 3 | 4 | die "perl $0 " unless ( @ARGV == 4 ); 5 | use Math::BigFloat; 6 | use Bio::SeqIO; 7 | use Bio::Seq; 8 | $in = Bio::SeqIO->new( 9 | -file => "$ARGV[1]", 10 | -format => 'Fasta' 11 | ); 12 | $out = Bio::SeqIO->new( 13 | -file => ">$ARGV[2]", 14 | -format => 'Fasta' 15 | ); 16 | my %keep = (); 17 | open IN, "$ARGV[0]" or die "$!"; 18 | 19 | while () { 20 | chomp; 21 | next if /^#/; 22 | 23 | my @a = split /\s+/; 24 | next if $a[6] > $ARGV[3]; 25 | my @b = ( $a[17], $a[18] ); 26 | my $keys = $a[0]; 27 | if ($a[9]==1 and !exists $keep{$keys} ) { #提取序列中第一个结构域所在的序列 28 | $keep{$keys} = \@b; 29 | 30 | } 31 | } 32 | close(IN); 33 | while ( my $seq = $in->next_seq() ) { 34 | my ( $id, $sequence, $desc ) = ( $seq->id, $seq->seq, $seq->desc ); 35 | 36 | if ( exists $keep{$id} ) { 37 | my $subseq = $seq->subseq( $keep{$id}->[0], $keep{$id}->[1]); #截取序列 38 | my $newseqobj = Bio::Seq->new( 39 | -seq => $subseq, 40 | -desc => "domain:$keep{$id}[0]-$keep{$id}[1]", 41 | -id => "$id", 42 | ); 43 | 44 | $out->write_seq($newseqobj); 45 | } 46 | } 47 | $in->close(); 48 | $out->close(); 49 | -------------------------------------------------------------------------------- /fq2fa.pl: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | use strict; 3 | use Getopt::Long; 4 | use File::Basename; 5 | 6 | #---------------------------------------- 模板 ---------------------------------------------------------------- 7 | #定义命令行参数 8 | my $file; 9 | my $help; 10 | my $outFile = ""; 11 | 12 | GetOptions( 13 | "i=s" => \$file, 14 | "h|help" => \$help, 15 | "o|outputfile" => \$outFile, 16 | ); 17 | 18 | #检查参数 19 | 20 | #定义help文档 21 | sub prtHelp { 22 | print "\n$0 options:\n\n"; 23 | print "### Input reads (FASTQ) (Required)\n"; 24 | print " -i \n"; 25 | print " Read file in FASTQ format\n"; 26 | print "\n"; 27 | print "### Other options [Optional]\n"; 28 | print " -h | -help\n"; 29 | print " Prints this help\n"; 30 | print " -o | -outputFile \n"; 31 | print " Output will be stored in the given file\n"; 32 | print " default: By default, file will be stored where the input file is\n"; 33 | print "\n"; 34 | } 35 | 36 | 37 | sub prtError { 38 | my $msg = $_[0]; 39 | print STDERR "+======================================================================+\n"; 40 | printf STDERR "|%-60s|\n", " Error:"; 41 | printf STDERR "|%-70s|\n", " $msg"; 42 | print STDERR "+======================================================================+\n"; 43 | prtUsage(); 44 | exit; 45 | } 46 | 47 | 48 | #定义usage 49 | 50 | sub prtUsage { 51 | print "\nUsage:perl $0 "; 52 | prtHelp(); 53 | } 54 | 55 | 56 | if (defined($help)) { 57 | prtUsage(); 58 | exit; 59 | } 60 | 61 | if (!defined($file)) { 62 | prtError("NO input files are provided"); 63 | } 64 | 65 | #---------------------------------------- 模板 ---------------------------------------------------------------- 66 | 67 | 68 | 69 | 70 | 71 | #-----------------------------------------主程序 --------------------------------------------------------------- 72 | #自定义输出路径 73 | 74 | my ($filename,$filepath) = fileparse ($file); 75 | $outFile = $file . "_fasta" if ($outFile eq ""); 76 | 77 | open I, "<$file" or die "can not open file:$file\n"; 78 | open OF, ">$outFile" or die "can not open file:$outFile\n"; 79 | 80 | sub formatseq { 81 | my $seq = $_[0]; 82 | $seq =~ s/(\w{50})/$1\n/g; 83 | return $seq; 84 | } 85 | #-----------------------------------------主程序 --------------------------------------------------------------- 86 | 87 | 88 | 89 | #下面这是修改前的源代码 90 | 91 | #-----------------------------------------------------------------------------# 92 | #sub foramtseq { 93 | # my $seq = $_[0]; 94 | # my $newseq = ""; 95 | # my $ch = 60; 96 | # for (my $i = 0; $i )) { 106 | chomp $line; 107 | my $id = $line; 108 | $id =~ s/^\@//; 109 | print OF ">$id\n"; 110 | my $seq = ; 111 | print OF formatseq($seq); 112 | ; 113 | ; 114 | } 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /gene_family_analysis_pipeline.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | # ----------------------------------------------- 基因家族分析 --------------------------------------------------- 4 | 5 | 6 | #-------------------------- 1. 搜索物种的基因家族信息 ---------------------------------------- 7 | 8 | Things required: 9 | 所需文件: 10 | 11 | 基因组 fasta 文件 whole genome about your species 12 | 13 | 基因组 CDS fasta 文件 CDS file 14 | 15 | 基因组 蛋白质 fasta 文件 protein file 16 | 17 | 基因注释文件 gff gff file 18 | 19 | 基因家族 pfam 文件 your protein PFAM accession number (.hmm file) 20 | 21 | 22 | #下载拟南芥基因组信息 23 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz 24 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz 25 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz 26 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.41.gff3.gz 27 | # 28 | #解压压缩文件 decompressed gz file 29 | #gunzip *gz 30 | 31 | # optional step (just change some file format) 32 | 33 | #处理GFF 文件里面ID中一些不必要的信息,gene: transcript: 删除;与蛋白质中的ID保持一致:Arabidopsis_thaliana.TAIR10.pep.all.fa 34 | #sed -i 's#gene:##' Arabidopsis_thaliana.TAIR10.41.gff3 35 | #sed -i 's#transcript:##' Arabidopsis_thaliana.TAIR10.41.gff3 36 | #sed -i 's#CDS:##' Arabidopsis_thaliana.TAIR10.41.gff3 37 | 38 | 39 | #------------------- gff文件 获取基因与mRNA的对应关系 40 | 41 | perl script/mRNAid_to_geneid.pl Arabidopsis_thaliana.TAIR10.41.gff3 mRNA2geneID.txt 42 | perl script/geneid_to_mRNAid.pl Arabidopsis_thaliana.TAIR10.41.gff3 geneid2mRNAid.txt 43 | 44 | #与Arabidopsis_thaliana.TAIR10.pep.all.fa 文件中的ID保持一致,如果第20-21行没有做,这里可以补做; 45 | #sed -i 's#gene:##' mRNA2geneID.txt 46 | #sed -i 's#transcript:##' mRNA2geneID.txt 47 | #sed -i 's#CDS:##' mRNA2geneID.txt 48 | 49 | #--------------------- 在 蛋白文件 中搜索 基因家族 保守结构域 50 | 51 | hmmsearch --domtblout WRKY_hmm_out.txt --cut_tc WRKY.hmm Arabidopsis_thaliana.TAIR10.pep.all.fa 52 | 53 | 54 | # ------------------- 筛选比对搜索结果,并提取 保守结构域 序列 55 | 56 | #提取结构域序列,脚本最后的evalue参数1.2e-28,根据实际情况可调,大于这个E值脚本会跳过这个一行;注意脚本提取的是第一个domain,如要提取其他domain,请修改脚本27行$a[9]==1为第一个,$a[9]==2为第二个,依次类推 57 | 58 | perl script/domain_xulie.pl WRKY_hmm_out.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_domain.fa 1.2e-28 59 | 60 | 61 | 62 | ###########以下部分为建立物种特异模型再次搜索,可根据自己基因家族情况选做这部分内容############################# ( 这一步可选 ) 63 | 64 | 65 | #clusterW多序列比对快捷方法 66 | 67 | echo "1\nWRKY_domain.fa\n2\n1\nWRKY_domain.aln\nWRKY_domain.dnd\nX\n\n\nX\n" |clustalw 68 | 69 | #利用比对结果建立物种特异hmm模型 70 | hmmbuild WRKY_domain_new.hmm WRKY_domain.aln 71 | 72 | #新建物种特异hmm模型,再次搜索 73 | 74 | hmmsearch --domtblout WRKY_domain_new_out.txt --cut_tc WRKY_domain_new.hmm Arabidopsis_thaliana.TAIR10.pep.all.fa 75 | 76 | ############################################################################################################ 77 | 78 | 79 | 80 | # ------------------------ 转录本选择,去冗余 81 | 82 | #筛选 hmm搜索结果,可以用excel手动筛选,筛选标准, 83 | #1.E-value值小于0.001; 84 | #2.如果有多个转录本选第一个转录本 85 | #3.只有一个转录本,就选那个转录本 86 | 87 | #筛选EValue <0.001 88 | #如果只想用hmmer搜索一次,可将下面的文件:WRKY_domain_new_out.txt 替换成 57行 生成的文件:WRKY_hmm_out.txt 89 | grep -v "^#" WRKY_domain_new_out.txt|awk '$7<0.001 {print}' >WRKY_domain_new_out_selected.txt 90 | 91 | 92 | #去除重复的hmmer搜索的转录本ID,多个转录本ID保留一个作为基因的代表,此步建议对脚本输出的文件手动筛选,挑选ID: 93 | perl script/select_redundant_mRNA.pl mRNA2geneID.txt WRKY_domain_new_out_selected.txt WRKY_remove_redundant_IDlist.txt 94 | 95 | 96 | #请手动挑选完mRNA的ID放在第一列,也就是挑选一个转录本ID代表这个基因,存成新的文件WRKY_removed_redundant_IDlist.txt: 97 | 98 | 99 | # ------------------------ 提取 筛选过后 转录本所对应的 基因的序列 100 | 101 | #利用脚本得到对应基因的蛋白序列,脚本会读取第一个文件的第一列ID,把对应ID的序列提取出来: 102 | perl script/get_fa_by_id.pl WRKY_removed_redundant_IDlist.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_pep_need_to_confirm.fa 103 | 104 | 105 | #将上面WRKY_pep_need_to_confirm.fa文件中的蛋白序列,再手动验证一下,把不需要的ID删除,最终确认:WRKY_removed_redundant_IDlist.txt 存成新文件:WRKY_removed_redundant_and_confirmed_IDlist.txt 106 | 107 | #手动确认结构域,CDD,SMART,PFAM 108 | #确定分子量大小:http://web.expasy.org/protparam/ 109 | #perl script/stat_protein_fa.pl WRKY_pep_need_to_confirm.fa WRKY_pep_need_to_confirm.MW.txt 110 | #三大数据库网站,筛选之后去除一些不确定的基因ID,最终得到可靠的基因家族基因列表,存储在文件:WRKY_removed_redundant_and_confirmed_IDlist.txt ; 111 | 112 | 113 | #脚本提取hmm结果文件,重新筛选一下hmm的结果: 114 | 115 | perl script/get_data_by_id.pl WRKY_removed_redundant_and_confirmed_IDlist.txt WRKY_domain_new_out_selected.txt WRKY_domain_new_out_removed_redundant.txt 116 | 117 | #截取得到序列上的保守结构域序列,注意脚本提取的是第一个domain,如要提取其他domain,请修改脚本27行$a[9]==1为第一个,$a[9]==2为第二个,依次类推 118 | 119 | perl script/domain_xulie.pl WRKY_domain_new_out_removed_redundant.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_domain_confirmed.fa 0.1 120 | 121 | #得到对应基因的蛋白序列全长: 122 | 123 | perl script/get_fa_by_id.pl WRKY_domain_new_out_removed_redundant.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_pep_confirmed.fa 124 | 125 | #得到对应基因的cds序列: 126 | 127 | perl script/get_fa_by_id.pl WRKY_domain_new_out_removed_redundant.txt Arabidopsis_thaliana.TAIR10.cds.all.fa WRKY_cds_confirmed.fa 128 | 129 | 130 | 131 | 132 | 133 | 134 | ########################进化树分析########################################## 135 | 136 | 所需文件 137 | 138 | 上面搜索到的 基因家族蛋白质保守结构域序列(也可以用蛋白序列全长) 139 | 140 | mega 分析绘制 树 141 | 142 | Evoview 进行美化 143 | 144 | #cd $workdir 回到工作路径 145 | 146 | mkdir gene_tree_analysis 147 | cd gene_tree_analysis 148 | cp ../WRKY_domain_confirmed.fa . 149 | cp ../WRKY_pep_confirmed.fa . 150 | cp ../WRKY_cds_confirmed.fa . 151 | cp ../WRKY_domain_new_out_removed_redundant.txt . 152 | 153 | 154 | #########################利用meme软件做motif分析################################33 155 | 156 | 157 | 所需文件 158 | 159 | 基因家族蛋白质 全长 160 | 161 | #cd $workdir 162 | mkdir meme_motif_analysis 163 | cd meme_motif_analysis 164 | #搜索结构域: 165 | #-nmotifs 10 搜索motif的总个数 166 | #-minw 6 motif的最短长度 167 | #-maxw 50 motif的最大长度 168 | 169 | # meme 输出 motif 图 170 | 171 | /biosoft/meme/meme-v4.12.0/bin/meme ../WRKY_pep_confirmed.fa -protein -oc ./ -nostatus -time 18000 -maxsize 6000000 -mod anr -nmotifs 10 -minw 6 -maxw 100 172 | 173 | 174 | 175 | 176 | ##################################基因结构分析structure#################### 177 | 178 | 所需文件 179 | 180 | 基因蛋白结构域所对应的 geneID 信息 181 | 182 | gff 文件 183 | 184 | #cd $workdir 回到工作路径 185 | cd $workdir 186 | mkdir gene_structure_analysis 187 | cd gene_structure_analysis 188 | cp ../WRKY_domain_new_out_removed_redundant.txt . 189 | 190 | 191 | 192 | #获得基因的在染色体上的外显子,CDS,UTR位置信息,用于绘制基因结构图 193 | 194 | perl ../script/get_gene_exon_from_gff.pl -in1 WRKY_domain_new_out_removed_redundant.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out gene_exon_info.gff 195 | 196 | # 将上述的 gff文件放到 GSDS 网站即可绘制图形 197 | 198 | ################################基因定位到染色体############################################### (蜈蚣图) 199 | 200 | 所需文件 201 | 202 | 基因家族 geneID 203 | 204 | 染色体 长度信息 205 | 206 | #cd $workdir 回到工作路径 207 | cd $workdir 208 | mkdir map_to_chr 209 | cd map_to_chr 210 | cp ../WRKY_domain_new_out_removed_redundant.txt . #WRKY基因家族文件 211 | 212 | 213 | #获得基因的在染色体上的位置信息,用于绘制染色体位置图,注意提取的是基因位置还是mRNA位置,以下代码是提取的 mRNA位置 214 | perl ../script/get_gene_weizhi.pl -in1 WRKY_domain_new_out_removed_redundant.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out mrna_location.txt 215 | 216 | #获得基因组染色体长度: 217 | samtools faidx ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa 218 | 219 | cp ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai . 220 | 221 | #绘图参考:http://www.omicsclass.com/article/397 222 | 223 | 224 | ###############################blast方法 复制基因查找 及KAKS分析################################# 225 | 226 | 基因串联重复 227 | 228 | 两个基因比对率 > 70% (相对于较长的基因),且基因的比对相似性 > 70% 229 | 两个基因在同一条染色体上,且位置 < 100 kb 230 | 231 | 232 | 所需文件 233 | 234 | 基因家族 geneID 235 | 236 | 基因家族 CDS 序列 237 | 238 | 239 | #cd $workdir 回到工作路径 240 | mkdir gene_duplication_kaks_blast 241 | cd gene_duplication_kaks_blast 242 | cp ../WRKY_domain_new_out_removed_redundant.txt . 243 | cp ../WRKY_cds_confirmed.fa . 244 | #blast建库,DNA序列,all vs all 比对,结果说明见:http://www.omicsclass.com/article/505 245 | makeblastdb -in WRKY_cds_confirmed.fa -dbtype nucl -title WRKY_cds_confirmed.fa 246 | blastall -i WRKY_cds_confirmed.fa -d WRKY_cds_confirmed.fa -p blastn -e 1e-20 -m 8 -o WRKY_cds_confirmed_blast.out 247 | 248 | #获取基因cds序列的长度: 249 | samtools faidx WRKY_cds_confirmed.fa 250 | 251 | perl ../script/KAKS_SHAIXUAN.pl -in1 WRKY_cds_confirmed.fa.fai -in2 WRKY_cds_confirmed_blast.out -out duplication_gene.out 252 | 253 | 254 | # 对上面的结果进行去重复 255 | 基因之间两两比对,会存在 AvsB BvsA 因此用下面的脚本去重复 256 | 257 | perl ../clean_blastall.pl duplication_gene.out 258 | 259 | 260 | ###kaks 分析### 261 | 262 | 所需文件 263 | 上面的 基因家族 复制基因 264 | 265 | 266 | #提取成对基因的序列 267 | echo "AT1G66600.1\nAT1G66560.1" >dupid.txt 268 | perl ../script/get_fa_by_id.pl dupid.txt WRKY_cds_confirmed.fa dup_gene_paired1.fa 269 | 270 | #多序列比对 clustalw 271 | echo "1\ndup_gene_paired1.fa\n2\n9\n4\n\n1\ndup_gene_paired1.aln\ndup_gene_paired1.dnd\nX\n\n\nX\n" |clustalw 272 | 273 | #格式转换axt 如果遇到报错not equal,可参考:http://www.omicsclass.com/article/700 ( KAKS 一次只能进行 一对基因的分析 ) 274 | /biosoft/KaKs_Calculator2.0/src/AXTConvertor dup_gene_paired1.aln dup_gene_paired1.axt 275 | /biosoft/KaKs_Calculator2.0/bin/Linux/KaKs_Calculator -i dup_gene_paired1.axt -o dup_gene_paired1.kaks.result 276 | 277 | #分离时间计算:http://www.omicsclass.com/question/896 278 | 279 | 280 | ###########################################以下blast为可选分析内容######################################################################## 281 | 282 | 所需文件 283 | 284 | 从 NCBI 下载蛋白序列 285 | 286 | #blastp比对寻找基因家族成员,WRKY部分 287 | #参考文献:https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4955-8 288 | #NCBI上搜索WRKY蛋白序列:搜索条件:WRKY[title] NOT putative[title] AND plants[filter] 289 | 290 | 291 | #blast比对首先建库 292 | #makeblastdb -in WRKY_NCBI_pep.fasta -dbtype prot -title WRKY_NCBI_pep.fasta #蛋白质序列 293 | # 294 | #blastp比对 295 | #blastall -i ../Arabidopsis_thaliana.TAIR10.pep.all.fa -d WRKY_NCBI_pep.fasta -p blastp -e 1e-10 -b 1 -v 1 -m 8 -o ncbi_WRKY_blast.out 296 | 297 | 利用上述的比对结果提取 序列间的局部匹配区域,利用匹配区域,进行 clustalw 多序列比对,然后通过 hmmbuild 构建物种的hmm文件 298 | 299 | 300 | #######################基因上游顺势作用原件分析####################################### 301 | 302 | 所需文件 303 | 304 | geneID 信息 305 | 306 | gff 文件 307 | 308 | 基因组文件 309 | 310 | #回到工作路径 311 | cd $workdir 312 | mkdir gene_promoter 313 | cd gene_promoter 314 | cp ../WRKY_domain_new_out_removed_redundant.txt . 315 | 316 | #得到基因在染色体上的位置,此脚本会把基因组所有的序列读入内存,如果基因组较大,可能因为内存不足使脚本运行不成功,可以分染色体分开分析: 317 | perl ../script/get_gene_weizhi.pl -in1 WRKY_domain_new_out_removed_redundant.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out mrna_location.txt 318 | #根据位置信息提取,promoter序列 1500 319 | perl ../script/get_promoter.pl ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa mrna_location.txt promoter.fa 320 | 321 | #生成 GSDS配置文件 322 | cat WRKY_domain_new_out_removed_redundant.txt|awk 'BEGIN{OFS="\t"}{print $1,"0","1500","CDS","."}' >gene.bed 323 | #生成feature文件 324 | cat PlantCARE_9210__plantCARE/plantCARE_output_PlantCARE_9210.tab|grep "Arabidopsis"|awk -F"\t" 'BEGIN{OFS="\t"} {print $1,$4,$4+length($3),$2}'>feature.bed 325 | -------------------------------------------------------------------------------- /geneid_to_mRNAid.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use Cwd qw(abs_path getcwd); 4 | use Getopt::Long; 5 | use Data::Dumper; 6 | 7 | die "perl $0 " unless(@ARGV==2); 8 | my$gff=$ARGV[0]; 9 | my%gene=(); 10 | my%gene_region=(); 11 | 12 | open IN,"$gff" or die "$!"; 13 | 14 | while(){ 15 | chomp; 16 | next if (/^#/); 17 | my@tmp=split(/\t/); 18 | 19 | if($tmp[2] =~/^gene/){ 20 | my($id)=($tmp[8]=~/ID=([^;]+)/); 21 | $gene{$id}=[]; 22 | $gene_region{$id}="$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]"; 23 | } 24 | if($tmp[2] =~/mRNA|transcript/i){ 25 | my($id)=($tmp[8]=~/ID=([^;]+)/); 26 | my($pid)=($tmp[8]=~/Parent=([^;]+)/); 27 | 28 | 29 | if(exists $gene{$pid}){ 30 | push @{$gene{$pid}},$id; 31 | }else{ 32 | die "please check mRNA $id has gene ID \n"; 33 | } 34 | 35 | } 36 | } 37 | 38 | close(IN); 39 | 40 | open OUT ,">$ARGV[1]" or die "$!"; 41 | print OUT "#gene_ID\tchr\tstart\tend\tstrand\ttranscript_id\n"; 42 | for my $id(keys %gene) { 43 | print OUT "$id\t$gene_region{$id}\t".join("\t",sort @{$gene{$id}})."\n"; 44 | } 45 | 46 | close(OUT); 47 | -------------------------------------------------------------------------------- /genome.txt: -------------------------------------------------------------------------------- 1 | 2 | # 背景区块的配置文件 3 | # 基于mcscanx分析结果,选取染色体上对应基因的第一个和最后一个 4 | 5 | chr04 235985192 238520393 chr05 89127420 77919313 6 | chr01 177745754 179756069 chr01 263221976 261689189 7 | chr02 226660231 227489681 chr04 21034937 23267356 8 | chr01 257237981 259662579 chr05 16839069 18254076 9 | chr02 7313083 10264901 chr04 29042531 25887059 10 | chr01 72781187 78190879 chr02 213991895 211892225 11 | chr03 10161437 12538552 chr06 126316653 127947519 12 | chr01 13199428 16659014 chr05 33280286 36117488 13 | chr02 11039996 22253136 chr05 197970559 185883292 14 | chr01 247942338 252515573 chr05 21179147 18361206 15 | chr03 9637008 38688558 chr08 18392025 3044143 16 | chr04 235545789 235928643 chr05 89236231 93996019 17 | chr01 263545596 301474757 chr05 13038255 960992 18 | chr02 224426294 224827815 chr04 16011093 15055409 19 | chr04 235076669 235495146 chr05 101090117 95339508 20 | chr01 100393815 104124989 chr05 43593097 44784264 21 | chr05 40695082 42005127 chr06 89614367 90338798 22 | chr01 79166192 82359578 chr02 15670012 13962160 23 | chr01 3635462 4144525 chr01 78813994 79779467 24 | chr02 191431054 193577343 chr05 207321134 206112159 25 | chr01 3131661 3450838 chr01 228088856 226688602 26 | chr01 225786615 242895913 chr05 20236510 33124260 27 | chr02 236303593 238275076 chr05 3518540 4544483 28 | chr05 57879365 64457951 chr06 85758193 80992737 29 | chr04 237005472 238418562 chr06 90241588 85971864 30 | chr03 12324004 15125672 chr06 120931084 123310948 31 | chr02 28317840 32416047 chr04 143101873 135253384 32 | chr02 25672897 27242359 chr04 147014326 144062146 33 | chr02 38300581 42362725 chr04 129149685 125170216 34 | chr03 175562023 177176703 chr06 146135846 146803277 35 | chr02 140597842 146529585 chr03 140747563 142885052 36 | chr02 147210539 159658223 chr04 191153761 185126620 37 | chr01 186306392 187840847 chr01 188087101 189667363 38 | chr01 209188508 210401506 chr04 218276481 216110408 39 | -------------------------------------------------------------------------------- /get_data_by_id.pl: -------------------------------------------------------------------------------- 1 | print "perl $0 \n" and die unless(@ARGV==3); 2 | 3 | open IN,"$ARGV[0]" or die "$!"; 4 | 5 | my%t; 6 | my$head; 7 | while(){ 8 | chomp; 9 | my@tmp=split(/\s+/); 10 | 11 | 12 | $t{$tmp[0]}=1; 13 | } 14 | 15 | close(IN); 16 | 17 | open IN,"$ARGV[1]" or die "$!"; 18 | 19 | open OUT,">$ARGV[2]" or die "$!"; 20 | while(){ 21 | chomp; 22 | if (/^#/){ 23 | print OUT "$_\n"; 24 | next ; 25 | } 26 | 27 | my@tmp=split(/\s+/); 28 | 29 | if(exists $t{$tmp[0]}){ 30 | print OUT "$_\n"; 31 | }else{ 32 | #print "$tmp[0]\n"; 33 | } 34 | } 35 | close(IN); 36 | 37 | close(OUT); 38 | -------------------------------------------------------------------------------- /get_fa_by_id.pl: -------------------------------------------------------------------------------- 1 | #北京组学生物科技有限公司 2 | #email: huangls@biomics.com.cn 3 | 4 | die "perl $0 " unless ( @ARGV == 3 ); 5 | use Math::BigFloat; 6 | use Bio::SeqIO; 7 | use Bio::Seq; 8 | 9 | #读入蛋白序列 10 | $in = Bio::SeqIO->new( 11 | -file => "$ARGV[1]", 12 | -format => 'Fasta' 13 | ); 14 | 15 | #输出蛋白序列: 16 | $out = Bio::SeqIO->new( 17 | -file => ">$ARGV[2]", 18 | -format => 'Fasta' 19 | ); 20 | 21 | #读取需要提取基因ID 22 | my %keep = (); 23 | open IN, "$ARGV[0]" or die "$!"; 24 | 25 | while () { 26 | chomp; 27 | next if /^#/; 28 | my @a = split /\s+/; 29 | $keep{$a[0]}=1; 30 | } 31 | close(IN); 32 | 33 | #输出想要的基因的序列 34 | while ( my $seq = $in->next_seq() ) { 35 | my ( $id, $sequence, $desc ) = ( $seq->id, $seq->seq, $seq->desc ); 36 | 37 | if ( exists $keep{$id} ) { 38 | $out->write_seq($seq); 39 | } 40 | } 41 | $in->close(); 42 | $out->close(); -------------------------------------------------------------------------------- /get_fa_by_id_from_bed.pl: -------------------------------------------------------------------------------- 1 | #北京组学生物科技有限公司 2 | #email: huangls@biomics.com.cn 3 | 4 | die "perl $0 " unless ( @ARGV == 3 ); 5 | use Math::BigFloat; 6 | use Bio::SeqIO; 7 | use Bio::Seq; 8 | 9 | #读入蛋白序列 10 | $in = Bio::SeqIO->new( 11 | -file => "$ARGV[1]", 12 | -format => 'Fasta' 13 | ); 14 | 15 | #输出蛋白序列: 16 | $out = Bio::SeqIO->new( 17 | -file => ">$ARGV[2]", 18 | -format => 'Fasta' 19 | ); 20 | 21 | #读取需要提取基因ID 22 | my %keep = (); 23 | open IN, "$ARGV[0]" or die "$!"; 24 | 25 | while () { 26 | chomp; 27 | next if /^#/; 28 | my @a = split /\t/; 29 | $keep{"$a[3].1"}=1; ##注意提取第一个转录本 30 | } 31 | close(IN); 32 | 33 | #输出想要的基因的序列 34 | while ( my $seq = $in->next_seq() ) { 35 | my ( $id, $sequence, $desc ) = ( $seq->id, $seq->seq, $seq->desc ); 36 | 37 | if ( exists $keep{$id} ) { 38 | $out->write_seq($seq); 39 | } 40 | } 41 | $in->close(); 42 | $out->close(); -------------------------------------------------------------------------------- /get_fa_by_id_from_gff.pl: -------------------------------------------------------------------------------- 1 | #script www.omicsclass.com 2 | die "perl $0 " unless(@ARGV==3); 3 | 4 | use Bio::SeqIO; 5 | use Bio::Seq; 6 | 7 | my$in = Bio::SeqIO->new(-file => "$ARGV[1]" , 8 | -format => 'Fasta'); 9 | my$out = Bio::SeqIO->new(-file => ">$ARGV[2]" , 10 | -format => 'Fasta'); 11 | my%keep=(); 12 | 13 | open IN ,"$ARGV[0]" or die "$!"; 14 | while(){ 15 | chomp; 16 | next if /^#/; 17 | my@tmp=split(/\s+/); 18 | $keep{"$tmp[1].1"}=1; 19 | } 20 | close(IN); 21 | while ( my $seq = $in->next_seq() ) { 22 | my($id,$sequence,$desc)=($seq->id,$seq->seq,$seq->desc); 23 | if( exists $keep{$id}){ 24 | $out->write_seq($seq); 25 | } 26 | } 27 | $in->close(); 28 | $out->close(); 29 | 30 | -------------------------------------------------------------------------------- /get_gene_bed.pl: -------------------------------------------------------------------------------- 1 | use Getopt::Long; 2 | my %opts; 3 | use Data::Dumper; 4 | GetOptions( \%opts, "in1=s", "out=s", "h" ); 5 | if ( !defined( $opts{in1} ) || !defined( $opts{out} ) || defined( $opts{h} ) ) { 6 | &USAGE; 7 | } 8 | open( IN1, "$opts{in1}" ) || die "open $opts{in1} failed\n"; 9 | open( OUT, ">$opts{out}" ) || die "open $opts{out} failed\n"; 10 | 11 | while () { 12 | chomp; 13 | my @a = split /\t/, $_; 14 | if ( $a[2] eq "gene" ) { 15 | #if ($a[2] eq "mRNA") { 16 | $a[8] =~ m/ID=([^;]*)/; #注意这里匹配基因的ID信息 17 | $id = $1; 18 | 19 | print OUT "$a[0]\t$a[3]\t$a[4]\t$id\t$a[7]\t$a[6]\n"; 20 | 21 | } 22 | 23 | } 24 | close OUT; 25 | close IN1; 26 | close IN2; 27 | 28 | sub USAGE { 29 | print "usage: perl $0 -in1 gff -out gene_location.bed "; 30 | exit; 31 | } 32 | -------------------------------------------------------------------------------- /get_gene_exon_from_gff.pl: -------------------------------------------------------------------------------- 1 | use Getopt::Long; 2 | my %opts; 3 | use Data::Dumper; 4 | GetOptions( \%opts, "in1=s", "in2=s", "out=s", "h" ); 5 | if ( !defined( $opts{in1} ) 6 | || !defined( $opts{in2} ) 7 | || !defined( $opts{out} ) 8 | || defined( $opts{h} ) ) 9 | { 10 | &USAGE; 11 | } 12 | open( IN1, "$opts{in1}" ) || die "open $opts{in1} failed\n"; 13 | open( IN2, "$opts{in2}" ) || die "open $opts{in2} failed\n"; 14 | open( OUT, ">$opts{out}" ) || die "open $opts{out} failed\n"; 15 | my %gffs; 16 | while () { 17 | chomp; 18 | next if /^#/; 19 | my @b = split/\s+/, $_; 20 | $gffs{$b[0]} = 1; 21 | } 22 | 23 | #print Dumper(\%gffs); 24 | while () { 25 | chomp; 26 | next if (/^#/); 27 | my @a = split /\t/, $_; 28 | next if $a[2]=~/exon/i; 29 | if ($a[2] =~/^mRNA$/i or $a[2] =~/^transcript$/i ) { 30 | ($id1) = ($a[8] =~ m/ID=([^;]*)/); 31 | 32 | }elsif ( $a[2] =~/^CDS$/i or $a[2] =~/utr/i ) { 33 | 34 | ($id1) = ($a[8] =~ m/Parent=([^;]*)/); 35 | }else{ 36 | next; 37 | } 38 | 39 | if ( exists $gffs{$id1} ) { 40 | print OUT "$_\n"; 41 | } 42 | 43 | } 44 | close OUT; 45 | close IN1; 46 | close IN2; 47 | 48 | sub USAGE { 49 | print "usage: perl $0 -in1 mRNA_id.txt -in2 genome.gff3 -out gene_location.txt "; 50 | exit; 51 | } 52 | -------------------------------------------------------------------------------- /get_gene_position.pl: -------------------------------------------------------------------------------- 1 | 2 | open IN,"$ARGV[0]" or die "$!"; 3 | open OUT,">$ARGV[1]" or die "$!"; 4 | while(){ 5 | chomp; 6 | 7 | next if /^#/; 8 | @tmp=split(/\t/); 9 | #if($tmp[2]=~/gene/ && $tmp[0]=~/^\d+/ && $tmp[-1]=~/protein_coding/){ 10 | if($tmp[2]=~/gene/){ 11 | my($id)=($tmp[-1]=~/ID=([^;]+)/); 12 | print OUT "$tmp[0]\t$id\t$tmp[3]\t$tmp[4]\n"; 13 | } 14 | } 15 | 16 | close(IN); 17 | close(OUT); 18 | -------------------------------------------------------------------------------- /get_gene_weizhi.pl: -------------------------------------------------------------------------------- 1 | use Getopt::Long; 2 | my %opts; 3 | use Data::Dumper; 4 | GetOptions (\%opts,"in1=s","in2=s","out=s","h"); 5 | if (! defined($opts{in1}) ||! defined($opts{in2})||! defined($opts{out}) || defined($opts{h})){ 6 | &USAGE; 7 | } 8 | open (IN1,"$opts{in1}") || die "open $opts{in1} failed\n"; 9 | open (IN2,"$opts{in2}") || die "open $opts{in2} failed\n"; 10 | open (OUT,">$opts{out}") || die "open $opts{out} failed\n"; 11 | my%gffs; 12 | while () { 13 | next if (/^#/); 14 | chomp; 15 | my@b=split,$_; 16 | $keys= $b[0]; 17 | 18 | $values= $b[0]; 19 | 20 | $gffs{$keys} = $values; 21 | 22 | } 23 | 24 | while () { 25 | chomp; 26 | my @a=split /\t/,$_; 27 | #if ($a[2] eq "gene") { 28 | if ($a[2] eq "mRNA") { 29 | $a[8]=~ m/ID=([^;]*)/;#注意这里匹配基因的ID信息 30 | $id1=$1; 31 | 32 | if ( exists $gffs{$id1} ) { 33 | 34 | print OUT "$gffs{$id1}\t$a[0]\t$a[3]\t$a[4]\t$a[6]\n"; 35 | } 36 | } 37 | 38 | } 39 | close OUT; 40 | close IN1; 41 | close IN2; 42 | 43 | sub USAGE { 44 | print "usage: perl $0 -in1 gene_id.txt -in2 genome.gff3 -out gene_location.txt "; 45 | exit; 46 | } -------------------------------------------------------------------------------- /get_gtf.pl: -------------------------------------------------------------------------------- 1 | use Getopt::Long; 2 | my %opts; 3 | use Data::Dumper; 4 | GetOptions (\%opts,"in1=s","in2=s","out=s","h"); 5 | if (! defined($opts{in1}) ||! defined($opts{in2})||! defined($opts{out}) || defined($opts{h})){ 6 | &USAGE; 7 | } 8 | open (IN1,"$opts{in1}") || die "open $opts{in1} failed\n"; 9 | open (IN2,"$opts{in2}") || die "open $opts{in2} failed\n"; 10 | open (OUT,">$opts{out}") || die "open $opts{out} failed\n"; 11 | my%gffs; 12 | while () { 13 | chomp; 14 | my@b=split,$_; 15 | $keys= $b[0]; 16 | # print "$keys\n"; 17 | $values= $b[0]; 18 | # print "$values"; 19 | $gffs{$keys} = $values; 20 | #print "$gffs{$_}\n"; 21 | } 22 | #print Dumper(\%gffs); 23 | while () { 24 | chomp; 25 | my @a=split /\t/,$_; 26 | 27 | $a[8]=~ m/transcript_id "([^\"]*)/; 28 | $id1=$1; 29 | #print "$id1\t"; 30 | if ( exists $gffs{$id1} ) { 31 | # print "aaa/n"; 32 | #print OUT join ("\t",@a)."\n"; 33 | print OUT "$a[0]\t$a[1]\t$a[2]\t$a[3]\t$a[4]\t$a[5]\t$a[6]\t$a[7]\ttranscript_id \"$gffs{$id1}\";\n"; 34 | } 35 | 36 | 37 | } 38 | 39 | close OUT; 40 | close IN1; 41 | close IN2; 42 | sub USAGE { 43 | print "usage: perl test1.pl -in1 gene_id.txt -in2 基因组gtf文件 -out 结果文件"; 44 | exit; 45 | } -------------------------------------------------------------------------------- /get_promoter.pl: -------------------------------------------------------------------------------- 1 | die "perl $0 " unless(@ARGV==3 ); 2 | use Math::BigFloat; 3 | use Bio::SeqIO; 4 | use Bio::Seq; 5 | $in = Bio::SeqIO -> new(-file => "$ARGV[0]", 6 | -format => 'Fasta'); 7 | $out = Bio::SeqIO -> new(-file => ">$ARGV[2]", 8 | -format => 'Fasta'); 9 | my %keep=() ; 10 | open IN,"$ARGV[0]" or die "$!"; 11 | my%ref=(); 12 | while ( my $seq = $in->next_seq() ) { 13 | my($id,$sequence,$desc)=($seq->id,$seq->seq,$seq->desc); 14 | 15 | $ref{$id}=$seq; 16 | 17 | } 18 | 19 | $in->close(); 20 | 21 | open IN,"$ARGV[1]" or die "$!"; 22 | while () { 23 | chomp; 24 | next if /^#/; 25 | my @a= split /\t/; 26 | my$seq=0; 27 | if(exists $ref{$a[1]}){ 28 | $seq=$ref{$a[1]}; 29 | }else{ 30 | print "chromosome $a[1] not in reference file\n"; 31 | next; 32 | } 33 | 34 | print "$a[1]"; 35 | if( $a[4] eq "-" ){ 36 | $start= $a[3]+1; 37 | $end=$a[3]+1500; 38 | if($end>$seq->length){ 39 | print "Note: $seq->id: upstream don't have enough sequence to cut for $a [0] and skiped\n"; 40 | next; 41 | 42 | } 43 | 44 | my$seq_string=$seq->subseq($start,$end); 45 | my$newseqobj1=Bio::Seq -> new(-seq => $seq_string, 46 | -id => "$a[0]" 47 | ) ; 48 | my$reseq = $newseqobj1 ->revcom(); 49 | $out->write_seq($reseq); 50 | }elsif ( $a[4] eq "+" ){ 51 | $start= $a[2]-1500; 52 | if ($start<0){ 53 | print "Note: $seq->id: upstream don't have enough sequence to cut for $a[0] and skiped\n"; 54 | next; 55 | } 56 | $end=$a[2]-1; 57 | 58 | my$seq_string=$seq->subseq($start,$end); 59 | 60 | my$newseqobj1=Bio::Seq -> new(-seq => $seq_string, 61 | -id => "$a[0]" 62 | 63 | ) ; 64 | 65 | $out->write_seq($newseqobj1); 66 | } 67 | 68 | } 69 | close (IN); 70 | $in->close(); 71 | $out->close(); 72 | -------------------------------------------------------------------------------- /get_tandem_gene.pl: -------------------------------------------------------------------------------- 1 | use Data::Dumper; 2 | use Getopt::Long; 3 | use strict; 4 | use Cwd qw(abs_path getcwd); 5 | my %opts; 6 | 7 | GetOptions (\%opts,"id=s","tandem=s","od=s","name=s"); 8 | 9 | 10 | if (! defined($opts{id}) ||! defined($opts{tandem})||! defined($opts{name}) || defined($opts{h})){ 11 | &USAGE; 12 | } 13 | 14 | sub USAGE { 15 | 16 | 17 | print "perl $0 -id gene_family.id -tandem gene.tandem -name gene_famil -od ./\n"; 18 | exit; 19 | } 20 | 21 | 22 | my $od=$opts{od}; 23 | $od||=getcwd; 24 | $od=abs_path($od); 25 | unless(-d $od){ mkdir $od;} 26 | 27 | ####get target gene id 28 | 29 | my $gene; 30 | my @info; 31 | my %hashG; 32 | open (IN,"$opts{id}") || die "open $opts{id} failed\n"; 33 | while(){ 34 | chomp; 35 | @info=split(/\s+/,$_); 36 | $gene=$info[0]; 37 | $hashG{$gene}=$gene; 38 | } 39 | close(IN); 40 | 41 | 42 | #######select tandem 43 | 44 | 45 | my $Agene; 46 | my $Bgene; 47 | open(OUT,">$od/$opts{name}.tandem")||die "open $od/$opts{name}.tandem failed\n"; 48 | open (IN,"$opts{tandem}") || die "open $opts{tandem} failed\n"; 49 | while(){ 50 | chomp; 51 | @info=split(/,/,$_); 52 | $Agene=$info[0]; 53 | $Bgene=$info[1]; 54 | if(exists $hashG{$Agene} && exists $hashG{$Bgene}){ 55 | print OUT $Agene."\t".$Bgene."\n"; 56 | } 57 | 58 | } 59 | close(IN); 60 | close(OUT); -------------------------------------------------------------------------------- /link.txt: -------------------------------------------------------------------------------- 1 | 2 | # 基因组内共线性配置文件(某个基因家族或者自己想展示的基因对应关系) 3 | 4 | chr01 230647852 230649724 chr05 23322345 23324725 5 | chr01 249064852 249066345 chr05 20077643 20079139 6 | chr02 43381422 43385332 chr05 172468991 172471289 7 | chr04 239656815 239657190 chr05 73785793 73787640 8 | 9 | 10 | 11 | 12 | # 前三列 和 后三列 是一个对应关系 表示共线性基因对的位置信息 13 | -------------------------------------------------------------------------------- /mRNAid_to_geneid.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | use Cwd qw(abs_path getcwd); 4 | use Getopt::Long; 5 | use Data::Dumper; 6 | 7 | 8 | die "perl $0 " unless(@ARGV==2); 9 | 10 | 11 | 12 | my$gff=$ARGV[0]; 13 | my%gene=(); 14 | my%gene_region=(); 15 | my%mRNA2Gene=(); 16 | open IN,"$gff" or die "$!"; 17 | open OUT ,">$ARGV[1]" or die "$!"; 18 | print OUT "#mRNA_ID\tgene_ID\tchr\tstart\tend\tstrand\n"; 19 | while(){ 20 | chomp; 21 | next if (/^#/); 22 | my@tmp=split(/\t/); 23 | 24 | 25 | 26 | if($tmp[2] =~/^gene/){ 27 | my($id)=($tmp[8]=~/ID=([^;]+)/); 28 | $gene{$id}=1; 29 | $gene_region{$id}=[$tmp[0],$tmp[3],$tmp[4],$tmp[6]]; 30 | 31 | 32 | #print "gene:$id\n"; 33 | #my$gene_chr->{$id}=$tmp[0]; 34 | } 35 | if($tmp[2] =~/mRNA|transcript/i){ 36 | my($id)=($tmp[8]=~/ID=([^;]+)/); 37 | my($pid)=($tmp[8]=~/Parent=([^;]+)/); 38 | print OUT "$id\t$pid\t"; 39 | 40 | if(exists $gene{$pid}){ 41 | print OUT "$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; 42 | } 43 | #print "mRNA:$id\n"; 44 | } 45 | } 46 | 47 | close(IN); 48 | close(OUT); 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /mcscan_seqid_染色体编号配置文件: -------------------------------------------------------------------------------- 1 | 1,5,3,2,4 # 一个物种的 染色体编号 --- 按染色体长度排序 (可选) 2 | A09,A03,A01,A02,A06,A05,A07,A08,A04,A10 # 另一个物种的 染色体编号 3 | -------------------------------------------------------------------------------- /mcscan_图层配置文件: -------------------------------------------------------------------------------- 1 | # y, xstart, xend, rotation, color, label, va, bed 2 | .8, .2, .75, 0, red, ATH, top, ATH.bed(要在当前路径或者绝对路径) 3 | .2, .2, .75, 0, green, rapa, bottom, rapa.bed(要在当前路径或者绝对路径) 4 | # edges 5 | e, 0, 1, ATH.rapa.anchors.simple.c(绘图文件,经过上一步分析可以得到) 6 | -------------------------------------------------------------------------------- /mcscanx物种内共线性分析.sh: -------------------------------------------------------------------------------- 1 | 2 | # mcscanx 物种内基因家族共线性分析 3 | 4 | 所需文件 5 | 6 | cds 文件 7 | gff 文件 8 | pep 文件 9 | 10 | 基因家族 ID 信息 11 | 12 | #www.omicsclass.com 13 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz 14 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-39/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.39.gff3.gz 15 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/arabidopsis_thaliana/pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz 16 | #gunzip *gz 17 | 18 | #mkdir mcscan 19 | #perl get_gene_position.pl Arabidopsis_thaliana.TAIR10.39.gff3 AT.gff 20 | #sed -i 's#gene:##' AT.gff 21 | #perl get_fa_by_id.pl AT.gff Arabidopsis_thaliana.TAIR10.pep.all.fa pep.fa 22 | #sed -i 's#\.1##' pep.fa 23 | makeblastdb -in pep.fa -dbtype prot -title pep.fa 24 | blastall -i pep.fa -d pep.fa -e 1e-10 -p blastp -b 5 -v 5 -m 8 -o mcscan/AT.blast 25 | cp AT.gff mcscan/AT.gff 26 | 27 | /biosoft/MCScanX/MCScanX/MCScanX mcscan/AT 28 | 29 | # 编辑物种染色体信息 30 | wget http://chibba.pgml.uga.edu/mcscan2/examples/family.ctl 31 | 32 | 33 | # 基因家族 geneID 文件 34 | wget http://chibba.pgml.uga.edu/mcscan2/examples/MADS_box_family.txt 35 | 36 | sed -i 's#at##g' family.ctl 37 | 38 | # 绘制图形 39 | 40 | cd /biosoft/MCScanX/MCScanX/downstream_analyses 41 | java family_circle_plotter -g /home/manager/share/mcscan/mcscan/AT.gff -s /home/manager/share/mcscan/mcscan/AT.collinearity -c /home/manager/share/mcscan/family.ctl -f /home/manager/share/mcscan/MADS_box_family.txt -o /home/manager/share/mcscan/mcscan/MADS.circle.PNG 42 | 43 | 44 | # 分析基因家族的串联重复 以及 基因组内所有串联重复信息 45 | cd /biosoft/MCScanX/MCScanX/downstream_analyses 46 | perl detect_collinearity_within_gene_families.pl -i /home/manager/share/mcscan/MADS_box_family.txt -d /home/manager/share/mcscan/mcscan/AT.collinearity -o /home/manager/share/mcscan/mcscan/MADS.collinear.pairs 47 | 48 | # 由于绘制的圈图不是很美观,因此我们会对分析结果利用 circos 进行图形的绘制 49 | -------------------------------------------------------------------------------- /select_redundant_mRNA.pl: -------------------------------------------------------------------------------- 1 | die "perl $0 " unless(@ARGV==3); 2 | 3 | 4 | 5 | open IN ,"$ARGV[0]" or die "$!"; 6 | open OUT,">$ARGV[2]" or die "$!"; 7 | my%mRNA2geneData; 8 | my%mRNA2gene; 9 | 10 | my%gene2mRNA; 11 | 12 | print OUT "#geneID\tmRNAID\n"; 13 | while (){ 14 | chomp; 15 | if(/^#/){ 16 | 17 | next; 18 | 19 | } 20 | 21 | my@tmp=split(/\t/); 22 | $mRNA2gene{$tmp[0]}=$tmp[1]; 23 | $mRNA2geneData{$tmp[0]}=$_; 24 | $gene2mRNA{$tmp[1]}{$tmp[0]}=1; 25 | 26 | } 27 | 28 | close(IN); 29 | 30 | open IN,"$ARGV[1]" or die "$!"; 31 | my%uniqGene; 32 | 33 | while (){ 34 | chomp; 35 | next if /^#/; 36 | my@tmp=split(/\s+/); 37 | $uniqGene{$mRNA2gene{$tmp[0]}}{$tmp[0]}=1; 38 | 39 | } 40 | close(IN); 41 | 42 | 43 | for my$geneID(keys %uniqGene){ 44 | 45 | my$transcriptIDNumber=scalar keys %{$uniqGene{$geneID}}; 46 | my@transIDs=keys %{$uniqGene{$geneID}}; 47 | 48 | print OUT "$geneID\t".join("\t",sort{$a cmp $b} @transIDs)."\n"; 49 | } 50 | 51 | close(OUT); 52 | -------------------------------------------------------------------------------- /stat_protein_fa.pl: -------------------------------------------------------------------------------- 1 | #北京组学生物科技有限公司 2 | #email: huangls@biomics.com.cn 3 | 4 | die "perl $0 " unless(@ARGV==2); 5 | use Bio::SeqIO; 6 | use Bio::Seq; 7 | use Bio::Tools::SeqStats; 8 | use Bio::Tools::pICalculator; 9 | use Data::Dumper; 10 | #读入序列 11 | my $in = Bio::SeqIO->new( 12 | -file => "$ARGV[0]", 13 | -format => 'Fasta' 14 | ); 15 | 16 | open OUT,">$ARGV[1]" or die "$!"; 17 | print OUT "#ID\tlength\tMW(Da)\tpI\n"; 18 | my $calc = Bio::Tools::pICalculator->new(-places => 2,-pKset => 'EMBOSS'); 19 | 20 | 21 | #逐条读取序列并计算 22 | while ( my $seq = $in->next_seq() ) { 23 | #my ( $id, $sequence, $desc ) = ( $seq->id, $seq->seq, $seq->desc ); 24 | my $weight = Bio::Tools::SeqStats ->get_mol_wt($seq); 25 | $calc->seq($seq); 26 | my $iep = $calc->iep; 27 | print OUT sprintf("%s\t%s\t%s\t%s\n", 28 | $seq->id, 29 | $seq->length, 30 | "$weight->[0]", 31 | $iep); 32 | } 33 | $in->close(); 34 | close(OUT); 35 | -------------------------------------------------------------------------------- /text.txt: -------------------------------------------------------------------------------- 1 | 2 | # 配置基因显示名称 3 | chr01 230647852 230649724 Zm00008a004052 4 | chr05 23322345 23324725 Zm00008a019932 5 | chr01 249064852 249066345 Zm00008a004460 6 | chr05 20077643 20079139 Zm00008a019854 7 | chr02 43381422 43385332 Zm00008a007262 8 | chr05 172468991 172471289 Zm00008a022078 9 | chr04 239656815 239657190 Zm00008a018843 10 | chr05 73785793 73787640 Zm00008a020843 11 | -------------------------------------------------------------------------------- /upload_code: -------------------------------------------------------------------------------- 1 | git init 2 | git add README.md 3 | git commit -m "first commit" 4 | git remote add origin git@github.com:chensole/bioinformatics.git 5 | git push -u origin master 6 | -------------------------------------------------------------------------------- /vimrc: -------------------------------------------------------------------------------- 1 | "颜色主题 2 | 3 | colorscheme molokai 4 | set t_Co=256 5 | set background=dark 6 | 7 | "Vundle相关。Vundle是vim插件管理器,使用它来管理插件很方便,而且功能强大 8 | 9 | set nocompatible " be iMproved, required 10 | filetype off " required 11 | 12 | 13 | " set the runtime path to include Vundle and initialize 14 | set rtp+=~/.vim/bundle/Vundle.vim 15 | call vundle#begin() 16 | " alternatively, pass a path where Vundle should install plugins 17 | "call vundle#begin('~/some/path/here') 18 | 19 | " let Vundle manage Vundle, required 20 | " Plugin 'VundleVim/Vundle.vim' 21 | " The following are examples of different formats supported. 22 | Plugin 'https://github.com/scrooloose/nerdtree.git' 23 | Plugin 'Valloric/YouCompleteMe' "{ 24 | "配置默认文件路径 25 | let g:ycm_global_ycm_extra_conf='~/.vim/bundle/YouCompleteMe/third_party/ycmd/cpp/ycm/.ycm_extra_conf.py' 26 | 27 | "语法关键字补全 28 | let g:ycm_seed_identifiers_with_syntax = 1 29 | let g:ycm_add_preview_to_completeopt = 0 30 | let g:ycm_show_diagnostics_ui = 0 31 | let g:ycm_server_log_level = 'info' 32 | let g:ycm_min_num_identifier_candidate_chars = 2 33 | let g:ycm_collect_identifiers_from_comments_and_strings = 1 34 | 35 | "字符串开启补全 36 | let g:ycm_complete_in_strings=1 37 | let g:ycm_key_invoke_completion = '' 38 | set completeopt=menu,preview 39 | 40 | "补全后自动关闭预览窗口 41 | let g:ycm_autoclose_preview_window_after_completion = 1 42 | noremap 43 | 44 | "回车选中匹配项 45 | inoremap pumvisible() ? "\" : "\" 46 | 47 | "语义补全触发条件 48 | let g:ycm_semantic_triggers = { 49 | \ 'c' : ['->', '.'], 50 | \ 'objc' : ['->', '.', 're!\[[_a-zA-Z]+\w*\s', 're!^\s*[^\W\d]\w*\s', 51 | \ 're!\[.*\]\s'], 52 | \ 'ocaml' : ['.', '#'], 53 | \ 'cpp,objcpp' : ['->', '.', '::'], 54 | \ 'perl' : ['->'], 55 | \ 'php' : ['->', '::'], 56 | \ 'cs,java,javascript,typescript,d,python,perl6,scala,vb,elixir,go' : ['.'], 57 | \ 'ruby' : ['.', '::'], 58 | \ 'lua' : ['.', ':'], 59 | \ 'erlang' : [':'], 60 | \ } 61 | let g:ycm_semantic_triggers = { 62 | \ 'c,cpp,python,java,go,erlang,perl': ['re!\w{2}'], 63 | \ 'cs,lua,javascript': ['re!\w{2}'], 64 | \ } 65 | "} 66 | " Keep Plugin commands between vundle#begin/end. 67 | " plugin on GitHub repo 68 | " Plugin 'tpope/vim-fugitive' 69 | " plugin from http://vim-scripts.org/vim/scripts.html 70 | " Plugin 'L9' 71 | " Git plugin not hosted on GitHub 72 | " Plugin 'git://git.wincent.com/command-t.git' 73 | " git repos on your local machine (i.e. when working on your own plugin) 74 | " Plugin 'file:///home/gmarik/path/to/plugin' 75 | " The sparkup vim script is in a subdirectory of this repo called vim. 76 | " Pass the path to set the runtimepath properly. 77 | " Plugin 'rstacruz/sparkup', {'rtp': 'vim/'} 78 | " Install L9 and avoid a Naming conflict if you've already installed a 79 | " different version somewhere else. 80 | " Plugin 'ascenator/L9', {'name': 'newL9'} 81 | 82 | " All of your Plugins must be added before the following line 83 | call vundle#end() " required 84 | filetype plugin indent on " required 85 | " To ignore plugin indent changes, instead use: 86 | "filetype plugin on 87 | " 88 | " Brief help 89 | " :PluginList - lists configured plugins 90 | " :PluginInstall - installs plugins; append `!` to update or just :PluginUpdate 91 | " :PluginSearch foo - searches for foo; append `!` to refresh local cache 92 | " :PluginClean - confirms removal of unused plugins; append `!` to auto-approve removal 93 | " 94 | " see :h vundle for more details or wiki for FAQ 95 | " Put your non-Plugin stuff after this line 96 | 97 | "vim支持鼠标点击 98 | "set mouse=a 99 | 100 | set wildmenu 101 | 102 | " 显示行号 103 | set number 104 | 105 | " 自动对齐文中行缩进 106 | set autoindent 107 | 108 | "智能缩进使用了代码语法和样式来对齐 109 | set smartindent 110 | 111 | " tab键的宽度 112 | set tabstop=4 113 | 114 | set cursorline 115 | 116 | "在遍历文件时识别括弧的起始和结束位置 117 | set showmatch 118 | 119 | "在文件中高亮显示搜索关键词 120 | set hlsearch 121 | 122 | set encoding=utf-8 123 | set nocompatible 124 | syntax on 125 | 126 | let python_highlight_all=1 127 | au Filetype python set tabstop=4 128 | au Filetype python set softtabstop=4 129 | au Filetype python set shiftwidth=4 130 | au Filetype python set textwidth=79 131 | au Filetype python set expandtab 132 | au Filetype python set autoindent 133 | au Filetype python set fileformat=unix 134 | autocmd Filetype python set foldmethod=indent 135 | autocmd Filetype python set foldlevel=99 136 | 137 | map :call CompileRunGcc() 138 | func! CompileRunGcc() 139 | exec "w" 140 | if &filetype == 'c' 141 | exec "!g++ % -o %<" 142 | exec "!time ./%<" 143 | elseif &filetype == 'cpp' 144 | exec "!g++ % -o %<" 145 | exec "!time ./%<" 146 | elseif &filetype == 'java' 147 | exec "!javac %" 148 | exec "!time java %<" 149 | elseif &filetype == 'sh' 150 | :!time bash % 151 | elseif &filetype == 'python' 152 | exec "!clear" 153 | exec "!time python3 %" 154 | elseif &filetype == 'html' 155 | exec "!firefox % &" 156 | elseif &filetype == 'go' 157 | " exec "!go build %<" 158 | exec "!time go run %" 159 | elseif &filetype == 'mkd' 160 | exec "!~/.vim/markdown.pl % > %.html &" 161 | exec "!firefox %.html &" 162 | endif 163 | endfunc 164 | 165 | "自动补全 166 | :inoremap < <>i 167 | :inoremap > =ClosePair('>') 168 | :inoremap ( ()i 169 | :inoremap ) =ClosePair(')') 170 | :inoremap { {}i 171 | :inoremap } =ClosePair('}') 172 | :inoremap [ []i 173 | :inoremap ] =ClosePair(']') 174 | :inoremap " ""i 175 | :inoremap ' ''i 176 | function! ClosePair(char) 177 | if getline('.')[col('.') - 1] == a:char 178 | return "\" 179 | else 180 | return a:char 181 | endif 182 | endfunction 183 | 184 | "NERDTree config 185 | map :NERDTreeToggle " F4一键开关目录树 186 | autocmd bufenter * if (winnr("$") == 1 && exists("b:NERDTreeType") &&b:NERDTreeType == "primary") | q | endif " 当目录树窗口为最后一个窗口时自动退出vim 187 | 188 | 189 | """"""""""""""""" 新文件标题""""""""""""""""""""""""""" 190 | 191 | "新建.pl文件,自动插入文件头 192 | 193 | autocmd BufNewFile *.pl exec ":call SetPerlTitle()" 194 | 195 | func SetPerlTitle() 196 | call setline(1,"#!usr/bin/perl -w") 197 | call append( line("."),"use strict;") 198 | call append(line(".")+1," ") 199 | call append(line(".")+2, "\# File Name: ".expand("%")) 200 | call append(line(".")+3, "\# Author: chensole") 201 | call append(line(".")+4, "\# mail: 1278371386@qq.com") 202 | call append(line(".")+5, "\# Created Time: ".strftime("%Y-%m-%d",localtime())) 203 | endfunc 204 | 205 | 206 | " 键盘命令 207 | 208 | " 映射全选+复制 ctrl+a 209 | 210 | map ggVGY 211 | 212 | map! ggVGY 213 | 214 | map gg=G 215 | 216 | " 选中状态下 Ctrl+c 复制 217 | 218 | vmap "+y 219 | 220 | "去空行 221 | 222 | nnoremap :g/^\s*$/d 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | -------------------------------------------------------------------------------- /下载数据: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##1. 从NCBI上面下载nr数据库,不要用wget,相当慢 4 | 5 | 6 | 使用 aspera 下载NCBI FTP 站点数据会非常快 7 | 8 | ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /blast/db/FASTA/nr.gz ./ 9 | 10 | ### 11 | Aspera的用法: $ ascp [参数] 目标文件 目的地址 12 | Aspera的常用参数: 13 | -T 不进行加密。若不添加此参数,可能会下载不了。 14 | -i string 输入私钥,安装 aspera 后有在目录 ~/.aspera/connect/etc/ 下有几个私钥,使用 linux 服务器的时候一般使用 asperaweb_id_dsa.openssh 文件作为私钥。 15 | --host string ftp的host名,NCBI的为ftp-private.ncbi.nlm.nih.gov;EBI的为fasp.sra.ebi.ac.uk。 16 | --user string 用户名,NCBI的为anonftp,EBI的为era-fasp。 17 | --mode string 选择模式,上传为 send,下载为 recv。 18 | -l string 设置最大传输速度,比如设置为 200M 则表示最大传输速度为 200m/s。若不设置该参数,则一般可达到10m/s的速度,而设置了,传输速度可以更高。 19 | 20 | ##2. 从NCBI上面下载分类数据库文件 21 | 22 | ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /pub/taxonomy/taxdump.tar.gz ./ 23 | ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /pub/taxonomy/accession2taxid/prot.accession2taxid.gz ./ 24 | 25 | 26 | 其中taxdump.tar.gz中主要有两个文件很有用 27 | 28 | names.dmp 记录物种名及其分类编号 29 | nodes.dmp 记录分类编号的节点信息 30 | 31 | 例如 植物大类的编号为 3193 32 | 33 | 34 | 35 | 36 | 37 | ### NCBI gene 数据库 38 | 39 | https://www.ncbi.nlm.nih.gov/gene/?term= 40 | 41 | 这个数据库中存储了所有物种的gene 信息 42 | 43 | 从它所在的FTP站点,可以下载有用的文件 44 | 45 | ftp://ftp.ncbi.nih.gov/gene/DATA/ 46 | 47 | 48 | 常见的有 49 | 50 | gene2accession.gz geneID(Entrez ID)与accession(收集的序列大多来自 swissport、RefSeq等)间的对应关系,这个文件是一个非常详细的数据文件 51 | 52 | 53 | gene2go.gz geneID与Go间的对应关系 54 | 55 | 56 | 57 | ### NCBI taxonmony 数据库 58 | 59 | NCBI 中存储了大量的物种,Taxonmony 数据库专门存储每个物种在 NCBI 中的 ID号 60 | 61 | 62 | 63 | 64 | 65 | ### 批量下载 SRA 数据 66 | 67 | 68 | 1. 把要下载的数据 SRR号写入一个文件,如 srr.txt,每行是一个 SRR id 69 | 2. 利用 SRA toolkit 的 prefetch 下载,并指定下载方式为 ascp 70 | 71 | prefetch -t ascp --ascp-path "/home/chenzhi/.aspera/connect/bin/ascp|/home/chenzhi/.aspera/connect/etc/asperaweb_id_dsa.openssh" --option-file srr.txt -O . 72 | 73 | 74 | 75 | 76 | ## nr 数据库注释 77 | 78 | 79 | 整个nr数据库非常大,130多个G,如果想要进行物种注释会消耗大量的时间。这里我推荐使用 diamond 这个工具,可以只进行某一类物种比对,如植物类的 80 | 81 | taxmony ID是3193. 82 | 83 | 84 | 1.利用diamond 建库,需要额外提供两个文件(可从NCBI上下载) 85 | 86 | prot.accession2taxid (蛋白accession登录号与taxid的对应关系) 87 | 88 | taxdump文件夹下的nodes.dmp文件 89 | 90 | 91 | nohup ~/biosoft/diamond makedb -p 30 --taxonmap ../prot.accession2taxid --taxonnodes ../taxdump/nodes.dmp --in nr --db nr_tax & 92 | 93 | 94 | 利用上面这条命令就可以构建索引,且每个索引都有对应的 taxid 号 95 | 96 | 97 | 98 | 2. 比对 99 | 100 | 101 | 可以指定 -taxonlist 参数值,即对nr索引中某类生物比对 102 | 103 | nohup ~/biosoft/diamond blastp -p 30 -q /date/cjt/ref/CM3.6.1_pep.fasta --db nr_index/nr_tax.dmnd --taxonlist 3193 -f 6 --max-hsps 1 --max-target-seqs 1 -o cme_blast@nr.m6 & 104 | 105 | 106 | 上面是将甜瓜蛋白序列与nr数据库中所有的植物序列比对,比对后即可获得甜瓜蛋白序列与nr数据库相似的蛋白的 accession 登录号 107 | 108 | 109 | 3. 将比对结果结合 gene2accession(NCBI下载) 文件,可得到甜瓜蛋白序列对应 gene symbol,同时可以综合 eggnog-mapper-1.0.3的比对结果,进行总体注释 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /共线性图.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chensole/bioinformatics/e1e98c053f5b3d22afd059cccdfe00e5dee24cbb/共线性图.pdf -------------------------------------------------------------------------------- /添加基因家族物种共线性关系配置: -------------------------------------------------------------------------------- 1 | # 在物种共线性区块图形中显示自己研究的基因家族的共线性关系 2 | 3 | 在 .simple 文件后面加入基因家族物种间共线性序列-------------------- 格式如下 4 | 5 | 6 | 7 | # 每一行的开头加颜色值 ,第五列自定义数值 第六列自定义 + or — ,其中每个基因 ID 重复一列展示(这样会变成一条线) 8 | 9 | #FF0000*AT1G69310 AT1G69310 Bra038313 Bra038313 10 + 10 | #FF0000*AT1G69810 AT1G69810 Bra007884 Bra007884 10 + 11 | #FF0000*AT1G80840 AT1G80840 Bra008435 Bra008435 10 + 12 | #FF0000*AT1G55600 AT1G55600 Bra038006 Bra038006 10 + 13 | #FF0000*AT1G13960 AT1G13960 Bra019697 Bra019697 10 + 14 | #FF0000*AT1G69310 AT1G69310 Bra004002 Bra004002 10 + 15 | #FF0000*AT1G29860 AT1G29860 Bra030178 Bra030178 10 + 16 | #FF0000*AT1G68150 AT1G68150 Bra004285 Bra004285 10 + 17 | #FF0000*AT1G69310 AT1G69310 Bra004370 Bra004370 10 + 18 | #FF0000*AT1G80840 AT1G80840 Bra035148 Bra035148 10 + 19 | #FF0000*AT1G80840 AT1G80840 Bra003588 Bra003588 10 + 20 | #FF0000*AT1G18860 AT1G18860 Bra016535 Bra016535 10 + 21 | 22 | -------------------------------------------------------------------------------- /物种间基因家族查找.pl: -------------------------------------------------------------------------------- 1 | 2 | use strict; 3 | use List::Util qw(any); 4 | 5 | my $f1 = shift; 6 | my $f2 = shift; 7 | my $f3 = shift; 8 | 9 | my (@a1,@a2); 10 | 11 | open I1,$f1 or die "$!"; 12 | open I2,$f2 or die "$!"; 13 | open I3,$f3 or die "$!"; 14 | 15 | while (defined(my $line = )) { 16 | chomp $line; 17 | 18 | push @a1,$line; 19 | 20 | 21 | } 22 | while (defined(my $line = )) { 23 | chomp $line; 24 | 25 | push @a2,$line; 26 | 27 | 28 | } 29 | while (defined(my $line = )) { 30 | chomp $line; 31 | 32 | next if $line =~ /^#/; 33 | 34 | my @tmp = split(/\s+/,$line); 35 | 36 | if ((any {/$tmp[0]/} @a1) and (any {/$tmp[1]/} @a2)) { 37 | 38 | print $line."\n"; 39 | } 40 | 41 | } 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /物种间基因家族比较分析流程.txt: -------------------------------------------------------------------------------- 1 | # ----------------------------- 物种间 mcscanx 基因共线性分析 (基于 python 版本的 mcscanx 软件) 2 | 3 | 软件下载地址 : https://github.com/tanghaibao/jcvi 4 | 5 | 所需文件 6 | 7 | gene 位置信息 gff文件中回去 8 | 9 | CDS.fasta 文件 10 | 11 | 基因 CDS文件 只选择一个转录本作为该基因的代表序列 (否则软件会报错) 从CDS文件中进行挑选 12 | 13 | 14 | 15 | #得到基因组上所有基因的位置信息,bed文件;以及cds序列;这里的两个脚本不在script下,需要自行拷贝; 16 | perl get_gene_bed.pl -in1 Arabidopsis_thaliana.TAIR10.41.gff3 -out ATH.bed 17 | perl get_fa_by_id_from_bed.pl ATH.bed Arabidopsis_thaliana.TAIR10.cds.all.fa ATH.cds 18 | 19 | ##统一成基因ID 20 | sed 's#\.1##' ATH.cds 21 | 22 | 23 | #同样的道理准备,准备白菜的基因组,bed文件和,cds文件; 24 | perl get_gene_bed.pl -in1 Brassica_rapa.Brapa_1.0.41.chr.gff3 -out rapa.bed 25 | perl get_fa_by_id_from_bed.pl rapa.bed Brassica_rapa.Brapa_1.0.cds.all.fa rapa.cds 26 | 27 | 28 | /biosoft/miniconda/miniconda2/bin/python -m jcvi.compara.catalog ortholog ATH rapa --cscore=0.7 # 最重要的是 .anchors 文件(含有两个物种中所有共线性基因对的关系) 和 .simple 文件 (绘图所需文件) 29 | 30 | #对共线性区域进行过滤 31 | /biosoft/miniconda/miniconda2/bin/python -m jcvi.compara.synteny screen --minsize=0 --minspan=30 --simple ATH.rapa.anchors ATH.rapa.anchors.new 32 | #绘制共线性图片:准备两个配置文件为输入文件: 33 | 34 | /biosoft/miniconda/miniconda2/bin/python -m jcvi.graphics.karyotype --format=pdf --figsize=15x5 mcscan_seqid mcscan_layout 35 | 36 | 37 | 38 | 39 | # 分别鉴定出两个物种中某一基因家族的 geneID 然后从 .anchors 文件中 挑选出 对应的具有共线性的基因,设置要展示的颜色,追加到 .simple 文件末尾 ----------- 绘图即可 40 | 41 | 1. 使用 物种间基因家族查找.pl 基于 .anchors 文件和两个个物种基因家族 geneID list 文件,鉴定某个两个物种中基因家族的共线性geneID 42 | 43 | 44 | perl 物种间基因家族查找.pl ../cmo_at/cmo_ERF.txt zm_ERF.txt Cm.zm.anchors >simple 45 | 46 | 47 | 48 | 2. 使用 生成基因家族配置simple文件.pl 将上面得到的 simple文件生成特定的绘图格式 49 | 50 | perl 生成基因家族配置simple文件.pl simple > simple1.txt 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /生成基因家族配置simple文件.pl: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | use strict; 3 | 4 | my $file = shift; 5 | 6 | open I,$file or die "$!"; 7 | 8 | 9 | 10 | while (defined (my $line = )) { 11 | 12 | chomp $line; 13 | 14 | my @tmp = split(/\s+/,$line); 15 | 16 | print "#FF0000*$tmp[0]\t$tmp[0]\t$tmp[1]\t$tmp[1]\t$tmp[2]\t+\n"; 17 | 18 | 19 | } 20 | --------------------------------------------------------------------------------