├── KAKS_SHAIXUAN.pl
├── README.md
├── cal_N50.pl
├── chr.info
├── colline_v2.pl
├── config1.txt
├── config2.txt
├── domain_xulie.pl
├── fq2fa.pl
├── gene_family_analysis_pipeline.sh
├── geneid_to_mRNAid.pl
├── genome.txt
├── get_data_by_id.pl
├── get_fa_by_id.pl
├── get_fa_by_id_from_bed.pl
├── get_fa_by_id_from_gff.pl
├── get_gene_bed.pl
├── get_gene_exon_from_gff.pl
├── get_gene_position.pl
├── get_gene_weizhi.pl
├── get_gtf.pl
├── get_promoter.pl
├── get_tandem_gene.pl
├── link.txt
├── mRNAid_to_geneid.pl
├── mcscan_seqid_染色体编号配置文件
├── mcscan_图层配置文件
├── mcscanx物种内共线性分析.sh
├── select_redundant_mRNA.pl
├── stat_protein_fa.pl
├── text.txt
├── upload_code
├── vimrc
├── 下载数据
├── 共线性图.pdf
├── 添加基因家族物种共线性关系配置
├── 物种间基因家族查找.pl
├── 物种间基因家族比较分析流程.txt
└── 生成基因家族配置simple文件.pl


/KAKS_SHAIXUAN.pl:
--------------------------------------------------------------------------------
 1 | use Getopt::Long;
 2 | my %opts;
 3 | use Data::Dumper;
 4 | GetOptions (\%opts,"in1=s","in2=s","out=s","h"); 
 5 | if (! defined($opts{in1}) ||! defined($opts{in2})||! defined($opts{out}) || defined($opts{h})){
 6 | 	&USAGE;
 7 | }
 8 | open (IN1,"$opts{in1}") || die "open $opts{in} failed\n";
 9 | open (IN2,"$opts{in2}") || die "open $opts{ina} failed\n";
10 | open (OUT,">$opts{out}") || die "open $opts{out} failed\n";
11 |  my %cds_length;
12 | while(<IN1>){
13 | 	chomp;
14 | 	my @line = split("\t",$_);
15 | 	$cds_length{$line[0]}= $line[1];
16 | 	#print "$cds_length{$line[0]}\n";
17 | } 
18 | 
19 | while( <IN2>){
20 | 	
21 | 		chomp($_);
22 | 		my @line1 = split ("\t",$_);
23 | 		#print @line1;
24 | 		#print "\n";
25 | 		my $max_length = $cds_length{$line1[0]} > $cds_length{$line1[1]} ? $cds_length{$line1[0]}:$cds_length{$line1[1]};
26 | 		if(($line1[0] ne $line1[1]) && ($line1[2] > 70 )&& ($line1[3] > 0.70*$max_length)){
27 | 			print OUT $_."\t$max_length\n";
28 | 		
29 | 		}
30 | 		#print $cds_length1{$line1[0]};
31 | 	
32 | }
33 | 
34 | 
35 | close(IN1);
36 | close(IN2);
37 | close(OUT);
38 | sub USAGE {
39 |        print "usage: perl $0 -in1 cds_length   -in2 result.txt -out shaixuan_result.txt";
40 | 	exit;
41 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gene family analysis 
2 | 


--------------------------------------------------------------------------------
/cal_N50.pl:
--------------------------------------------------------------------------------
  1 | #/usr/bin/perl -w
  2 | use strict;
  3 | use List::Util qw(sum min max);
  4 | use Getopt::Long;
  5 | use File::Basename;
  6 | 
  7 | # Parameter variables
  8 | my $file;
  9 | my $helpAsked;
 10 | my $outfile = "";
 11 | 
 12 | GetOptions(
 13 | 			"i=s" => \$file,
 14 | 			"h|help" => \$helpAsked,
 15 | 			"o|outputFile=s" => \$outfile,
 16 | 		  );
 17 | if(defined($helpAsked)) {
 18 | 	prtUsage();
 19 | 	exit;
 20 | }
 21 | if(!defined($file)) {
 22 | 	prtError("No input files are provided");
 23 | }
 24 | 
 25 | my ($fileName, $filePath) = fileparse($file);
 26 | $outfile = $file . "_n50_stat" if($outfile eq "");
 27 | 
 28 | 
 29 | 
 30 | 
 31 | open IN, "$file" or die "$!";
 32 | open O,">$outfile" or die "$!";
 33 | my @len;
 34 | my $totseq;
 35 | 
 36 | sub median (\@){
 37 | 
 38 | 	my @x = sort {$a <=> $b} @{$_[0]};
 39 | 	my $len = scalar @x;
 40 | 	my $median;	
 41 | 	if ($len % 2 == 0) {
 42 | 		$median = ($x[$len/2 - 1] + $x[$len/2]) / 2;
 43 | 	
 44 | 	}else {
 45 | 	
 46 | 		$median = $x[($len + 1) / 2];
 47 | 	}
 48 | 	return $median;
 49 | }
 50 | 
 51 | 
 52 | my $As = 0;
 53 | my $Ts = 0;
 54 | my $Gs = 0;
 55 | my $Cs = 0;
 56 | my $Ns = 0;
 57 | 
 58 | #-------------------------------------- 子程序 -----------------------------------------------
 59 | 
 60 | sub basecount (\$){
 61 | 	my $seq = ${$_[0]};
 62 | 	
 63 | 	my $tAs += $seq =~ s/A/A/gi;
 64 | 	my $tGs += $seq =~ s/G/G/gi;
 65 | 	my $tCs += $seq =~ s/C/C/gi;
 66 | 	my $tTs += $seq =~ s/T/T/gi;
 67 | 	my $Ns = (length $seq) - $tAs - $tTs - $tCs - $tGs; 
 68 | 	$As += $tAs;
 69 | 	$Ts += $tTs;
 70 | 	$Gs += $tGs;
 71 | 	$Cs += $tCs;
 72 | 
 73 | }
 74 | 
 75 | ##计算N50
 76 | 
 77 | #### 从大到小排序--> for循环相加,直到>= $ /2
 78 | 
 79 | sub calN50 (\@$){
 80 | 	my @x = sort {$b <=> $a} @{$_[0]};
 81 | 	my $n = $_[1];
 82 | 	my $totlen = sum(@x);
 83 | 	my ($tot,$n50) = (0,0);
 84 | 	for (my $i = 0;$i <@x;$i++) {
 85 | 		$tot += $x[$i];
 86 | 		if ($tot >= $totlen*$n/100) {
 87 | 			$n50 = $x[$i];
 88 | 			last;
 89 | 		}
 90 | 		
 91 | 	
 92 | 	}
 93 | 	return $n50;
 94 |  }
 95 | 
 96 | sub prtHelp {
 97 | 	print "\n$0 options:\n\n";
 98 | 	print "### Input reads/sequences (FASTA) (Required)\n";
 99 | 	print "  -i <Read/Sequence file>\n";
100 | 	print "    Read/Sequence in fasta format\n";
101 | 	print "\n";
102 | 	print "### Other options [Optional]\n";
103 | 	print "  -h | -help\n";
104 | 	print "    Prints this help\n";
105 | 	print "  -o | -outputFile <Output file name>\n";
106 | 	print "    Output will be stored in the given file\n";
107 | 	print "    default: By default, N50 statistics file will be stored where the input file is\n";
108 | 	print "\n";
109 | }
110 | 
# Report a fatal usage error: print the message in a box on STDERR, show the
# usage text, and terminate.
#
# Takes the message to display. Exits with status 1 so that shells and
# pipelines can see the failure (the previous bare "exit" reported success).
sub prtError {
    my $msg = $_[0];
    print STDERR "+======================================================================+\n";
    printf STDERR "|%-70s|\n", "  Error:";
    printf STDERR "|%-70s|\n", "       $msg";
    print STDERR "+======================================================================+\n";
    prtUsage();
    exit 1;
}
120 | 
# Print the one-line invocation synopsis, then the detailed option help.
sub prtUsage {
    my $synopsis = "\nUsage: perl $0 <options>\n";
    print $synopsis;
    prtHelp();
}
125 | 
126 | 
#----------------------------- Main program -----------------------------------------------

# Read the FASTA input. A record's sequence may be wrapped over several
# lines, so every line up to the next '>' header is appended to the current
# record. (Reading only the single line after each header reported the length
# of the first line instead of the whole sequence.)
my $current_seq;
while (defined(my $line = <IN>)) {
    $line =~ s/[\r\n]+$//;
    if ($line =~ /^>/) {
        if (defined $current_seq) {
            push @len, length $current_seq;
            $totseq .= $current_seq;
        }
        $current_seq = "";
    }
    elsif (defined $current_seq) {
        $line =~ s/\s+//g;
        $current_seq .= $line;
    }
}
# Flush the last record, which has no following header to trigger it.
if (defined $current_seq) {
    push @len, length $current_seq;
    $totseq .= $current_seq;
}
close IN;

# Every statistic below divides by the number of sequences or by the total
# length, so stop early with a clear message instead of a division by zero.
die "No sequences found in '$file'\n" unless @len;

my $totlen = sum(@len);
die "All sequences in '$file' are empty\n" unless $totlen;

my $totreads = scalar @len;
my $min = min(@len);
my $max = max(@len);
my $avg = sprintf "%0.2f", $totlen/$totreads;
my $median = median(@len);

basecount($totseq);
my $n25 = calN50(@len,25);
my $n50 = calN50(@len,50);
my $n75 = calN50(@len,75);
my $n90 = calN50(@len,90);
my $n95 = calN50(@len,95);


printf O "%-25s %d\n", "Total sequences",$totreads;
printf O "%-25s %d\n", "total base",$totlen;
printf O "%-25s %d\n" , "Min sequence length", $min;
printf O "%-25s %d\n" , "Max sequence length", $max;
printf O "%-25s %0.2f\n", "Average sequence length", $avg;
printf O "%-25s %0.2f\n", "Median sequence length", $median;
printf O "%-25s %d\n", "N25 length", $n25;
printf O "%-25s %d\n", "N50 length", $n50;
printf O "%-25s %d\n", "N75 length", $n75;
printf O "%-25s %d\n", "N90 length", $n90;
printf O "%-25s %d\n", "N95 length", $n95;
printf O "%-25s %0.2f %s\n", "As", $As/$totlen*100, "%";
printf O "%-25s %0.2f %s\n", "Ts", $Ts/$totlen*100, "%";
printf O "%-25s %0.2f %s\n", "Gs", $Gs/$totlen*100, "%";
printf O "%-25s %0.2f %s\n", "Cs", $Cs/$totlen*100, "%";
printf O "%-25s %0.2f %s\n", "(A + T)s", ($As+$Ts)/$totlen*100, "%";
printf O "%-25s %0.2f %s\n", "(G + C)s", ($Gs+$Cs)/$totlen*100, "%";
printf O "%-25s %0.2f %s\n", "Ns", $Ns/$totlen*100, "%";

# Buffered write errors (for example a full disk) only surface at close.
close O or die "Cannot finish writing '$outfile': $!\n";

print "N50 Statistics file: $outfile\n";
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 


--------------------------------------------------------------------------------
/chr.info:
--------------------------------------------------------------------------------
 1 | 
 2 | # 染色体配置文件
 3 |  
 4 | 
 5 | chr	-	chr01	Chr1	0	302956453	chr1
 6 | chr	-	chr02	Chr2	0	241307389	chr2
 7 | chr	-	chr03	Chr3	0	232953155	chr3
 8 | chr	-	chr04	Chr4	0	242006640	chr4
 9 | chr	-	chr05	Chr5	0	215148664	chr5
10 | chr	-	chr06	Chr6	0	165010417	chr6
11 | 
12 | # 设置：前两列为固定写法，表明这是染色体配置文件  第三列：gff 文件中的实际染色体编号  第四列：你希望在图片上展示的染色体名字  第五列+第六列：染色体的起始和终止位置（即染色体长度）  第七列：染色体颜色，官网上规定每一个 chr 编号对应一种颜色
13 | 
14 | chr     -       chr01   Chr1    0       302956453       chr1
15 | chr     -       chr02   Chr2    0       241307389       chr1
16 | chr     -       chr03   Chr3    0       232953155       chr1
17 | chr     -       chr04   Chr4    0       242006640       chr1
18 | chr     -       chr05   Chr5    0       215148664       chr1
19 | chr     -       chr06   Chr6    0       165010417       chr1
20 | chr     -       chr06   Chr6    0       302956453       chr2
21 | chr     -       chr05   Chr5    0       241307389       chr2
22 | chr     -       chr04   Chr4    0       232953155       chr2
23 | chr     -       chr03   Chr3    0       242006640       chr2
24 | chr     -       chr02   Chr2    0       215148664       chr2
25 | chr     -       chr01   Chr1    0       165010417       chr2
26 | 
27 | # 上面这个展示物种间的共线性图 染色体的配置文件，两个物种染色体编号顺序要相反
28 | 


--------------------------------------------------------------------------------
/colline_v2.pl:
--------------------------------------------------------------------------------
  1 | use Getopt::Long;
  2 | use strict;
  3 | use Cwd qw(abs_path getcwd);
  4 | 
  5 | 
  6 | 
  7 | 
  8 | my %opts;
  9 | 
 10 | GetOptions (\%opts,"list=s","od=s","colline=s","gff=s","name=s"); 
 11 | 
 12 | 
 13 | 
 14 | my $od=$opts{od};
 15 | $od||=getcwd;
 16 | $od=abs_path($od);
 17 | unless(-d $od){	mkdir $od;}
 18 | 
 19 | 
 20 | 
 21 | #############gff for cir text###########3
 22 | 
 23 | open (IN,"$opts{gff}") || die "open $opts{gff} failed\n";
 24 | my %gff;
 25 | my @info;
 26 | my $chr;
 27 | my $start;
 28 | my $end;
 29 | my $gene;
 30 | while(<IN>){
 31 | 	chomp;
 32 | 	next if /^#/;
 33 | 	
 34 | 	@info=split(/\t/,$_);
 35 | 	
 36 | 	next unless($info[2]=~/gene/);
 37 | 	($gene)=($info[8]=~/ID=([^;]+)/);
 38 | 	
 39 | 	$chr=$info[0];
 40 | 	$start=$info[3];
 41 | 	$end=$info[4];
 42 | 	$gff{$gene}=$chr."\t".$start."\t".$end;
 43 | }
 44 | 
 45 | close(IN);
 46 | 
 47 | 
 48 | 
 49 | ####################### list ##############
 50 | 
 51 | 
 52 | my %list;
 53 | my $pair;
 54 | my $Len;
 55 | my $Agene;
 56 | my $Bgene;
 57 | my %text;
 58 | 
 59 | open (IN,"$opts{list}") || die "open $opts{list} failed\n";
 60 | open (OUT,">$od/$opts{name}.txt") || die "open $od/$opts{name}.txt failed\n";
 61 | open (OUTL,">$od/$opts{name}.link.txt") || die "open $od/$opts{name}.link.txt failed\n";
 62 | 
 63 | while(<IN>){
 64 | 	chomp;
 65 | 	@info=split(/\t/,$_);
 66 | 	$Len = @info;
 67 | 	print $Len;
 68 | 	
 69 | 	my $len=1;
 70 | 	while($len<$Len){
 71 | 			$pair=$info[$len];   ####�ų�0��Ҳ���ǵ�һ��λ�ã��˴�Ϊ����
 72 | 			($Agene,$Bgene)=split(/:/,$pair,2);
 73 | 			print OUT $Agene."\t".$Bgene."\n";
 74 | 			
 75 | 	        if(exists $gff{$Agene} && exists $gff{$Bgene}){ #######ʵ���Ͽ϶����ڣ�
 76 | 	        	
 77 | 	        	print OUTL $gff{$Agene}."\t".$gff{$Bgene}."\n";
 78 | 	        	
 79 | 	        	$text{$Agene}=$gff{$Agene}."\t".$Agene;   ##��ȥ�ؿ���ֱ�ӿ�ʼ���
 80 | 	        	$text{$Bgene}=$gff{$Bgene}."\t".$Bgene;
 81 | 	        	
 82 | 	        }
 83 | 	        $len=$len+1;
 84 | 	}
 85 | 
 86 | }
 87 | close(IN);
 88 | close(OUT);
 89 | 
 90 | 
 91 | ##ȥ���ظ�ID��text
 92 | open (OUT,">$od/$opts{name}.text.txt") || die "open $od/$opts{name}.text.txt failed\n";
 93 | my $loc;
 94 | while(($gene,$loc)=each %text){
 95 | 	print OUT $loc."\n";
 96 | }
 97 | close(OUT);
 98 | 
 99 | ######### collinearity for genome block colline #####
100 | 
101 | open (IN,"$opts{colline}") || die "open $opts{colline} failed\n";
102 | open (OUT,">$od/genome.blocklink.txt") || die "open $od/genome.blocklink.txt failed\n";
103 | open (OUTA,">$od/genome.align.blocklink.txt") || die "open $od/genome.align.blocklink.txt failed\n";
104 | my $n;
105 | my $align;
106 | my $colline;
107 | my %block;
108 | my $Agene1S;
109 | my $AgeneNE;
110 | my $Bgene1S;
111 | my $BgeneNE;
112 | my $Achr;
113 | my $Bchr;
114 | while(<IN>){
115 | 	chomp;
116 | 	if(/^#/){
117 | 		if(/Alignment/){
118 | 			$n=1;
119 | 			$_=~/Alignment ([^:]*)/;
120 | 			$align="Alignment".$1;
121 | 		}
122 | 		next;
123 | 	}
124 | 	
125 | 	$colline=$_;
126 | 	@info=split("\t",$colline);
127 | 	$Agene=$info[1];
128 | 	$Bgene=$info[2];
129 | 	
130 | 	if(exists $gff{$Agene} && exists $gff{$Bgene} ){
131 | 	
132 | 	    if($n ==1 ){
133 | 		
134 | 		    ($chr,$start,$end)=split(/\t/,$gff{$Agene});
135 | 	    	$Agene1S=$start;
136 | 		    $Achr=$chr;
137 | 		
138 | 		    ($chr,$start,$end)=split(/\t/,$gff{$Bgene});
139 | 		    $Bgene1S=$start;
140 | 		    $Bchr=$chr;
141 | 		
142 | 	    }else{
143 | 		
144 | 				
145 | 		    ($chr,$start,$end)=split(/\t/,$gff{$Agene});
146 | 		    $AgeneNE=$end;
147 | 		
148 | 		    ($chr,$start,$end)=split(/\t/,$gff{$Bgene});
149 | 		    $BgeneNE=$end;
150 | 		
151 | 	    }
152 | 	}
153 | 	$n=$n+1;
154 | 	$block{$align}=$Achr."\t".$Agene1S."\t".$AgeneNE."\t".$Bchr."\t".$Bgene1S."\t".$BgeneNE;	
155 | 		
156 | 	
157 | }
158 | 
159 | close(IN);
160 | 
161 | my $block_info;
162 | 
163 | while(($align,$block_info)=each %block){
164 | 	print OUT $block_info."\n";
165 | 	print OUTA $align."\t".$block_info."\n";
166 | }
167 | close(OUT);
168 | close(OUTA);
169 | 
170 | 
171 | 
172 | 
173 | 
174 | 
175 | 


--------------------------------------------------------------------------------
/config1.txt:
--------------------------------------------------------------------------------
  1 | chromosomes_units=1000000        # 染色体显示单元，一般以 兆b 显示
  2 | chromosomes_reverse=/chr[01]/    
  3 | 
  4 | # 染色体配置
  5 |  
  6 | <ideogram>                
  7 |     fill=yes
  8 |     label_font=default
  9 |     label_parallel=yes
 10 |     label_radius=dims(image,radius)-60p
 11 |     label_size=45
 12 |     radius=0.90r   # 染色体离圆心的位置 0.9 表示离圆心 90% 的位置
 13 |     show_label=yes
 14 |     <spacing>
 15 |         default=0.005r   # 两个染色体间的间隙
 16 |     </spacing>
 17 |     stroke_color=dgrey
 18 |     stroke_thickness=2p
 19 |     thickness=0.03r
 20 | </ideogram>
 21 | karyotype=/home/manager/share/cir/chr.info   # 染色体配置文件路径
 22 | 
 23 | 
 24 | # 共线性文件配置（基因共线性和区块共线性）
 25 | 
 26 | <links>     # 两个共线性文件主配置
 27 |     bezier_radius=0r
 28 |     bezier_radius_purity=0.75
 29 |     color=black
 30 |     crest=0.5
 31 |     <link>   # 基因共线性
 32 |         bezier_radius=0r
 33 |         bezier_radius_purity=0.75
 34 |         color=set2-8-qual-1
 35 |         crest=0.5
 36 |         file=/home/manager/share/cir/link.txt        # 基因共线性配置文件路径
 37 |         radius=0.88r
 38 |         <rules>
 39 |             <rule>
 40 |                 color=red
 41 |                 condition=var(intrachr)    # 染色体内部共线性颜色配置
 42 |             </rule>
 43 |             <rule>
 44 |                 color=red
 45 |                 condition=var(interchr)   # 染色体间共线性颜色配置
 46 |             </rule>
 47 |         </rules>
 48 |         thickness=6
 49 |         z=20
 50 |     </link>
 51 |     <link>   # 区块共线性
 52 |         bezier_radius=0r
 53 |         bezier_radius_purity=0.75
 54 |         color=230,230,230,0.2   #背景区块颜色，此处设置的是灰色，透明度为 0.2
 55 |         crest=0.5
 56 |         ribbon=yes         # 区块连线设置为条带状
 57 |         file=/home/manager/share/cir/genome.txt       # 区块共线性配置文件路径
 58 |         radius=0.88r
 59 |         <rules>
 60 |             <rule>
 61 |                 condition=var(intrachr)
 62 |             </rule>
 63 |             <rule>
 64 |                 condition=var(interchr)
 65 |             </rule>
 66 |         </rules>
 67 |         thickness=1
 68 |         z=15
 69 |     </link>
 70 |     radius=0.40r
 71 |     thickness=1
 72 | </links>
 73 | 
 74 | # 基因名注释配置
 75 | <plots>
 76 |     <plot>
 77 |         color=set2-8-qual-2
 78 |         file=/home/manager/share/cir/text.txt     #基因名注释文件路径
 79 |         label_font=light
 80 |         link_color=black
 81 |         link_dims=0p,2p,5p,2p,2p
 82 |         link_thickness=2p
 83 |         r0=0.88r
 84 |         r1=0.99r
 85 |         rpadding=5p
 86 |         show_links=no
 87 |         type=text
 88 |     </plot>
 89 |     type=histogram
 90 | </plots>
 91 | show_tick_labels=yes
 92 | show_ticks=yes
 93 | spacing=10u
 94 | 
 95 | # 染色体刻度的设置
 96 | 
 97 | <ticks>
 98 |     color=black
 99 |     format=%d
100 |     multiplier=1e-6
101 |     radius=1r
102 |     thickness=2p
103 |     <tick>
104 |         size=10p
105 |         spacing=5u
106 |     </tick>
107 |     <tick>
108 |         color=black
109 |         format=%d
110 |         label_offset=10p
111 |         label_size=25p
112 |         show_label=yes
113 |         size=15p
114 |         spacing=25u
115 |         thickness=4p
116 |     </tick>
117 | </ticks>
118 | <colors>
119 | <<include etc/colors.conf>>
120 | <<include etc/brewer.conf>>
121 | #<<include etc/colors_fonts_patterns.conf>>
122 | #<<include colors.ucsc.conf>>
123 | #<<include colors.hsv.conf>>
124 | </colors>
125 | 
126 | <fonts>
127 | <<include etc/fonts.conf>>
128 | </fonts>
129 | 
130 | <image>
131 | <<include etc/image.conf>>
132 | </image>
133 | <<include etc/housekeeping.conf>>
134 | 
135 | 


--------------------------------------------------------------------------------
/config2.txt:
--------------------------------------------------------------------------------
  1 | chromosomes_units=1000000
  2 | chromosomes_reverse=/chr[01]/
  3 | <ideogram>
  4 |     fill=yes
  5 |     label_font=default
  6 |     label_parallel=yes
  7 |     label_radius=dims(image,radius)-60p
  8 |     label_size=45
  9 |     radius=0.90r
 10 |     show_label=yes
 11 |     <spacing>
 12 |         default=0.005r
 13 |     </spacing>
 14 |     stroke_color=dgrey
 15 |     stroke_thickness=2p
 16 |     thickness=0.03r
 17 | </ideogram>
 18 | karyotype=/home/manager/share/cir/chr.info
 19 | <links>
 20 |     bezier_radius=0r
 21 |     bezier_radius_purity=0.75
 22 |     color=black
 23 |     crest=0.5
 24 |     <link>
 25 |         bezier_radius=0r
 26 |         bezier_radius_purity=0.75
 27 |         color=set2-8-qual-1
 28 |         crest=0.5
 29 |         file=/home/manager/share/cir/link.txt
 30 |         radius=0.88r
 31 |         <rules>
 32 |             <rule>
 33 |                 color=green
 34 |                 condition=var(intrachr)
 35 |             </rule>
 36 |             <rule>
 37 |                 color=green
 38 |                 condition=var(interchr)
 39 |             </rule>
 40 |         </rules>
 41 |         thickness=8
 42 |         z=20
 43 |     </link>
 44 |     <link>
 45 |         bezier_radius=0r
 46 |         bezier_radius_purity=0.75
 47 |         color=230,230,230,0.2
 48 |         crest=0.5
 49 |         ribbon=yes
 50 |         file=/home/manager/share/cir/genome.txt
 51 |         radius=0.88r
 52 |         <rules>                # 背景区块个性化配置，不同染色体间背景区块的颜色
 53 |             <rule>
 54 |                 color=255,225,255,0.2
 55 |                 condition=between(chr01,chr02)
 56 |             </rule>
 57 |             <rule>
 58 |                 color=102,205,170,0.2
 59 |                 condition=between(chr04,chr05)
 60 |             </rule>
 61 |             <rule>
 62 |                 color=208,32,114,0.2
 63 |                 condition=between(chr01,chr05)
 64 |             </rule>
 65 |             <rule>
 66 |                 color=150,0,0,0.2
 67 |                 condition=between(chr02,chr03)
 68 |             </rule>
 69 | 
 70 |             <rule>
 71 |                 condition=var(interchr)
 72 |             </rule>
 73 |         </rules>
 74 |         thickness=1
 75 |         z=15
 76 |     </link>
 77 |     radius=0.40r
 78 |     thickness=1
 79 | </links>
 80 | <plots>
 81 |     <plot>
 82 |         color=set2-8-qual-2
 83 |         file=/home/manager/share/cir/text.txt
 84 |         label_font=light
 85 |         link_color=black
 86 |         link_dims=0p,2p,5p,2p,2p
 87 |         link_thickness=2p
 88 |         r0=0.88r
 89 |         r1=0.99r
 90 |         rpadding=5p
 91 |         show_links=no
 92 |         type=text
 93 |     </plot>
 94 |     type=histogram
 95 | </plots>
 96 | show_tick_labels=yes
 97 | show_ticks=yes
 98 | spacing=10u
 99 | <ticks>
100 |     color=black
101 |     format=%d
102 |     multiplier=1e-6
103 |     radius=1r
104 |     thickness=2p
105 |     <tick>
106 |         size=10p
107 |         spacing=5u
108 |     </tick>
109 |     <tick>
110 |         color=black
111 |         format=%d
112 |         label_offset=10p
113 |         label_size=25p
114 |         show_label=yes
115 |         size=15p
116 |         spacing=25u
117 |         thickness=4p
118 |     </tick>
119 | </ticks>
120 | <colors>
121 | <<include etc/colors.conf>>
122 | <<include etc/brewer.conf>>
123 | #<<include etc/colors_fonts_patterns.conf>>
124 | #<<include colors.ucsc.conf>>
125 | #<<include colors.hsv.conf>>
126 | </colors>
127 | 
128 | <fonts>
129 | <<include etc/fonts.conf>>
130 | </fonts>
131 | 
132 | <image>
133 | <<include etc/image.conf>>
134 | </image>
135 | <<include etc/housekeeping.conf>>
136 | 
137 | 


--------------------------------------------------------------------------------
/domain_xulie.pl:
--------------------------------------------------------------------------------
 1 | #北京组学生物科技有限公司
 2 | #email: huangls@biomics.com.cn
 3 | 
 4 | die "perl $0 <hmmoutfile> <fa> <OUT> <E-value>" unless ( @ARGV == 4 );
 5 | use Math::BigFloat;
 6 | use Bio::SeqIO;
 7 | use Bio::Seq;
 8 | $in = Bio::SeqIO->new(
 9 | 	-file   => "$ARGV[1]",
10 | 	-format => 'Fasta'
11 | );
12 | $out = Bio::SeqIO->new(
13 | 	-file   => ">$ARGV[2]",
14 | 	-format => 'Fasta'
15 | );
16 | my %keep = ();
17 | open IN, "$ARGV[0]" or die "$!";
18 | 
19 | while (<IN>) {
20 | 	chomp;
21 | 	next if /^#/;
22 | 
23 | 	my @a = split /\s+/;
24 | 	next if $a[6] > $ARGV[3];
25 | 	my @b = ( $a[17], $a[18] );
26 | 	my $keys = $a[0];
27 | 	if ($a[9]==1 and !exists $keep{$keys} ) { #提取序列中第一个结构域所在的序列
28 | 		$keep{$keys} = \@b;
29 | 
30 | 	}
31 | }
32 | close(IN);
33 | while ( my $seq = $in->next_seq() ) {
34 | 	my ( $id, $sequence, $desc ) = ( $seq->id, $seq->seq, $seq->desc );
35 | 
36 | 	if ( exists $keep{$id} ) {
37 | 		my $subseq = $seq->subseq( $keep{$id}->[0], $keep{$id}->[1]); #截取序列
38 | 		my $newseqobj = Bio::Seq->new(
39 | 			-seq  => $subseq,
40 | 			-desc => "domain:$keep{$id}[0]-$keep{$id}[1]",
41 | 			-id   => "$id",
42 | 		);
43 | 
44 | 		$out->write_seq($newseqobj);
45 | 	}
46 | }
47 | $in->close();
48 | $out->close();
49 | 


--------------------------------------------------------------------------------
/fq2fa.pl:
--------------------------------------------------------------------------------
  1 | #!usr/bin/perl -w
  2 | use strict;
  3 | use Getopt::Long;
  4 | use File::Basename;
  5 | 
  6 | #---------------------------------------- 模板 ----------------------------------------------------------------
  7 | #定义命令行参数
  8 | my $file;
  9 | my $help;
 10 | my $outFile = "";
 11 | 
 12 | GetOptions(
 13 | 		"i=s" => \$file,
 14 | 		"h|help" => \$help,
 15 | 		"o|outputfile" => \$outFile,
 16 | );
 17 | 
 18 | #检查参数
 19 | 
 20 | #定义help文档
 21 | sub prtHelp {
 22 | 	print "\n$0 options:\n\n";
 23 | 	print "### Input reads (FASTQ) (Required)\n";
 24 | 	print "  -i <FASTQ read file>\n";
 25 | 	print "    Read file in FASTQ format\n";
 26 | 	print "\n";
 27 | 	print "### Other options [Optional]\n";
 28 | 	print "  -h | -help\n";
 29 | 	print "    Prints this help\n";
 30 | 	print "  -o | -outputFile <Output file name>\n";
 31 | 	print "    Output will be stored in the given file\n";
 32 | 	print "    default: By default, file will be stored where the input file is\n";
 33 | 	print "\n";
 34 | }
 35 | 
 36 | 
 37 | sub prtError {
 38 | 	my $msg = $_[0];
 39 | 	print STDERR "+======================================================================+\n";
 40 | 	printf STDERR "|%-60s|\n", "  Error:";	
 41 | 	printf STDERR "|%-70s|\n", "       $msg";
 42 | 	print STDERR "+======================================================================+\n";
 43 | 	prtUsage();
 44 | 	exit;
 45 | }
 46 | 
 47 | 
 48 | #定义usage
 49 | 
 50 | sub prtUsage {
 51 | 	print "\nUsage:perl $0 <options>";
 52 | 	prtHelp();
 53 | } 
 54 | 
 55 | 
 56 | if (defined($help)) {
 57 | 	prtUsage();
 58 | 	exit;
 59 | }
 60 | 
 61 | if (!defined($file)) {
 62 | 	prtError("NO input files are provided");
 63 | }
 64 | 
 65 | #---------------------------------------- 模板 ----------------------------------------------------------------
 66 | 
 67 | 
 68 | 
 69 | 
 70 | 
 71 | #-----------------------------------------主程序 ---------------------------------------------------------------
 72 | 	#自定义输出路径
 73 | 
 74 | my ($filename,$filepath) = fileparse ($file);
 75 | $outFile = $file . "_fasta" if ($outFile eq "");
 76 | 
 77 | open I, "<$file" or die "can not open file:$file\n";
 78 | open OF, ">$outFile" or die "can not open file:$outFile\n";
 79 | 
 80 | sub formatseq {
 81 | 	my $seq = $_[0];
 82 | 	$seq =~ s/(\w{50})/$1\n/g;
 83 | 	return $seq;
 84 | }
 85 | #-----------------------------------------主程序 ---------------------------------------------------------------
 86 | 
 87 | 
 88 | 
 89 | #下面这是修改前的源代码
 90 | 
 91 | #-----------------------------------------------------------------------------#
 92 | #sub foramtseq {
 93 | #	my $seq = $_[0];
 94 | #	my $newseq = "";
 95 | #	my $ch = 60;
 96 | #	for (my $i = 0; $i <length $seq; $i += $ch) {
 97 | #		$newseq .= substr($seq,$i,$ch) . "\n";
 98 | #	}
 99 | #	chomp $newseq;
100 | #	return $newseq;
101 | #}
102 | #-----------------------------------------------------------------------------#
103 | 
104 | 
# Convert every 4-line FASTQ record (header, sequence, '+' line, qualities)
# into a FASTA record.
while (defined(my $line = <I>)) {
	chomp $line;
	$line =~ s/\r$//;
	next if $line eq "";    # blank lines (typically at end of file) are not records

	# A record must start with '@'. Anything else means the 4-line framing is
	# lost and every following record would be converted wrongly.
	my $id = $line;
	$id =~ s/^\@//
		or die "Malformed FASTQ near line $. of $file: expected '\@' header, found '$line'\n";

	my $seq = <I>;
	die "Truncated FASTQ record '$id' in $file: sequence line missing\n"
		unless defined $seq;

	print OF ">$id\n";
	print OF formatseq($seq);

	# Skip the '+' separator line and the quality line.
	<I>;
	<I>;
}

close I;
# Buffered write errors (for example a full disk) only surface at close.
close OF or die "can not finish writing file:$outFile ($!)\n";
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/gene_family_analysis_pipeline.sh:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # ----------------------------------------------- 基因家族分析 ---------------------------------------------------
  4 | 
  5 | 
  6 | #--------------------------  1. 搜索物种的基因家族信息 ----------------------------------------
  7 | 
  8 | # Things required:
  9 | # 所需文件：
 10 | 
 11 | # 	基因组 fasta 文件  whole genome about your species
 12 | 	
 13 | # 	基因组 CDS fasta 文件   CDS file
 14 | 
 15 | # 	基因组 蛋白质 fasta 文件 protein file
 16 | 
 17 | # 	基因注释文件 gff    gff file
 18 | 
 19 | # 	基因家族 pfam 文件   your protein PFAM accession number (.hmm file)
 20 | 
 21 | 
 22 | #下载拟南芥基因组信息
 23 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz
 24 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz
 25 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/fasta/arabidopsis_thaliana/pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz
 26 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-41/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.41.gff3.gz
 27 | #
 28 | #解压压缩文件 decompressed gz file
 29 | #gunzip *gz
 30 | 
 31 | # optional step (just change some file format)
 32 | 
 33 | #处理GFF 文件里面ID中一些不必要的信息，gene:  transcript: 删除；与蛋白质中的ID保持一致：Arabidopsis_thaliana.TAIR10.pep.all.fa 
 34 | #sed -i 's#gene:##' Arabidopsis_thaliana.TAIR10.41.gff3
 35 | #sed -i 's#transcript:##' Arabidopsis_thaliana.TAIR10.41.gff3
 36 | #sed -i 's#CDS:##' Arabidopsis_thaliana.TAIR10.41.gff3
 37 | 
 38 | 
 39 | #------------------- gff文件  获取基因与mRNA的对应关系
 40 | 
 41 | perl script/mRNAid_to_geneid.pl Arabidopsis_thaliana.TAIR10.41.gff3 mRNA2geneID.txt
 42 | perl script/geneid_to_mRNAid.pl Arabidopsis_thaliana.TAIR10.41.gff3 geneid2mRNAid.txt
 43 | 
 44 | #与Arabidopsis_thaliana.TAIR10.pep.all.fa 文件中的ID保持一致，如果前面第34-36行的 sed 处理没有做，这里可以补做；
 45 | #sed -i 's#gene:##' mRNA2geneID.txt
 46 | #sed -i 's#transcript:##' mRNA2geneID.txt
 47 | #sed -i 's#CDS:##' mRNA2geneID.txt
 48 | 
 49 | #---------------------  在 蛋白文件  中搜索 基因家族 保守结构域
 50 | 
 51 | hmmsearch --domtblout WRKY_hmm_out.txt --cut_tc WRKY.hmm Arabidopsis_thaliana.TAIR10.pep.all.fa
 52 | 
 53 | 
 54 | # -------------------  筛选比对搜索结果，并提取 保守结构域 序列
 55 | 
 56 | #提取结构域序列，脚本最后的evalue参数1.2e-28，根据实际情况可调,大于这个E值脚本会跳过这个一行;注意脚本提取的是第一个domain，如要提取其他domain，请修改脚本27行$a[9]==1为第一个，$a[9]==2为第二个，依次类推
 57 | 
 58 | perl script/domain_xulie.pl WRKY_hmm_out.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_domain.fa 1.2e-28
 59 | 
 60 | 
 61 | 
 62 | ###########以下部分为建立物种特异模型再次搜索，可根据自己基因家族情况选做这部分内容#############################  ( 这一步可选 )
 63 | 
 64 | 
 65 | #clusterW多序列比对快捷方法
 66 | 
 67 | echo "1\nWRKY_domain.fa\n2\n1\nWRKY_domain.aln\nWRKY_domain.dnd\nX\n\n\nX\n" |clustalw
 68 | 
 69 | #利用比对结果建立物种特异hmm模型
 70 | hmmbuild WRKY_domain_new.hmm WRKY_domain.aln
 71 | 
 72 | #新建物种特异hmm模型，再次搜索
 73 | 
 74 | hmmsearch --domtblout WRKY_domain_new_out.txt --cut_tc WRKY_domain_new.hmm Arabidopsis_thaliana.TAIR10.pep.all.fa
 75 | 
 76 | ############################################################################################################
 77 | 
 78 | 
 79 | 
 80 | # ------------------------ 转录本选择，去冗余
 81 | 
 82 | #筛选 hmm搜索结果，可以用excel手动筛选，筛选标准，
 83 | #1.E-value值小于0.001；
 84 | #2.如果有多个转录本选第一个转录本
 85 | #3.只有一个转录本，就选那个转录本
 86 | 
 87 | #筛选EValue  <0.001
 88 | #如果只想用hmmer搜索一次，可将下面的文件：WRKY_domain_new_out.txt 替换成 第51行 hmmsearch 生成的文件：WRKY_hmm_out.txt
 89 | grep -v "^#" WRKY_domain_new_out.txt|awk '$7<0.001 {print}' >WRKY_domain_new_out_selected.txt
 90 | 
 91 | 
 92 | #去除重复的hmmer搜索的转录本ID，多个转录本ID保留一个作为基因的代表，此步建议对脚本输出的文件手动筛选，挑选ID：
 93 | perl script/select_redundant_mRNA.pl mRNA2geneID.txt WRKY_domain_new_out_selected.txt WRKY_remove_redundant_IDlist.txt
 94 | 
 95 | 
 96 | #请手动挑选完mRNA的ID放在第一列，也就是挑选一个转录本ID代表这个基因，存成新的文件WRKY_removed_redundant_IDlist.txt：
 97 | 
 98 | 
 99 | # ------------------------ 提取 筛选过后 转录本所对应的 基因的序列
100 | 
101 | #利用脚本得到对应基因的蛋白序列，脚本会读取第一个文件的第一列ID，把对应ID的序列提取出来：
102 | perl script/get_fa_by_id.pl WRKY_removed_redundant_IDlist.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_pep_need_to_confirm.fa
103 | 
104 | 
105 | #将上面WRKY_pep_need_to_confirm.fa文件中的蛋白序列，再手动验证一下，把不需要的ID删除，最终确认：WRKY_removed_redundant_IDlist.txt 存成新文件：WRKY_removed_redundant_and_confirmed_IDlist.txt
106 | 
107 | #手动确认结构域，CDD，SMART，PFAM
108 | #确定分子量大小：http://web.expasy.org/protparam/
109 | #perl script/stat_protein_fa.pl WRKY_pep_need_to_confirm.fa WRKY_pep_need_to_confirm.MW.txt
110 | #三大数据库网站，筛选之后去除一些不确定的基因ID，最终得到可靠的基因家族基因列表,存储在文件：WRKY_removed_redundant_and_confirmed_IDlist.txt ; 
111 | 
112 | 
113 | #脚本提取hmm结果文件，重新筛选一下hmm的结果：
114 | 
115 | perl script/get_data_by_id.pl WRKY_removed_redundant_and_confirmed_IDlist.txt WRKY_domain_new_out_selected.txt WRKY_domain_new_out_removed_redundant.txt
116 | 
117 | #截取得到序列上的保守结构域序列，注意脚本提取的是第一个domain，如要提取其他domain，请修改脚本27行$a[9]==1为第一个，$a[9]==2为第二个，依次类推
118 | 
119 | perl script/domain_xulie.pl WRKY_domain_new_out_removed_redundant.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_domain_confirmed.fa 0.1
120 | 
121 | #得到对应基因的蛋白序列全长：
122 | 
123 | perl script/get_fa_by_id.pl WRKY_domain_new_out_removed_redundant.txt Arabidopsis_thaliana.TAIR10.pep.all.fa WRKY_pep_confirmed.fa
124 | 
125 | #得到对应基因的cds序列：
126 | 
127 | perl script/get_fa_by_id.pl WRKY_domain_new_out_removed_redundant.txt Arabidopsis_thaliana.TAIR10.cds.all.fa WRKY_cds_confirmed.fa
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | ########################进化树分析##########################################
135 | 
136 | # 所需文件
137 | 
138 | # 	上面搜索到的 基因家族蛋白质保守结构域序列(也可以用蛋白序列全长)
139 | 
140 | # 	mega 分析绘制 树
141 | 
142 | # 	Evoview 进行美化
143 | 
144 | #cd $workdir 回到工作路径
145 | 
146 | mkdir gene_tree_analysis
147 | cd gene_tree_analysis
148 | cp ../WRKY_domain_confirmed.fa .
149 | cp ../WRKY_pep_confirmed.fa .
150 | cp ../WRKY_cds_confirmed.fa .
151 | cp ../WRKY_domain_new_out_removed_redundant.txt .
152 | 
153 | 
154 | #########################利用meme软件做motif分析################################
155 | 
156 | 
157 | 所需文件
158 | 
159 | 	基因家族蛋白质 全长
160 | 
161 | #cd $workdir
162 | mkdir meme_motif_analysis
163 | cd meme_motif_analysis
164 | #搜索结构域：
165 | #-nmotifs 10  搜索motif的总个数
166 | #-minw 6   motif的最短长度
167 | #-maxw 100   motif的最大长度
168 | 
169 | # meme 输出 motif 图
170 | 
171 | /biosoft/meme/meme-v4.12.0/bin/meme ../WRKY_pep_confirmed.fa -protein -oc ./ -nostatus -time 18000 -maxsize 6000000 -mod anr -nmotifs 10 -minw 6 -maxw 100
172 | 
173 | 
174 | 
175 | 
176 | ##################################基因结构分析structure####################
177 | 
178 | 所需文件
179 | 
180 | 	基因蛋白结构域所对应的 geneID 信息
181 | 
182 | 	gff 文件
183 | 
184 | #cd $workdir 回到工作路径
185 | cd $workdir
186 | mkdir gene_structure_analysis
187 | cd gene_structure_analysis
188 | cp ../WRKY_domain_new_out_removed_redundant.txt .
189 | 
190 | 
191 | 
192 | #获得基因的在染色体上的外显子，CDS，UTR位置信息，用于绘制基因结构图
193 | 
194 | perl ../script/get_gene_exon_from_gff.pl -in1 WRKY_domain_new_out_removed_redundant.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out gene_exon_info.gff
195 | 
196 | # 将上述的 gff文件放到 GSDS 网站即可绘制图形
197 | 
198 | ################################基因定位到染色体############################################### （蜈蚣图）
199 | 
200 | 所需文件
201 | 
202 | 	基因家族 geneID
203 | 
204 | 	染色体 长度信息
205 | 
206 | #cd $workdir 回到工作路径
207 | cd $workdir
208 | mkdir map_to_chr
209 | cd map_to_chr
210 | cp ../WRKY_domain_new_out_removed_redundant.txt .    #WRKY基因家族文件
211 | 
212 | 
213 | #获得基因的在染色体上的位置信息，用于绘制染色体位置图,注意提取的是基因位置还是mRNA位置,以下代码是提取的 mRNA位置
214 | perl ../script/get_gene_weizhi.pl -in1 WRKY_domain_new_out_removed_redundant.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out mrna_location.txt
215 | 
216 | #获得基因组染色体长度：
217 | samtools faidx ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa
218 | 
219 | cp ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.fai .
220 | 
221 | #绘图参考：http://www.omicsclass.com/article/397
222 | 
223 | 
224 | ###############################blast方法 复制基因查找  及KAKS分析#################################
225 | 
226 | 基因串联重复
227 | 	
228 | 	两个基因比对率 > 70% (相对于较长的基因)，且基因的比对相似性 > 70%
229 | 	两个基因在同一条染色体上，且位置 < 100 kb
230 | 
231 | 
232 | 所需文件
233 | 
234 | 	基因家族 geneID
235 | 
236 | 	基因家族 CDS 序列
237 | 
238 | 
239 | cd $workdir    # return to the working directory: the previous section ends inside map_to_chr, where ../WRKY_cds_confirmed.fa does not exist
240 | mkdir gene_duplication_kaks_blast
241 | cd gene_duplication_kaks_blast
242 | cp ../WRKY_domain_new_out_removed_redundant.txt .
243 | cp ../WRKY_cds_confirmed.fa .
244 | # Build a nucleotide BLAST database from the family CDS and run an all-vs-all blastn (tabular -m 8 output); result format notes: http://www.omicsclass.com/article/505
245 | makeblastdb -in WRKY_cds_confirmed.fa -dbtype nucl -title WRKY_cds_confirmed.fa 
246 | blastall -i WRKY_cds_confirmed.fa -d WRKY_cds_confirmed.fa -p blastn -e 1e-20  -m 8 -o WRKY_cds_confirmed_blast.out
247 | 
248 | #获取基因cds序列的长度：
249 | samtools faidx WRKY_cds_confirmed.fa
250 | 
251 | perl ../script/KAKS_SHAIXUAN.pl -in1 WRKY_cds_confirmed.fa.fai -in2 WRKY_cds_confirmed_blast.out -out duplication_gene.out
252 | 
253 | 
254 | # 对上面的结果进行去重复 
255 | 基因之间两两比对，会存在 AvsB BvsA 因此用下面的脚本去重复
256 | 
257 | perl ../clean_blastall.pl duplication_gene.out
258 | 
259 | 
260 | ###kaks 分析###
261 | 
262 | 所需文件
263 | 	上面的 基因家族 复制基因
264 | 
265 | 
266 | # Extract the CDS of one duplicated gene pair. printf is used instead of echo because bash's echo does not expand "\n" by default.
267 | printf 'AT1G66600.1\nAT1G66560.1\n' >dupid.txt
268 | perl ../script/get_fa_by_id.pl dupid.txt WRKY_cds_confirmed.fa dup_gene_paired1.fa
269 | 
270 | # Pairwise alignment: feed clustalw's interactive menu its answers, one per line
271 | printf '1\ndup_gene_paired1.fa\n2\n9\n4\n\n1\ndup_gene_paired1.aln\ndup_gene_paired1.dnd\nX\n\n\nX\n\n' |clustalw
272 | 
273 | # Convert the alignment to AXT, then run KaKs_Calculator (one gene pair per run). If AXTConvertor reports "not equal", see http://www.omicsclass.com/article/700
274 | /biosoft/KaKs_Calculator2.0/src/AXTConvertor dup_gene_paired1.aln dup_gene_paired1.axt
275 | /biosoft/KaKs_Calculator2.0/bin/Linux/KaKs_Calculator  -i dup_gene_paired1.axt -o dup_gene_paired1.kaks.result
276 | 
277 | #分离时间计算：http://www.omicsclass.com/question/896
278 | 
279 | 
280 | ###########################################以下blast为可选分析内容########################################################################
281 | 
282 | 所需文件 
283 | 	
284 | 	从 NCBI 下载蛋白序列
285 | 
286 | #blastp比对寻找基因家族成员，WRKY部分
287 | #参考文献：https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4955-8
288 | #NCBI上搜索WRKY蛋白序列：搜索条件：WRKY[title] NOT putative[title] AND plants[filter]
289 | 
290 | 
291 | #blast比对首先建库
292 | #makeblastdb -in WRKY_NCBI_pep.fasta -dbtype prot -title WRKY_NCBI_pep.fasta   #蛋白质序列
293 | #
294 | #blastp比对
295 | #blastall -i ../Arabidopsis_thaliana.TAIR10.pep.all.fa -d WRKY_NCBI_pep.fasta -p blastp -e 1e-10 -b 1 -v 1 -m 8 -o ncbi_WRKY_blast.out 
296 | 
297 | 利用上述的比对结果提取 序列间的局部匹配区域，利用匹配区域，进行 clustalw 多序列比对,然后通过 hmmbuild 构建物种的hmm文件 
298 | 
299 | 
300 | #######################基因上游顺式作用元件分析#######################################
301 | 
302 | 所需文件
303 | 
304 | 	geneID 信息
305 | 	
306 | 	gff 文件
307 | 
308 | 	基因组文件
309 | 
310 | #回到工作路径
311 | cd $workdir
312 | mkdir gene_promoter
313 | cd gene_promoter
314 | cp ../WRKY_domain_new_out_removed_redundant.txt .
315 | 
316 | #得到基因在染色体上的位置，此脚本会把基因组所有的序列读入内存，如果基因组较大，可能因为内存不足使脚本运行不成功，可以分染色体分开分析：
317 | perl ../script/get_gene_weizhi.pl -in1 WRKY_domain_new_out_removed_redundant.txt -in2 ../Arabidopsis_thaliana.TAIR10.41.gff3 -out mrna_location.txt
318 | #根据位置信息提取，promoter序列 1500
319 | perl ../script/get_promoter.pl ../Arabidopsis_thaliana.TAIR10.dna.toplevel.fa mrna_location.txt promoter.fa
320 | 
321 | #生成 GSDS配置文件
322 | cat WRKY_domain_new_out_removed_redundant.txt|awk 'BEGIN{OFS="\t"}{print $1,"0","1500","CDS","."}' >gene.bed
323 | #生成feature文件
324 | cat PlantCARE_9210__plantCARE/plantCARE_output_PlantCARE_9210.tab|grep "Arabidopsis"|awk -F"\t"  'BEGIN{OFS="\t"} {print $1,$4,$4+length($3),$2}'>feature.bed
325 | 


--------------------------------------------------------------------------------
/geneid_to_mRNAid.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | # Build a gene -> transcript table from a GFF3 file.
 3 | #
 4 | # Usage: perl geneid_to_mRNAid.pl <gff> <outfile>
 5 | #
 6 | # Output is tab-separated, one gene per line, sorted by gene ID:
 7 | #   gene_ID  chr  start  end  strand  transcript_id [transcript_id ...]
 8 | # A "gene" is any feature whose type (column 3) starts with "gene"; a
 9 | # transcript is any feature whose type contains "mRNA" or "transcript"
10 | # (case-insensitive). A gene line must precede its transcripts, and a
11 | # transcript whose Parent is not a known gene aborts the run.
12 | use strict;
13 | use Cwd qw(abs_path getcwd);
14 | use Getopt::Long;
15 | use Data::Dumper;
16 | 
17 | die "perl $0 <gff> <outfile>" unless(@ARGV==2);
18 | my ($gff, $outfile) = @ARGV;
19 | my %gene        = ();    # gene ID -> array ref of its transcript IDs
20 | my %gene_region = ();    # gene ID -> "chr\tstart\tend\tstrand"
21 | 
22 | # Three-argument open: the file name can never be parsed as a mode or a pipe.
23 | open my $in_fh, '<', $gff or die "open $gff: $!";
24 | 
25 | while (my $line = <$in_fh>) {
26 |     chomp $line;
27 |     next if $line =~ /^#/;
28 |     my @tmp = split /\t/, $line;
29 |     # Blank or truncated lines have no type/attribute columns to inspect.
30 |     next if @tmp < 9;
31 | 
32 |     if ($tmp[2] =~ /^gene/) {
33 |         my ($id) = ($tmp[8] =~ /ID=([^;]+)/);
34 |         if (defined $id) {
35 |             $gene{$id}        = [];
36 |             $gene_region{$id} = "$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]";
37 |         }
38 |     }
39 |     if ($tmp[2] =~ /mRNA|transcript/i) {
40 |         my ($id)  = ($tmp[8] =~ /ID=([^;]+)/);
41 |         my ($pid) = ($tmp[8] =~ /Parent=([^;]+)/);
42 |         $id = '' unless defined $id;    # keeps the messages below warning-free
43 | 
44 |         if (defined $pid and exists $gene{$pid}) {
45 |             push @{ $gene{$pid} }, $id;
46 |         }
47 |         else {
48 |             die "please check mRNA $id has gene ID \n";
49 |         }
50 |     }
51 | }
52 | close($in_fh);
53 | 
54 | open my $out_fh, '>', $outfile or die "open $outfile: $!";
55 | print {$out_fh} "#gene_ID\tchr\tstart\tend\tstrand\ttranscript_id\n";
56 | # Sorted keys make the output reproducible from run to run.
57 | for my $id (sort keys %gene) {
58 |     print {$out_fh} "$id\t$gene_region{$id}\t" . join("\t", sort @{ $gene{$id} }) . "\n";
59 | }
60 | # Buffered write errors only surface at close, so check it.
61 | close($out_fh) or die "close $outfile: $!";


--------------------------------------------------------------------------------
/genome.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | # 背景区块的配置文件
 3 | # 基于mcscanx分析结果，选取染色体上对应基因的第一个和最后一个
 4 | 
 5 | chr04	235985192	238520393	chr05	89127420	77919313
 6 | chr01	177745754	179756069	chr01	263221976	261689189
 7 | chr02	226660231	227489681	chr04	21034937	23267356
 8 | chr01	257237981	259662579	chr05	16839069	18254076
 9 | chr02	7313083	10264901	chr04	29042531	25887059
10 | chr01	72781187	78190879	chr02	213991895	211892225
11 | chr03	10161437	12538552	chr06	126316653	127947519
12 | chr01	13199428	16659014	chr05	33280286	36117488
13 | chr02	11039996	22253136	chr05	197970559	185883292
14 | chr01	247942338	252515573	chr05	21179147	18361206
15 | chr03	9637008	38688558	chr08	18392025	3044143
16 | chr04	235545789	235928643	chr05	89236231	93996019
17 | chr01	263545596	301474757	chr05	13038255	960992
18 | chr02	224426294	224827815	chr04	16011093	15055409
19 | chr04	235076669	235495146	chr05	101090117	95339508
20 | chr01	100393815	104124989	chr05	43593097	44784264
21 | chr05	40695082	42005127	chr06	89614367	90338798
22 | chr01	79166192	82359578	chr02	15670012	13962160
23 | chr01	3635462	4144525	chr01	78813994	79779467
24 | chr02	191431054	193577343	chr05	207321134	206112159
25 | chr01	3131661	3450838	chr01	228088856	226688602
26 | chr01	225786615	242895913	chr05	20236510	33124260
27 | chr02	236303593	238275076	chr05	3518540	4544483
28 | chr05	57879365	64457951	chr06	85758193	80992737
29 | chr04	237005472	238418562	chr06	90241588	85971864
30 | chr03	12324004	15125672	chr06	120931084	123310948
31 | chr02	28317840	32416047	chr04	143101873	135253384
32 | chr02	25672897	27242359	chr04	147014326	144062146
33 | chr02	38300581	42362725	chr04	129149685	125170216
34 | chr03	175562023	177176703	chr06	146135846	146803277
35 | chr02	140597842	146529585	chr03	140747563	142885052
36 | chr02	147210539	159658223	chr04	191153761	185126620
37 | chr01	186306392	187840847	chr01	188087101	189667363
38 | chr01	209188508	210401506	chr04	218276481	216110408
39 | 


--------------------------------------------------------------------------------
/get_data_by_id.pl:
--------------------------------------------------------------------------------
 1 | # Keep only the rows of a whitespace-delimited data file whose first column
 2 | # appears as the first column of an ID list. Lines of the data file that
 3 | # start with "#" are always copied through.
 4 | #   perl get_data_by_id.pl <id_list> <data_file> <out_file>
 5 | unless (@ARGV == 3) {
 6 |     die if print "perl $0   <id_list>  <data_file> <out_file>\n";
 7 | }
 8 | 
 9 | # Pass 1: remember the first field of every ID-list line.
10 | my %wanted;
11 | open IN,"$ARGV[0]" or die "$!";
12 | while (my $entry = <IN>) {
13 |     chomp $entry;
14 |     my ($key) = split(/\s+/, $entry);
15 |     $wanted{$key} = 1;
16 | }
17 | close(IN);
18 | 
19 | # Pass 2: stream the data file, echoing comment lines and wanted rows.
20 | open IN,"$ARGV[1]" or die "$!";
21 | open OUT,">$ARGV[2]" or die "$!";
22 | while (my $row = <IN>) {
23 |     chomp $row;
24 |     my ($first) = split(/\s+/, $row);
25 |     print OUT "$row\n" if $row =~ /^#/ or exists $wanted{$first};
26 | }
27 | close(IN);
28 | close(OUT);


--------------------------------------------------------------------------------
/get_fa_by_id.pl:
--------------------------------------------------------------------------------
 1 | # Origin: Biomics (Beijing) -- email: huangls@biomics.com.cn
 2 | #
 3 | # Extract FASTA records by ID.
 4 | #   perl get_fa_by_id.pl <idlist> <fa> <OUT>
 5 | # The first whitespace-separated field of every non-"#" line in <idlist>
 6 | # is an ID; every record of <fa> whose ID is listed is written to <OUT>.
 7 | 
 8 | die "perl $0 <idlist> <fa> <OUT>" unless ( @ARGV == 3 );
 9 | use Math::BigFloat;
10 | use Bio::SeqIO;
11 | use Bio::Seq;
12 | 
13 | my ( $id_list, $fasta_in, $fasta_out ) = @ARGV;
14 | 
15 | # Open the FASTA reader first, then the writer (which creates the output file).
16 | my $reader = Bio::SeqIO->new( -file => "$fasta_in",   -format => 'Fasta' );
17 | my $writer = Bio::SeqIO->new( -file => ">$fasta_out", -format => 'Fasta' );
18 | 
19 | # Load the IDs to keep.
20 | my %keep = ();
21 | open IN, "$id_list" or die "$!";
22 | while ( my $entry = <IN> ) {
23 |     chomp $entry;
24 |     next if $entry =~ /^#/;
25 |     my ($first) = split /\s+/, $entry;
26 |     $keep{$first} = 1;
27 | }
28 | close(IN);
29 | 
30 | # Stream the FASTA file and copy the requested records.
31 | while ( my $record = $reader->next_seq() ) {
32 |     $writer->write_seq($record) if exists $keep{ $record->id };
33 | }
34 | $reader->close();
35 | $writer->close();


--------------------------------------------------------------------------------
/get_fa_by_id_from_bed.pl:
--------------------------------------------------------------------------------
 1 | # Origin: Biomics (Beijing) -- email: huangls@biomics.com.cn
 2 | #
 3 | # Extract FASTA records for the genes named in a tab-separated (BED-like) file.
 4 | #   perl get_fa_by_id_from_bed.pl <idlist> <fa> <OUT>
 5 | # The 4th tab-separated column of each non-"#" line, with ".1" appended
 6 | # (i.e. the first transcript), is the record ID that is looked up in <fa>.
 7 | 
 8 | die "perl $0 <idlist> <fa> <OUT>" unless ( @ARGV == 3 );
 9 | use Math::BigFloat;
10 | use Bio::SeqIO;
11 | use Bio::Seq;
12 | 
13 | my ( $bed_path, $fasta_in, $fasta_out ) = @ARGV;
14 | 
15 | # Reader for the input sequences, writer for the selected ones.
16 | my $reader = Bio::SeqIO->new( -file => "$fasta_in",   -format => 'Fasta' );
17 | my $writer = Bio::SeqIO->new( -file => ">$fasta_out", -format => 'Fasta' );
18 | 
19 | # Collect "<name>.1" keys from column 4.
20 | my %keep = ();
21 | open IN, "$bed_path" or die "$!";
22 | while ( my $entry = <IN> ) {
23 |     chomp $entry;
24 |     next if $entry =~ /^#/;
25 |     my @cols = split /\t/, $entry;
26 |     $keep{ "$cols[3].1" } = 1;    # note: only the first transcript is selected
27 | }
28 | close(IN);
29 | 
30 | # Copy every record whose ID was requested.
31 | while ( my $record = $reader->next_seq() ) {
32 |     $writer->write_seq($record) if exists $keep{ $record->id };
33 | }
34 | $reader->close();
35 | $writer->close();


--------------------------------------------------------------------------------
/get_fa_by_id_from_gff.pl:
--------------------------------------------------------------------------------
 1 | # script www.omicsclass.com
 2 | #
 3 | # Extract FASTA records for the IDs found in the 2nd whitespace-separated
 4 | # column of <id>; ".1" is appended to each ID before the lookup.
 5 | die "perl $0 <id><fa><OUT>" unless(@ARGV==3);
 6 | 
 7 | use Bio::SeqIO;
 8 | use Bio::Seq;
 9 | 
10 | my ($id_path, $fasta_in, $fasta_out) = @ARGV;
11 | my $reader = Bio::SeqIO->new(-file => "$fasta_in",   -format => 'Fasta');
12 | my $writer = Bio::SeqIO->new(-file => ">$fasta_out", -format => 'Fasta');
13 | 
14 | # Build the lookup of "<column 2>.1" keys, ignoring "#" lines.
15 | my %keep = ();
16 | open IN ,"$id_path" or die "$!";
17 | while (my $entry = <IN>) {
18 |     chomp $entry;
19 |     next if $entry =~ /^#/;
20 |     my @fields = split(/\s+/, $entry);
21 |     $keep{ "$fields[1].1" } = 1;
22 | }
23 | close(IN);
24 | 
25 | # Write out every sequence whose ID is in the lookup.
26 | while (my $record = $reader->next_seq()) {
27 |     $writer->write_seq($record) if exists $keep{ $record->id };
28 | }
29 | $reader->close();
30 | $writer->close();


--------------------------------------------------------------------------------
/get_gene_bed.pl:
--------------------------------------------------------------------------------
 1 | # Convert the "gene" features of a GFF3 file into a 6-column tab-separated
 2 | # table: chr, start, end, ID, GFF column 8, strand.
 3 | #   -in1  input GFF3 file
 4 | #   -out  output file
 5 | #   -h    print usage and exit
 6 | use Getopt::Long;
 7 | my %opts;
 8 | use Data::Dumper;
 9 | GetOptions( \%opts, "in1=s", "out=s", "h" );
10 | if ( !defined( $opts{in1} ) || !defined( $opts{out} ) || defined( $opts{h} ) ) {
11 |     USAGE();
12 | }
13 | # Three-argument open with lexical handles: file names are never parsed as modes.
14 | open( my $gff_fh, '<', $opts{in1} ) || die "open $opts{in1} failed\n";
15 | open( my $bed_fh, '>', $opts{out} ) || die "open $opts{out} failed\n";
16 | 
17 | while ( my $line = <$gff_fh> ) {
18 |     chomp $line;
19 |     my @a = split /\t/, $line;
20 |     # Comment, blank and truncated lines lack the 9 GFF columns.
21 |     next if @a < 9;
22 |     # Change "gene" to "mRNA" here to export transcript intervals instead.
23 |     next unless $a[2] eq "gene";
24 | 
25 |     # Only use the capture when the match succeeded; otherwise the ID
26 |     # would be undefined or left over from an earlier match.
27 |     my ($id) = $a[8] =~ m/ID=([^;]*)/;
28 |     next unless defined $id;
29 | 
30 |     print {$bed_fh} "$a[0]\t$a[3]\t$a[4]\t$id\t$a[7]\t$a[6]\n";
31 | }
32 | # Buffered write errors only surface at close.
33 | close($bed_fh) || die "close $opts{out} failed\n";
34 | close($gff_fh);
35 | 
36 | # Print the command-line synopsis and stop.
37 | sub USAGE {
38 |     print "usage: perl $0 -in1  gff   -out gene_location.bed ";
39 |     exit;
40 | }


--------------------------------------------------------------------------------
/get_gene_exon_from_gff.pl:
--------------------------------------------------------------------------------
 1 | # Copy the mRNA/transcript, CDS and UTR records of selected transcripts out
 2 | # of a GFF3 file.
 3 | #   -in1  ID list; the first whitespace-separated field of each non-"#" line is a key
 4 | #   -in2  GFF3 file
 5 | #   -out  file receiving the matching GFF lines unchanged
 6 | use Getopt::Long;
 7 | use Data::Dumper;
 8 | my %opts;
 9 | GetOptions( \%opts, "in1=s", "in2=s", "out=s", "h" );
10 | &USAGE if defined( $opts{h} ) || grep { !defined $opts{$_} } qw(in1 in2 out);
11 | 
12 | open( IN1, "$opts{in1}" )  || die "open $opts{in1} failed\n";
13 | open( IN2, "$opts{in2}" )  || die "open $opts{in2} failed\n";
14 | open( OUT, ">$opts{out}" ) || die "open $opts{out} failed\n";
15 | 
16 | # Wanted transcript IDs.
17 | my %gffs;
18 | while ( my $entry = <IN1> ) {
19 |     chomp $entry;
20 |     next if $entry =~ /^#/;
21 |     my ($key) = split /\s+/, $entry;
22 |     $gffs{$key} = 1;
23 | }
24 | 
25 | while ( my $record = <IN2> ) {
26 |     chomp $record;
27 |     next if $record =~ /^#/;
28 |     my @col  = split /\t/, $record;
29 |     my $type = $col[2];
30 |     next if $type =~ /exon/i;
31 | 
32 |     # Transcript rows are identified by their own ID, CDS/UTR rows by
33 |     # their Parent; every other feature type is ignored.
34 |     my $attr =
35 |         ( $type =~ /^mRNA$/i || $type =~ /^transcript$/i ) ? 'ID'
36 |       : ( $type =~ /^CDS$/i  || $type =~ /utr/i )          ? 'Parent'
37 |       :                                                      undef;
38 |     next unless defined $attr;
39 | 
40 |     my ($owner) = ( $col[8] =~ m/${attr}=([^;]*)/ );
41 |     print OUT "$record\n" if exists $gffs{$owner};
42 | }
43 | close OUT;
44 | close IN1;
45 | close IN2;
46 | 
47 | # Print the command-line synopsis and stop.
48 | sub USAGE {
49 |     print "usage: perl $0 -in1  mRNA_id.txt -in2  genome.gff3  -out gene_location.txt ";
50 |     exit;
51 | }


--------------------------------------------------------------------------------
/get_gene_position.pl:
--------------------------------------------------------------------------------
 1 | # Write "chr<TAB>ID<TAB>start<TAB>end" for every non-comment line of a
 2 | # tab-separated GFF file whose feature type (column 3) contains "gene".
 3 | #   perl get_gene_position.pl <gff> <outfile>
 4 | open IN,"$ARGV[0]" or die "$!";
 5 | open OUT,">$ARGV[1]" or die "$!";
 6 | while (my $record = <IN>) {
 7 |     chomp $record;
 8 |     next if $record =~ /^#/;
 9 |     my @cols = split(/\t/, $record);
10 |     # A stricter filter (numeric chromosome, protein_coding only) was once used here.
11 |     next unless $cols[2] =~ /gene/;
12 |     # The ID is taken from the last column (the GFF attributes).
13 |     my ($gene_id) = ($cols[-1] =~ /ID=([^;]+)/);
14 |     print OUT join("\t", $cols[0], $gene_id, @cols[3, 4]), "\n";
15 | }
16 | 
17 | close(IN);
18 | close(OUT);


--------------------------------------------------------------------------------
/get_gene_weizhi.pl:
--------------------------------------------------------------------------------
 1 | # Look up the genomic location of selected mRNAs in a GFF3 file.
 2 | #   -in1  ID list; the first whitespace-separated token of each non-"#" line is an mRNA ID
 3 | #   -in2  GFF3 file
 4 | #   -out  output: ID, chr, start, end, strand (tab-separated)
 5 | #   -h    print usage and exit
 6 | use Getopt::Long;
 7 | my %opts;
 8 | use Data::Dumper;
 9 | GetOptions (\%opts,"in1=s","in2=s","out=s","h");
10 | if (! defined($opts{in1}) ||! defined($opts{in2})||! defined($opts{out}) || defined($opts{h})){
11 |     USAGE();
12 | }
13 | # Three-argument open with lexical handles: file names are never parsed as modes.
14 | open (my $id_fh,  '<', $opts{in1}) || die "open $opts{in1} failed\n";
15 | open (my $gff_fh, '<', $opts{in2}) || die "open $opts{in2} failed\n";
16 | open (my $out_fh, '>', $opts{out}) || die "open $opts{out} failed\n";
17 | 
18 | my %gffs;    # wanted ID -> ID
19 | while (my $line = <$id_fh>) {
20 |     next if $line =~ /^#/;
21 |     # split ' ' skips leading whitespace and the trailing newline.
22 |     my ($id) = split ' ', $line;
23 |     # A blank line carries no ID; storing it would create an empty key.
24 |     next unless defined $id;
25 |     $gffs{$id} = $id;
26 | }
27 | 
28 | while (my $line = <$gff_fh>) {
29 |     chomp $line;
30 |     my @a = split /\t/, $line;
31 |     # Comment, blank and truncated lines lack the 9 GFF columns.
32 |     next if @a < 9;
33 |     # Use "gene" instead of "mRNA" here to report gene coordinates.
34 |     next unless $a[2] eq "mRNA";
35 | 
36 |     # Only trust the capture when the match succeeded.
37 |     my ($id1) = $a[8] =~ m/ID=([^;]*)/;
38 |     next unless defined $id1;
39 | 
40 |     if (exists $gffs{$id1}) {
41 |         print {$out_fh} "$gffs{$id1}\t$a[0]\t$a[3]\t$a[4]\t$a[6]\n";
42 |     }
43 | }
44 | # Buffered write errors only surface at close.
45 | close($out_fh) || die "close $opts{out} failed\n";
46 | close($id_fh);
47 | close($gff_fh);
48 | 
49 | # Print the command-line synopsis and stop.
50 | sub USAGE {
51 |        print "usage: perl $0 -in1  gene_id.txt -in2  genome.gff3  -out gene_location.txt ";
52 |     exit;
53 | }


--------------------------------------------------------------------------------
/get_gtf.pl:
--------------------------------------------------------------------------------
 1 | # Extract the records of selected transcripts from a GTF file.
 2 | #   -in1  ID list; the first whitespace-separated token of each line is a transcript ID
 3 | #   -in2  GTF file
 4 | #   -out  output: GTF columns 1-8 plus a transcript_id-only attribute column
 5 | #   -h    print usage and exit
 6 | use Getopt::Long;
 7 | my %opts;
 8 | use Data::Dumper;
 9 | GetOptions (\%opts,"in1=s","in2=s","out=s","h");
10 | if (! defined($opts{in1}) ||! defined($opts{in2})||! defined($opts{out}) || defined($opts{h})){
11 |     USAGE();
12 | }
13 | # Three-argument open with lexical handles: file names are never parsed as modes.
14 | open (my $id_fh,  '<', $opts{in1}) || die "open $opts{in1} failed\n";
15 | open (my $gtf_fh, '<', $opts{in2}) || die "open $opts{in2} failed\n";
16 | open (my $out_fh, '>', $opts{out}) || die "open $opts{out} failed\n";
17 | 
18 | my %gffs;    # wanted transcript ID -> transcript ID
19 | while (my $line = <$id_fh>) {
20 |     # split ' ' skips leading whitespace and the trailing newline.
21 |     my ($id) = split ' ', $line;
22 |     # A blank line carries no ID; storing it would create an empty key
23 |     # that every GTF line without a transcript_id could then match.
24 |     next unless defined $id;
25 |     $gffs{$id} = $id;
26 | }
27 | 
28 | while (my $line = <$gtf_fh>) {
29 |     chomp $line;
30 |     my @a = split /\t/, $line;
31 |     # Comment, blank and truncated lines lack the 9 GTF columns.
32 |     next if @a < 9;
33 | 
34 |     # Only trust the capture when the match succeeded.
35 |     my ($id1) = $a[8] =~ m/transcript_id "([^\"]*)/;
36 |     next unless defined $id1;
37 | 
38 |     if (exists $gffs{$id1}) {
39 |         print {$out_fh} "$a[0]\t$a[1]\t$a[2]\t$a[3]\t$a[4]\t$a[5]\t$a[6]\t$a[7]\ttranscript_id \"$gffs{$id1}\";\n";
40 |     }
41 | }
42 | 
43 | # Buffered write errors only surface at close.
44 | close($out_fh) || die "close $opts{out} failed\n";
45 | close($id_fh);
46 | close($gtf_fh);
47 | 
48 | # Print the command-line synopsis and stop.
49 | sub USAGE {
50 |        print "usage: perl test1.pl -in1  gene_id.txt  -in2  基因组gtf文件  -out 结果文件";
51 |     exit;
52 | }


--------------------------------------------------------------------------------
/get_promoter.pl:
--------------------------------------------------------------------------------
 1 | # Cut the upstream (promoter) sequence of each listed feature out of a genome.
 2 | #   perl get_promoter.pl <genome.fa> <weizhi.txt> <OUT>
 3 | # <weizhi.txt> is tab-separated: ID, chromosome, start, end, strand
 4 | # ("#" lines are skipped). For "+" features the window is the
 5 | # $PROMOTER_LENGTH bases ending just before start; for "-" features it is
 6 | # the $PROMOTER_LENGTH bases just after end, reverse-complemented. Features
 7 | # whose window would run off the sequence are reported and skipped, and
 8 | # rows with any other strand value are ignored.
 9 | # The whole genome is held in memory.
10 | die "perl $0 <genome.fa> <weizhi.txt> <OUT> " unless(@ARGV==3 );
11 | use Math::BigFloat;
12 | use Bio::SeqIO;
13 | use Bio::Seq;
14 | 
15 | my $PROMOTER_LENGTH = 1500;
16 | 
17 | my $in  = Bio::SeqIO->new(-file => "$ARGV[0]",  -format => 'Fasta');
18 | my $out = Bio::SeqIO->new(-file => ">$ARGV[2]", -format => 'Fasta');
19 | 
20 | # Index every reference sequence by its ID.
21 | my %ref = ();
22 | while ( my $seq = $in->next_seq() ) {
23 |     $ref{ $seq->id } = $seq;
24 | }
25 | $in->close();
26 | 
27 | open my $loc_fh, '<', $ARGV[1] or die "open $ARGV[1]: $!";
28 | while ( my $line = <$loc_fh> ) {
29 |     chomp $line;
30 |     next if $line =~ /^#/;
31 |     my ($gene, $chr, $gene_start, $gene_end, $strand) = split /\t/, $line;
32 |     # Blank or truncated rows cannot be placed on the genome.
33 |     next unless defined $strand;
34 | 
35 |     my $seq = $ref{$chr};
36 |     unless ( defined $seq ) {
37 |         print "chromosome $chr not in reference file\n";
38 |         next;
39 |     }
40 | 
41 |     if ( $strand eq "-" ) {
42 |         # Upstream of a minus-strand feature lies after its end coordinate.
43 |         my $start = $gene_end + 1;
44 |         my $end   = $gene_end + $PROMOTER_LENGTH;
45 |         if ( $end > $seq->length ) {
46 |             print "Note: $chr: upstream don't have enough sequence to cut for $gene and skiped\n";
47 |             next;
48 |         }
49 |         my $promoter = Bio::Seq->new(
50 |             -seq => $seq->subseq( $start, $end ),
51 |             -id  => "$gene",
52 |         );
53 |         $out->write_seq( $promoter->revcom() );
54 |     }
55 |     elsif ( $strand eq "+" ) {
56 |         my $start = $gene_start - $PROMOTER_LENGTH;
57 |         my $end   = $gene_start - 1;
58 |         # subseq coordinates are 1-based, so a start of 0 is already out of range.
59 |         if ( $start < 1 ) {
60 |             print "Note: $chr: upstream don't have enough sequence to cut for $gene and skiped\n";
61 |             next;
62 |         }
63 |         my $promoter = Bio::Seq->new(
64 |             -seq => $seq->subseq( $start, $end ),
65 |             -id  => "$gene",
66 |         );
67 |         $out->write_seq($promoter);
68 |     }
69 | }
70 | close($loc_fh);
71 | $out->close();


--------------------------------------------------------------------------------
/get_tandem_gene.pl:
--------------------------------------------------------------------------------
 1 | # Select, from a comma-separated gene-pair file, the pairs in which both
 2 | # genes belong to a gene family.
 3 | #   -id      family ID list; the first whitespace-separated field of each line is a gene ID
 4 | #   -tandem  pair file, "geneA,geneB" per line
 5 | #   -name    output prefix; the result is written to <od>/<name>.tandem
 6 | #   -od      output directory (default: current directory; created if missing)
 7 | #   -h       print usage and exit
 8 | use Data::Dumper;
 9 | use Getopt::Long;
10 | use strict;
11 | use Cwd qw(abs_path getcwd);
12 | my %opts;
13 | 
14 | # "h" has to be declared here, otherwise -h is rejected as an unknown option.
15 | GetOptions (\%opts,"id=s","tandem=s","od=s","name=s","h");
16 | 
17 | if (! defined($opts{id}) ||! defined($opts{tandem})||! defined($opts{name}) || defined($opts{h})){
18 |     USAGE();
19 | }
20 | 
21 | # Print the command-line synopsis and stop.
22 | sub USAGE {
23 |        print "perl $0  -id gene_family.id  -tandem gene.tandem  -name gene_famil -od ./\n";
24 |     exit;
25 | }
26 | 
27 | my $od = $opts{od};
28 | $od ||= getcwd;
29 | # Create the directory before resolving it, and fail loudly if that is impossible.
30 | unless (-d $od) {
31 |     mkdir $od or die "mkdir $od failed: $!\n";
32 | }
33 | $od = abs_path($od);
34 | 
35 | #### collect the family gene IDs
36 | my %hashG;
37 | open (my $id_fh, '<', $opts{id}) || die "open $opts{id} failed\n";
38 | while (my $line = <$id_fh>) {
39 |     chomp $line;
40 |     my ($gene) = split(/\s+/, $line);
41 |     # Blank lines carry no ID.
42 |     next unless defined $gene;
43 |     $hashG{$gene} = $gene;
44 | }
45 | close($id_fh);
46 | 
47 | #### keep the pairs whose two members are both family genes
48 | open (my $out_fh, '>', "$od/$opts{name}.tandem") || die "open $od/$opts{name}.tandem failed\n";
49 | open (my $pair_fh, '<', $opts{tandem}) || die "open $opts{tandem} failed\n";
50 | while (my $line = <$pair_fh>) {
51 |     chomp $line;
52 |     my ($Agene, $Bgene) = split(/,/, $line);
53 |     # Lines without two comma-separated names are not pairs.
54 |     next unless defined $Agene && defined $Bgene;
55 |     if (exists $hashG{$Agene} && exists $hashG{$Bgene}) {
56 |         print {$out_fh} $Agene."\t".$Bgene."\n";
57 |     }
58 | }
59 | close($pair_fh);
60 | # Buffered write errors only surface at close.
61 | close($out_fh) || die "close $od/$opts{name}.tandem failed\n";


--------------------------------------------------------------------------------
/link.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | # 基因组内共线性配置文件(某个基因家族或者自己想展示的基因对应关系)
 3 | 
 4 | chr01	230647852	230649724	chr05	23322345	23324725
 5 | chr01	249064852	249066345	chr05	20077643	20079139
 6 | chr02	43381422	43385332	chr05	172468991	172471289
 7 | chr04	239656815	239657190	chr05	73785793	73787640
 8 | 
 9 | 
10 | 
11 | 
12 | # 前三列 和 后三列 是一个对应关系  表示共线性基因对的位置信息
13 | 


--------------------------------------------------------------------------------
/mRNAid_to_geneid.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | # Build a transcript -> gene table from a GFF3 file.
 3 | #
 4 | # Usage: perl mRNAid_to_geneid.pl <gff> <outfile>
 5 | #
 6 | # Output (tab-separated): mRNA_ID, gene_ID, chr, start, end, strand, where
 7 | # the coordinates are those of the transcript line itself. A "gene" is any
 8 | # feature whose type starts with "gene"; a transcript is any feature whose
 9 | # type contains "mRNA" or "transcript" (case-insensitive). Only transcripts
10 | # whose Parent is a gene seen earlier in the file are written; the others
11 | # are counted and reported on STDERR instead of leaving a truncated,
12 | # newline-less row in the output.
13 | use strict;
14 | use Cwd qw(abs_path getcwd);
15 | use Getopt::Long;
16 | use Data::Dumper;
17 | 
18 | die "perl $0 <gff> <outfile>" unless(@ARGV==2);
19 | 
20 | my ($gff, $outfile) = @ARGV;
21 | my %gene    = ();    # gene IDs seen so far
22 | my $orphans = 0;     # transcripts skipped because their parent gene is unknown
23 | 
24 | # Three-argument open: the file name can never be parsed as a mode or a pipe.
25 | open my $in_fh,  '<', $gff     or die "open $gff: $!";
26 | open my $out_fh, '>', $outfile or die "open $outfile: $!";
27 | print {$out_fh} "#mRNA_ID\tgene_ID\tchr\tstart\tend\tstrand\n";
28 | 
29 | while (my $line = <$in_fh>) {
30 |     chomp $line;
31 |     next if $line =~ /^#/;
32 |     my @tmp = split /\t/, $line;
33 |     # Blank or truncated lines have no type/attribute columns to inspect.
34 |     next if @tmp < 9;
35 | 
36 |     if ($tmp[2] =~ /^gene/) {
37 |         my ($id) = ($tmp[8] =~ /ID=([^;]+)/);
38 |         $gene{$id} = 1 if defined $id;
39 |     }
40 |     if ($tmp[2] =~ /mRNA|transcript/i) {
41 |         my ($id)  = ($tmp[8] =~ /ID=([^;]+)/);
42 |         my ($pid) = ($tmp[8] =~ /Parent=([^;]+)/);
43 | 
44 |         # Emit the row in one piece so a rejected transcript can never
45 |         # leave a partial line that fuses with the next record.
46 |         if (defined $id and defined $pid and exists $gene{$pid}) {
47 |             print {$out_fh} join("\t", $id, $pid, @tmp[0, 3, 4, 6]), "\n";
48 |         }
49 |         else {
50 |             $orphans++;
51 |         }
52 |     }
53 | }
54 | 
55 | close($in_fh);
56 | # Buffered write errors only surface at close.
57 | close($out_fh) or die "close $outfile: $!";
58 | 
59 | warn "skipped $orphans transcript record(s) whose Parent is not a known gene\n"
60 |     if $orphans;


--------------------------------------------------------------------------------
/mcscan_seqid_染色体编号配置文件:
--------------------------------------------------------------------------------
1 | 1,5,3,2,4         # 一个物种的 染色体编号 --- 按染色体长度排序 (可选)
2 | A09,A03,A01,A02,A06,A05,A07,A08,A04,A10    # 另一个物种的 染色体编号
3 | 


--------------------------------------------------------------------------------
/mcscan_图层配置文件:
--------------------------------------------------------------------------------
1 | # y, xstart, xend, rotation, color, label, va,  bed
2 | .8,     .2,    .75,       0,      red, ATH, top, ATH.bed(要在当前路径或者绝对路径)
3 | .2,     .2,    .75,       0,      green, rapa, bottom, rapa.bed(要在当前路径或者绝对路径)
4 | # edges
5 | e, 0, 1, ATH.rapa.anchors.simple.c(绘图文件，经过上一步分析可以得到)
6 | 


--------------------------------------------------------------------------------
/mcscanx物种内共线性分析.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | # mcscanx 物种内基因家族共线性分析
 3 | 
 4 | 所需文件
 5 | 
 6 | cds 文件 
 7 | gff 文件
 8 | pep 文件
 9 | 
10 | 基因家族 ID 信息
11 | 
12 | #www.omicsclass.com
13 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/arabidopsis_thaliana/cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz
14 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-39/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.39.gff3.gz
15 | #wget ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/arabidopsis_thaliana/pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz
16 | #gunzip *gz
17 | 
18 | #mkdir mcscan
19 | #perl get_gene_position.pl Arabidopsis_thaliana.TAIR10.39.gff3 AT.gff
20 | #sed -i 's#gene:##' AT.gff
21 | #perl get_fa_by_id.pl AT.gff Arabidopsis_thaliana.TAIR10.pep.all.fa pep.fa
22 | #sed -i 's#\.1##' pep.fa
23 | makeblastdb -in pep.fa  -dbtype prot -title pep.fa
24 | blastall -i pep.fa -d pep.fa -e 1e-10  -p blastp  -b 5 -v 5 -m 8 -o mcscan/AT.blast
25 | cp AT.gff mcscan/AT.gff
26 | 
27 | /biosoft/MCScanX/MCScanX/MCScanX mcscan/AT
28 | 
29 | # 编辑物种染色体信息
30 | wget http://chibba.pgml.uga.edu/mcscan2/examples/family.ctl
31 | 
32 | 
33 | # 基因家族 geneID 文件
34 | wget http://chibba.pgml.uga.edu/mcscan2/examples/MADS_box_family.txt
35 | 
36 | sed -i 's#at##g' family.ctl
37 | 
38 | # 绘制图形
39 | 
40 | cd /biosoft/MCScanX/MCScanX/downstream_analyses 
41 | java family_circle_plotter -g /home/manager/share/mcscan/mcscan/AT.gff -s /home/manager/share/mcscan/mcscan/AT.collinearity -c /home/manager/share/mcscan/family.ctl -f /home/manager/share/mcscan/MADS_box_family.txt -o /home/manager/share/mcscan/mcscan/MADS.circle.PNG
42 | 
43 | 
44 | # 分析基因家族的串联重复 以及 基因组内所有串联重复信息
45 | cd /biosoft/MCScanX/MCScanX/downstream_analyses
46 | perl detect_collinearity_within_gene_families.pl -i /home/manager/share/mcscan/MADS_box_family.txt -d /home/manager/share/mcscan/mcscan/AT.collinearity -o /home/manager/share/mcscan/mcscan/MADS.collinear.pairs
47 | 
48 | # 由于绘制的圈图不是很美观，因此我们会对分析结果利用 circos 进行图形的绘制
49 | 


--------------------------------------------------------------------------------
/select_redundant_mRNA.pl:
--------------------------------------------------------------------------------
 1 | # Group transcript IDs by gene so one representative per gene can be chosen.
 2 | #   perl select_redundant_mRNA.pl <mRNA2genefile> <idlist> <outfile>
 3 | # <mRNA2genefile>: tab-separated, column 1 = transcript ID, column 2 = gene ID
 4 | # <idlist>:        first whitespace-separated field of each line = transcript ID
 5 | # <outfile>:       one line per gene (sorted by gene ID): gene ID followed by
 6 | #                  its listed transcript IDs, sorted
 7 | # "#" lines are skipped in both inputs. IDs missing from the mapping are
 8 | # reported on STDERR; they are still written, under an empty gene ID.
 9 | die "perl $0 <mRNA2genefile> <idlist> <outfile>" unless(@ARGV==3);
10 | 
11 | # Three-argument open: file names can never be parsed as a mode or a pipe.
12 | open my $map_fh, '<', $ARGV[0] or die "$!";
13 | open my $out_fh, '>', $ARGV[2] or die "$!";
14 | print {$out_fh} "#geneID\tmRNAID\n";
15 | 
16 | my %mRNA2gene;    # transcript ID -> gene ID
17 | while (my $line = <$map_fh>) {
18 |     chomp $line;
19 |     next if $line =~ /^#/;
20 |     my @tmp = split(/\t/, $line);
21 |     next unless defined $tmp[0];    # blank line
22 |     $mRNA2gene{$tmp[0]} = $tmp[1];
23 | }
24 | close($map_fh);
25 | 
26 | open my $id_fh, '<', $ARGV[1] or die "$!";
27 | my %uniqGene;     # gene ID -> { transcript ID => 1 }
28 | while (my $line = <$id_fh>) {
29 |     chomp $line;
30 |     next if $line =~ /^#/;
31 |     my ($mrna) = split(/\s+/, $line);
32 |     next unless defined $mrna;      # blank line
33 | 
34 |     my $gene = $mRNA2gene{$mrna};
35 |     unless (defined $gene) {
36 |         # Make the silent "empty gene ID" group visible to the user.
37 |         warn "no gene found for transcript $mrna in $ARGV[0]\n";
38 |         $gene = '';
39 |     }
40 |     $uniqGene{$gene}{$mrna} = 1;
41 | }
42 | close($id_fh);
43 | 
44 | # Sorted keys make the output reproducible from run to run.
45 | for my $geneID (sort keys %uniqGene) {
46 |     my @transIDs = sort { $a cmp $b } keys %{ $uniqGene{$geneID} };
47 |     print {$out_fh} "$geneID\t" . join("\t", @transIDs) . "\n";
48 | }
49 | 
50 | # Buffered write errors only surface at close.
51 | close($out_fh) or die "$!";


--------------------------------------------------------------------------------
/stat_protein_fa.pl:
--------------------------------------------------------------------------------
 1 | # Origin: Biomics (Beijing) -- email: huangls@biomics.com.cn
 2 | #
 3 | # Tabulate ID, length, molecular weight and isoelectric point for every
 4 | # sequence of a FASTA file.
 5 | #   perl stat_protein_fa.pl <in> <out>
 6 | 
 7 | die "perl $0 <in>  <out>" unless(@ARGV==2);
 8 | use Bio::SeqIO;
 9 | use Bio::Seq;
10 | use Bio::Tools::SeqStats;
11 | use Bio::Tools::pICalculator;
12 | use Data::Dumper;
13 | 
14 | my ($fasta_path, $table_path) = @ARGV;
15 | 
16 | # Sequence reader
17 | my $reader = Bio::SeqIO->new(-file => "$fasta_path", -format => 'Fasta');
18 | 
19 | open OUT,">$table_path" or die "$!";
20 | print OUT "#ID\tlength\tMW(Da)\tpI\n";
21 | 
22 | # pI calculator: EMBOSS pK set, 2 decimal places
23 | my $pi_calc = Bio::Tools::pICalculator->new(-places => 2,-pKset => 'EMBOSS');
24 | 
25 | # One output row per sequence
26 | while ( my $protein = $reader->next_seq() ) {
27 |     # get_mol_wt returns an array ref; its first element is reported
28 |     # (BioPerl documents it as the lower bound of the weight range).
29 |     my $mw_range = Bio::Tools::SeqStats->get_mol_wt($protein);
30 |     $pi_calc->seq($protein);
31 |     my $pi = $pi_calc->iep;
32 |     print OUT join("\t", $protein->id, $protein->length, "$mw_range->[0]", $pi), "\n";
33 | }
34 | $reader->close();
35 | close(OUT);


--------------------------------------------------------------------------------
/text.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | # 配置基因显示名称
 3 | chr01	230647852	230649724	Zm00008a004052
 4 | chr05	23322345	23324725	Zm00008a019932
 5 | chr01	249064852	249066345	Zm00008a004460
 6 | chr05	20077643	20079139	Zm00008a019854
 7 | chr02	43381422	43385332	Zm00008a007262
 8 | chr05	172468991	172471289	Zm00008a022078
 9 | chr04	239656815	239657190	Zm00008a018843
10 | chr05	73785793	73787640	Zm00008a020843
11 | 


--------------------------------------------------------------------------------
/upload_code:
--------------------------------------------------------------------------------
1 | git init
2 | git add README.md
3 | git commit -m "first commit"
4 | git remote add origin  git@github.com:chensole/bioinformatics.git
5 | git push -u origin master
6 | 


--------------------------------------------------------------------------------
/vimrc:
--------------------------------------------------------------------------------
  1 | " Color scheme
  2 | 
  3 | colorscheme molokai
  4 | set t_Co=256
  5 | set background=dark
  6 | 
  7 | " Vundle setup. Vundle is a Vim plugin manager; it makes managing plugins convenient.
  8 | 
  9 | set nocompatible              " be iMproved, required
 10 | filetype off                  " required
 11 | 
 12 | 
 13 | " set the runtime path to include Vundle and initialize
 14 | set rtp+=~/.vim/bundle/Vundle.vim
 15 | call vundle#begin()
 16 | " alternatively, pass a path where Vundle should install plugins
 17 | "call vundle#begin('~/some/path/here')
 18 | 
 19 | " let Vundle manage Vundle, required
 20 | " Plugin 'VundleVim/Vundle.vim'
 21 | " The following are examples of different formats supported.
 22 | Plugin 'https://github.com/scrooloose/nerdtree.git'
 23 | Plugin 'Valloric/YouCompleteMe' "{
 24 | " Path of the default YCM extra-conf file
 25 | let g:ycm_global_ycm_extra_conf='~/.vim/bundle/YouCompleteMe/third_party/ycmd/cpp/ycm/.ycm_extra_conf.py'
 26 | 
 27 | " Complete language keywords taken from the syntax definitions
 28 | let g:ycm_seed_identifiers_with_syntax = 1  
 29 | let g:ycm_add_preview_to_completeopt = 0
 30 | let g:ycm_show_diagnostics_ui = 0
 31 | let g:ycm_server_log_level = 'info'
 32 | let g:ycm_min_num_identifier_candidate_chars = 2
 33 | let g:ycm_collect_identifiers_from_comments_and_strings = 1
 34 | 
 35 | " Enable completion inside strings
 36 | let g:ycm_complete_in_strings=1
 37 | let g:ycm_key_invoke_completion = '<c-z>'
 38 | set completeopt=menu,preview
 39 | 
 40 | " Close the preview window automatically after a completion
 41 | let g:ycm_autoclose_preview_window_after_completion = 1
 42 | noremap <c-z> <NOP>
 43 | 
 44 | " <CR> accepts the selected item while the popup menu is visible
 45 | inoremap <expr> <CR> pumvisible() ? "\<C-y>" : "\<CR>" 
 46 | 
 47 | " Semantic completion triggers. NOTE(review): this first table is replaced wholesale by the second assignment to g:ycm_semantic_triggers right after it.
 48 | let g:ycm_semantic_triggers =  {
 49 |   \   'c' : ['->', '.'],
 50 |   \   'objc' : ['->', '.', 're!\[[_a-zA-Z]+\w*\s', 're!^\s*[^\W\d]\w*\s',
 51 |   \             're!\[.*\]\s'],
 52 |   \   'ocaml' : ['.', '#'],
 53 |   \   'cpp,objcpp' : ['->', '.', '::'],
 54 |   \   'perl' : ['->'],
 55 |   \   'php' : ['->', '::'],
 56 |   \   'cs,java,javascript,typescript,d,python,perl6,scala,vb,elixir,go' : ['.'],
 57 |   \   'ruby' : ['.', '::'],
 58 |   \   'lua' : ['.', ':'],
 59 |   \   'erlang' : [':'],
 60 |   \ }
 61 | let g:ycm_semantic_triggers =  {
 62 | 			\ 'c,cpp,python,java,go,erlang,perl': ['re!\w{2}'],
 63 | 			\ 'cs,lua,javascript': ['re!\w{2}'],
 64 | 			\ }
 65 | "}
 66 | " Keep Plugin commands between vundle#begin/end.
 67 | " plugin on GitHub repo
 68 | " Plugin 'tpope/vim-fugitive'
 69 | " plugin from http://vim-scripts.org/vim/scripts.html
 70 | " Plugin 'L9'
 71 | " Git plugin not hosted on GitHub
 72 | " Plugin 'git://git.wincent.com/command-t.git'
 73 | " git repos on your local machine (i.e. when working on your own plugin)
 74 | " Plugin 'file:///home/gmarik/path/to/plugin'
 75 | " The sparkup vim script is in a subdirectory of this repo called vim.
 76 | " Pass the path to set the runtimepath properly.
 77 | " Plugin 'rstacruz/sparkup', {'rtp': 'vim/'}
 78 | " Install L9 and avoid a Naming conflict if you've already installed a
 79 | " different version somewhere else.
 80 | " Plugin 'ascenator/L9', {'name': 'newL9'}
 81 | 
 82 | " All of your Plugins must be added before the following line
 83 | call vundle#end()            " required
 84 | filetype plugin indent on    " required
 85 | " To ignore plugin indent changes, instead use:
 86 | "filetype plugin on
 87 | "
 88 | " Brief help
 89 | " :PluginList       - lists configured plugins
 90 | " :PluginInstall    - installs plugins; append `!` to update or just :PluginUpdate
 91 | " :PluginSearch foo - searches for foo; append `!` to refresh local cache
 92 | " :PluginClean      - confirms removal of unused plugins; append `!` to auto-approve removal
 93 | "
 94 | " see :h vundle for more details or wiki for FAQ
 95 | " Put your non-Plugin stuff after this line
 96 | 
 97 | " Mouse support in Vim (currently disabled)
 98 | "set mouse=a
 99 | 
100 | set wildmenu
101 | 
102 | " Show line numbers
103 | set number
104 | 
105 | " Carry the current line's indent over to a new line
106 | set autoindent
107 | 
108 | " Smart indenting based on code syntax and style
109 | set smartindent
110 | 
111 | " Width of a tab character
112 | set tabstop=4
113 | 
114 | set cursorline
115 | 
116 | " Show the matching bracket (start/end position) while editing
117 | set showmatch
118 | 
119 | " Highlight search matches in the file
120 | set hlsearch
121 | 
122 | set encoding=utf-8
123 | set nocompatible
124 | syntax on
125 | " Python-only settings: 4-column indent with spaces, 79-column text width, fold by indent
126 | let python_highlight_all=1
127 | au Filetype python set tabstop=4
128 | au Filetype python set softtabstop=4
129 | au Filetype python set shiftwidth=4
130 | au Filetype python set textwidth=79
131 | au Filetype python set expandtab
132 | au Filetype python set autoindent
133 | au Filetype python set fileformat=unix
134 | autocmd Filetype python set foldmethod=indent
135 | autocmd Filetype python set foldlevel=99
136 | " <F5>: save the buffer, then build and/or run it with a tool picked by &filetype.
137 | " C files are built with gcc (g++ would compile them as C++); C++ files with g++.
138 | map <F5> :call CompileRunGcc()<CR>
139 | func! CompileRunGcc()
140 |         exec "w"
141 |         if &filetype == 'c'
142 |                 exec "!gcc % -o %<"
143 |                 exec "!time ./%<"
144 |         elseif &filetype == 'cpp'
145 |                 exec "!g++ % -o %<"
146 |                 exec "!time ./%<"
147 |         elseif &filetype == 'java'
148 |                 exec "!javac %"
149 |                 exec "!time java %<"
150 |         elseif &filetype == 'sh'
151 |                 :!time bash %
152 |         elseif &filetype == 'python'
153 |                 exec "!clear"
154 |                 exec "!time python3 %"
155 |         elseif &filetype == 'html'
156 |                 exec "!firefox % &"
157 |         elseif &filetype == 'go'
158 |                 " exec "!go build %<"
159 |                 exec "!time go run %"
160 |         elseif &filetype == 'mkd'
161 |                 exec "!~/.vim/markdown.pl % > %.html &"
162 |                 exec "!firefox %.html &"
163 |         endif
164 | endfunc
165 | " Auto-pairing: typing an opening bracket or quote also inserts its closing partner
166 | inoremap < <><ESC>i
167 | inoremap > <c-r>=ClosePair('>')<CR>
168 | inoremap ( ()<ESC>i
169 | inoremap ) <c-r>=ClosePair(')')<CR>
170 | inoremap { {}<ESC>i
171 | inoremap } <c-r>=ClosePair('}')<CR>
172 | inoremap [ []<ESC>i
173 | inoremap ] <c-r>=ClosePair(']')<CR>
174 | inoremap " ""<ESC>i
175 | inoremap ' ''<ESC>i
176 | " ClosePair(char): text to insert when a closing character is typed.
177 | " Returns "\<Right>" when the character at the cursor already is a:char, otherwise a:char.
178 | function! ClosePair(char)
179 |     let l:at_cursor = getline('.')[col('.') - 1]
180 |     return l:at_cursor == a:char ? "\<Right>" : a:char
181 | endfunction
182 | 
183 | 
184 | " NERDTree config
185 | " F4 toggles the directory tree. Kept on its own line: text after a :map {rhs} becomes part of the mapping.
186 | map <F4> :NERDTreeToggle<CR>
187 | " Quit Vim automatically when the directory-tree window is the last window left.
188 | autocmd bufenter * if (winnr("$") == 1 && exists("b:NERDTreeType") &&b:NERDTreeType == "primary") | q | endif
189 | """"""""""""""""" New-file header """""""""""""""""""""""""""
190 | " When a new *.pl file is created, insert a standard file header automatically.
191 | autocmd BufNewFile *.pl exec ":call SetPerlTitle()"
192 | 
193 | " SetPerlTitle(): writes the header relative to the cursor line (presumably line 1 in a
194 | " new buffer). func! allows re-sourcing; the shebang needs the absolute /usr/bin/perl path.
195 | func! SetPerlTitle()
196 | 	call setline(1,"#!/usr/bin/perl -w")
197 | 	call append( line("."),"use strict;")
198 | 	call append(line(".")+1," ")
199 | 	call append(line(".")+2, "\# File Name: ".expand("%"))
200 | 	call append(line(".")+3, "\# Author: chensole")
201 | 	call append(line(".")+4, "\# mail: 1278371386@qq.com")
202 | 	call append(line(".")+5, "\# Created Time: ".strftime("%Y-%m-%d",localtime()))
203 | endfunc
204 | 
205 | 
206 | " Keyboard shortcuts
207 | 
208 | " Ctrl+A: select the whole buffer and yank it
209 | 
210 |  map <C-A> ggVGY
211 | 
212 |  map! <C-A> <Esc>ggVGY
213 | 
214 |  map <F12> gg=G
215 | 
216 |  " Ctrl+C in visual mode: yank the selection into the "+ register
217 | 
218 |  vmap <C-c> "+y
219 | 
220 |  " F2: delete empty and whitespace-only lines
221 | 
222 |  nnoremap <F2> :g/^\s*$/d<CR>
223 | 
224 | 
225 | 
226 | 
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 | 


--------------------------------------------------------------------------------
/下载数据:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | ##1. 从NCBI上面下载nr数据库，不要用wget，相当慢
  4 | 
  5 | 
  6 | 使用 aspera 下载NCBI FTP 站点数据会非常快
  7 | 
  8 | ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /blast/db/FASTA/nr.gz ./
  9 | 
 10 | ###
 11 | Aspera的用法： $ ascp [参数] 目标文件 目的地址
 12 | Aspera的常用参数：
 13 | -T            不进行加密。若不添加此参数，可能会下载不了。
 14 | -i             string 输入私钥，安装 aspera 后有在目录 ~/.aspera/connect/etc/ 下有几个私钥，使用 linux 服务器的时候一般使用 asperaweb_id_dsa.openssh 文件作为私钥。
 15 | --host      string ftp的host名，NCBI的为ftp-private.ncbi.nlm.nih.gov；EBI的为fasp.sra.ebi.ac.uk。
 16 | --user      string 用户名，NCBI的为anonftp，EBI的为era-fasp。
 17 | --mode    string 选择模式，上传为 send，下载为 recv。
 18 | -l             string 设置最大传输速率，比如设置为 200M 则表示最大传输速率为 200 Mbps（兆比特每秒）。若不设置该参数，则一般只能达到约 10 Mbps 的速度；设置之后，传输速度可以更高。
 19 | 
 20 | ##2. 从NCBI上面下载分类数据库文件
 21 | 
 22 | ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /pub/taxonomy/taxdump.tar.gz ./
 23 | ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /pub/taxonomy/accession2taxid/prot.accession2taxid.gz ./
 24 | 
 25 | 
 26 | 其中taxdump.tar.gz中主要有两个文件很有用
 27 | 
 28 | names.dmp 记录物种名及其分类编号
 29 | nodes.dmp 记录分类编号的节点信息
 30 | 
 31 | 例如 植物大类的编号为 3193
 32 | 
 33 | 
 34 | 
 35 | 
 36 | 
 37 | ### NCBI gene 数据库
 38 | 
 39 | https://www.ncbi.nlm.nih.gov/gene/?term=
 40 | 
 41 | 这个数据库中存储了所有物种的gene 信息
 42 | 
 43 | 从它所在的FTP站点，可以下载有用的文件
 44 | 
 45 | ftp://ftp.ncbi.nih.gov/gene/DATA/
 46 | 
 47 | 
 48 | 常见的有
 49 | 	
 50 | 	gene2accession.gz  geneID(Entrez ID)与accession(收集的序列大多来自 Swiss-Prot、RefSeq等)间的对应关系,这个文件是一个非常详细的数据文件
 51 | 
 52 | 
 53 | 	gene2go.gz geneID与Go间的对应关系
 54 | 
 55 | 
 56 | 
 57 | ### NCBI Taxonomy 数据库
 58 | 
 59 | NCBI 中存储了大量的物种，Taxonomy 数据库专门存储每个物种在 NCBI 中的 ID 号（taxid）
 60 | 
 61 | 
 62 | 
 63 | 
 64 | 
 65 | ### 批量下载 SRA 数据  
 66 | 
 67 | 
 68 | 	1. 把要下载的数据 SRR号写入一个文件，如 srr.txt,每行是一个 SRR id
 69 | 	2. 利用 SRA toolkit 的 prefetch 下载，并指定下载方式为 ascp 
 70 | 
 71 | prefetch -t ascp --ascp-path "/home/chenzhi/.aspera/connect/bin/ascp|/home/chenzhi/.aspera/connect/etc/asperaweb_id_dsa.openssh" --option-file srr.txt -O .
 72 | 
 73 | 
 74 | 
 75 | 
 76 | ## nr 数据库注释  
 77 | 
 78 | 
 79 | 	整个nr数据库非常大，130多个G，如果想要进行物种注释会消耗大量的时间。这里我推荐使用 diamond 这个工具，可以只针对某一类物种进行比对，如植物类的 taxonomy ID 是 3193。
 80 | 
 81 | 
 82 | 
 83 | 
 84 | 	1.利用diamond 建库，需要额外提供两个文件（可从NCBI上下载）
 85 | 
 86 | 		prot.accession2taxid (蛋白accession登录号与taxid的对应关系)
 87 | 
 88 | 		taxdump文件夹下的nodes.dmp文件
 89 | 
 90 | 
 91 | 	nohup ~/biosoft/diamond makedb -p 30 --taxonmap ../prot.accession2taxid --taxonnodes ../taxdump/nodes.dmp --in nr --db nr_tax &
 92 | 
 93 | 	
 94 | 利用上面这条命令就可以构建索引，且每个索引都有对应的 taxid 号
 95 | 
 96 | 
 97 | 
 98 | 	2. 比对
 99 | 
100 | 	
101 | 	可以指定 --taxonlist 参数值，即只与 nr 索引中某一类生物的序列进行比对
102 | 
103 | nohup ~/biosoft/diamond blastp -p 30 -q /date/cjt/ref/CM3.6.1_pep.fasta --db nr_index/nr_tax.dmnd --taxonlist 3193 -f 6 --max-hsps 1 --max-target-seqs 1 -o cme_blast@nr.m6 &
104 | 
105 | 
106 | 	上面是将甜瓜蛋白序列与nr数据库中所有的植物序列比对，比对后即可获得甜瓜蛋白序列与nr数据库相似的蛋白的 accession 登录号
107 | 
108 | 
109 | 	3. 将比对结果结合 gene2accession（NCBI下载） 文件，可得到甜瓜蛋白序列对应 gene symbol,同时可以综合 eggnog-mapper-1.0.3的比对结果，进行总体注释
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 


--------------------------------------------------------------------------------
/共线性图.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chensole/bioinformatics/e1e98c053f5b3d22afd059cccdfe00e5dee24cbb/共线性图.pdf


--------------------------------------------------------------------------------
/添加基因家族物种共线性关系配置:
--------------------------------------------------------------------------------
 1 | # 在物种共线性区块图形中显示自己研究的基因家族的共线性关系
 2 | 
 3 | 在 .simple 文件末尾追加基因家族的物种间共线性基因对，格式如下：
 4 | 
 5 | 
 6 | 
 7 | # 每一行的开头加颜色值，第五列为自定义数值，第六列为自定义的 + 或 -；其中每个基因 ID 重复写一次（各占两列），这样会显示成一条线
 8 | 
 9 | #FF0000*AT1G69310       AT1G69310       Bra038313       Bra038313       10      +
10 | #FF0000*AT1G69810       AT1G69810       Bra007884       Bra007884       10      +
11 | #FF0000*AT1G80840       AT1G80840       Bra008435       Bra008435       10      +
12 | #FF0000*AT1G55600       AT1G55600       Bra038006       Bra038006       10      +
13 | #FF0000*AT1G13960       AT1G13960       Bra019697       Bra019697       10      +
14 | #FF0000*AT1G69310       AT1G69310       Bra004002       Bra004002       10      +
15 | #FF0000*AT1G29860       AT1G29860       Bra030178       Bra030178       10      +
16 | #FF0000*AT1G68150       AT1G68150       Bra004285       Bra004285       10      +
17 | #FF0000*AT1G69310       AT1G69310       Bra004370       Bra004370       10      +
18 | #FF0000*AT1G80840       AT1G80840       Bra035148       Bra035148       10      +
19 | #FF0000*AT1G80840       AT1G80840       Bra003588       Bra003588       10      +
20 | #FF0000*AT1G18860       AT1G18860       Bra016535       Bra016535       10      +
21 | 
22 | 


--------------------------------------------------------------------------------
/物种间基因家族查找.pl:
--------------------------------------------------------------------------------
 1 | # Print the rows of a collinearity (.anchors) file whose first gene ID occurs in
 2 | # gene-family list 1 and whose second gene ID occurs in gene-family list 2.
 3 | #
 4 | # Usage: perl 物种间基因家族查找.pl <family_ids_1> <family_ids_2> <anchors_file> > out
 5 | use strict;
 6 | use warnings;
 7 | use List::Util qw(any);
 8 | 
 9 | die "Usage: perl $0 <family_ids_1> <family_ids_2> <anchors_file>\n"
10 | 	unless @ARGV >= 3;
11 | my ($f1, $f2, $f3) = @ARGV;
12 | 
13 | # Return the chomped lines of a gene-ID list file; dies if it cannot be opened.
14 | sub read_lines {
15 | 	my ($path) = @_;
16 | 	open my $fh, '<', $path or die "open $path: $!";
17 | 	chomp(my @lines = <$fh>);
18 | 	close $fh;
19 | 	return @lines;
20 | }
21 | 
22 | my @a1 = read_lines($f1);
23 | my @a2 = read_lines($f2);
24 | 
25 | open my $anchors, '<', $f3 or die "open $f3: $!";
26 | 
27 | while (defined(my $line = <$anchors>)) {
28 | 	chomp $line;
29 | 
30 | 	# Rows starting with '#' are not gene pairs.
31 | 	next if $line =~ /^#/;
32 | 
33 | 	# split ' ' also drops leading whitespace, so $id1 is never an empty field.
34 | 	my ($id1, $id2) = split ' ', $line;
35 | 
36 | 	# A blank or one-column row has no gene pair to test.
37 | 	next unless defined $id2;
38 | 
39 | 	# index() is a literal substring test: unlike /$id/, characters such as '.', '|'
40 | 	# or '(' in an ID are not regex syntax, and no empty pattern can ever be built.
41 | 	if ((any { index($_, $id1) >= 0 } @a1) and (any { index($_, $id2) >= 0 } @a2)) {
42 | 		print $line."\n";
43 | 	}
44 | }
45 | 
46 | close $anchors;


--------------------------------------------------------------------------------
/物种间基因家族比较分析流程.txt:
--------------------------------------------------------------------------------
 1 | # ----------------------------- 物种间 mcscanx 基因共线性分析 (基于 python 版本的 mcscanx 软件)
 2 | 
 3 | 软件下载地址 ： https://github.com/tanghaibao/jcvi
 4 | 
 5 | 所需文件
 6 | 
 7 | 	gene 位置信息：从 gff 文件中获取
 8 | 		
 9 | 	CDS.fasta 文件
10 | 
11 | 	基因 CDS文件 只选择一个转录本作为该基因的代表序列 (否则软件会报错) 从CDS文件中进行挑选
12 | 
13 | 
14 | 
15 | #得到基因组上所有基因的位置信息，bed文件；以及cds序列;这里的两个脚本不在script下，需要自行拷贝；
16 | perl get_gene_bed.pl -in1 Arabidopsis_thaliana.TAIR10.41.gff3 -out ATH.bed
17 | perl get_fa_by_id_from_bed.pl ATH.bed Arabidopsis_thaliana.TAIR10.cds.all.fa ATH.cds
18 | 
19 | ##统一成基因ID（加 -i 才会直接修改 ATH.cds，否则结果只输出到屏幕）
20 | sed -i 's#\.1##' ATH.cds
21 | 
22 | 
23 | #同样的道理，准备白菜基因组的 bed 文件和 cds 文件；
24 | perl get_gene_bed.pl -in1 Brassica_rapa.Brapa_1.0.41.chr.gff3 -out rapa.bed
25 | perl get_fa_by_id_from_bed.pl rapa.bed Brassica_rapa.Brapa_1.0.cds.all.fa rapa.cds
26 | 
27 | 
28 | /biosoft/miniconda/miniconda2/bin/python -m jcvi.compara.catalog ortholog ATH rapa --cscore=0.7      # 最重要的是 .anchors 文件(含有两个物种中所有共线性基因对的关系) 和 .simple 文件 (绘图所需文件)
29 | 
30 | #对共线性区域进行过滤
31 | /biosoft/miniconda/miniconda2/bin/python -m jcvi.compara.synteny screen --minsize=0 --minspan=30 --simple ATH.rapa.anchors   ATH.rapa.anchors.new
32 | #绘制共线性图片：准备两个配置文件为输入文件：
33 | 
34 | /biosoft/miniconda/miniconda2/bin/python -m jcvi.graphics.karyotype  --format=pdf  --figsize=15x5 mcscan_seqid mcscan_layout
35 | 
36 | 
37 | 
38 | 
39 | # 分别鉴定出两个物种中某一基因家族的 geneID 然后从 .anchors 文件中 挑选出 对应的具有共线性的基因，设置要展示的颜色，追加到 .simple 文件末尾 ----------- 绘图即可
40 | 
41 | 1. 使用  物种间基因家族查找.pl  基于 .anchors 文件和两个物种的基因家族 geneID list 文件，鉴定这两个物种基因家族之间具有共线性关系的 geneID
42 | 
43 | 
44 | perl  物种间基因家族查找.pl ../cmo_at/cmo_ERF.txt zm_ERF.txt Cm.zm.anchors >simple
45 | 
46 | 
47 | 
48 | 2. 使用  生成基因家族配置simple文件.pl  将上面得到的 simple文件生成特定的绘图格式
49 | 
50 | perl   生成基因家族配置simple文件.pl  simple > simple1.txt
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 


--------------------------------------------------------------------------------
/生成基因家族配置simple文件.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | use strict;
 3 | 
 4 | # Turn "<gene1> <gene2> <value>" rows into highlighted .simple rows: a colour
 5 | # prefix, each gene ID written twice, the third column, and a '+' strand.
 6 | #
 7 | # Usage: perl 生成基因家族配置simple文件.pl <gene_pair_file> > simple1.txt
 8 | 
 9 | my $file = shift;
10 | die "Usage: perl $0 <gene_pair_file>\n" unless defined $file;
11 | 
12 | open my $in, '<', $file or die "open $file: $!";
13 | 
14 | while (defined(my $line = <$in>)) {
15 | 	chomp $line;
16 | 
17 | 	# split ' ' ignores leading whitespace, so a blank row yields no fields at all.
18 | 	my ($gene1, $gene2, $value) = split ' ', $line;
19 | 	next unless defined $gene1;
20 | 
21 | 	# A row with fewer than three columns would print a malformed .simple row.
22 | 	unless (defined $value) {
23 | 		warn "line $.: expected 3 columns, skipped: $line\n";
24 | 		next;
25 | 	}
26 | 
27 | 	print "#FF0000*$gene1\t$gene1\t$gene2\t$gene2\t$value\t+\n";
28 | }
29 | 
30 | close $in;


--------------------------------------------------------------------------------