├── BioScripts ├── PerlScript │ ├── ColExtracter.pl │ ├── ManagerAndWorker │ │ ├── ReadMe.md │ │ ├── command.list.txt │ │ ├── manager.pl │ │ └── worker.pl │ ├── bowtieOut2RegionCounts.pl │ ├── checkMIRstructure.pl │ ├── downloadSRA.pl │ ├── generateCommandFromSRATabTable.pl │ ├── mergeRegionForQZ.pl │ ├── regionGrouper.pl │ └── tableTomerge.txt ├── PythonScript │ ├── Fasta_stat.py │ ├── GC_window_scan.py │ ├── Predict_TF_family.py │ ├── Predict_miRNA_from_All_Plant_Genome.py │ ├── extract_fasta.py │ ├── fasta_ID_simplifier.py │ ├── filter_BlastP_report_ID.py │ └── filter_fasta_by_id.py ├── Rscript │ ├── Boxplot.violinPlot.Point.with.Ttest.for.TwoSet.Comparison.Visulization.r │ ├── DEseq2 有重复数据.r │ ├── Draw_TBtools_KEGG_Pathway_Enrichment_Graph.r │ ├── Revigo本地可视化代码.r │ ├── R语言物种分布.txt │ ├── pheatmap缺失值上热图.txt │ ├── 合并展示多组富集结构.sh │ ├── 基于RSEM结果直接调用DEseq进行差异表达分析.r │ └── 绘制GO富集柱形图R码 ├── ShellScript │ ├── OneKP数据下载 │ ├── smallRNAadapterRM.sh │ └── 使用genBlast从所有植物基因组中提取目标基因家族成员.sh └── UsageOfSomeCliSoftware │ └── VARNA │ ├── Color.jpg │ ├── default.png │ └── 命令行使用VARNA可视化miRNA结构.md ├── LICENSE └── README.md /BioScripts/PerlScript/ColExtracter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | my $usage = " 4 | perl $0 matrix.tab id.tab out.tab.xls 5 | "; 6 | my $matrix = shift; 7 | my $ids = shift; 8 | my $out = shift; 9 | die $usage unless -s $matrix && -s $ids && $out; 10 | open ID,'<',$ids or die "Can't read id.tab $ids\n"; 11 | my %ids; 12 | while(){ 13 | s/\s*\r?\n$//; 14 | $ids{$_}=1; 15 | } 16 | close ID; 17 | open MAT,'<',$matrix or die "Can't read matrix.tab $matrix\n"; 18 | my $idLine = ; 19 | $idLine=~s/\s*\r?\n$//; 20 | my @colNames = split /\t/,$idLine; 21 | my @needCols; 22 | my $curIndex =0; 23 | for(@colNames){ 24 | if($ids{$_}){ 25 | push @needCols,$curIndex; 26 | } 27 | $curIndex++; 28 | } 29 | # print @needCols; 30 | seek MAT,0,0; 31 | open OUT,'>',$out or die "Can't write 
into outfile $out\n"; 32 | while(){ 33 | s/\s*\r?\n$//; 34 | my @curCols = split /\t/,$_; 35 | my @printArr; 36 | for(@needCols){ 37 | push @printArr,$curCols[$_]; 38 | } 39 | print OUT join "\t",@printArr; 40 | print OUT "\n"; 41 | } 42 | close MAT; 43 | close OUT; -------------------------------------------------------------------------------- /BioScripts/PerlScript/ManagerAndWorker/ReadMe.md: -------------------------------------------------------------------------------- 1 | ## ManagerAndWorker 2 | 3 | #### Why write these ugly scripts? 4 | 5 | ​ Most of the time, the server is idle, just like an idler, wasting our time and money. We should put it to work. 6 | 7 | #### How to use these scripts? 8 | 9 | 1. prepare a command list, in which there is one shell command per line 10 | 11 | >echo "Job1" 12 | > 13 | >perl assembly.pl in.fq out.fa 14 | > 15 | >perl align.pl in.fq out.fa out.bam 16 | > 17 | >..... 18 | 19 | 2. run a command like the one below 20 | 21 | ```bash 22 | nohup perl manager.pl 'perl worker.pl command.list' 23 | # or I would prefer to run this command under tmux....
24 | ``` 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /BioScripts/PerlScript/ManagerAndWorker/command.list.txt: -------------------------------------------------------------------------------- 1 | echo 1 2 | echo 123 3 | ohc 4 | perl -le 'print "inPerl..."' 5 | print aba 6 | echo aaa 7 | -------------------------------------------------------------------------------- /BioScripts/PerlScript/ManagerAndWorker/manager.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | ###################### 再撰写一个脚本 manager.pl 用于监测系统负载 并 调用 worker进行工作 ################## 3 | use strict; 4 | use POSIX; 5 | my $year_month_day="[".strftime("%Y-%m-%d %H:%M:%S",localtime())."] "; 6 | my $usage = " 7 | perl $0 'Command' 8 | "; 9 | my $loadAvgLimit = 2; 10 | my $command =shift; 11 | die $usage unless $loadAvgLimit and $command; 12 | 13 | while(1){ 14 | # 查看系统负载 15 | my @loadAvg=split /\s+/,`cat /proc/loadavg`; 16 | my $maxLoad = 0; 17 | for(0..2){ 18 | if($maxLoad<$loadAvg[$_]){ 19 | $maxLoad = $loadAvg[$_]; 20 | } 21 | } 22 | if($maxLoad>$loadAvgLimit){ 23 | print $year_month_day."maxLoadAvg is $maxLoad, Higher than given $loadAvgLimit,\nQuit and Wait for Next Check\n"; 24 | }else{ 25 | print $year_month_day."maxLoadAvg is $maxLoad, Ready to Work.\n"; 26 | my @result =`$command`; 27 | print @result; 28 | if($result[0] =~/No Command To Execute/){ 29 | print $year_month_day."All Command Were Executed, Quit\n"; 30 | exit; 31 | } 32 | } 33 | # sleep 1000*60*60; 34 | sleep 2; # 休眠1s 35 | } 36 | -------------------------------------------------------------------------------- /BioScripts/PerlScript/ManagerAndWorker/worker.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | ######## 撰写脚本 workder.pl , 主要检测命令列表,逐个执行,无论成功或者失败,每个命令只执行一次,直到获得 成功或者失败的结果 ####### 3 | use strict; 4 | use POSIX; 5 | my 
$year_month_day="[".strftime("%Y-%m-%d %H:%M:%S",localtime())."] "; 6 | my $usage = " 7 | perl $0 ShellCommandPerLine 8 | "; 9 | my $commandList =shift; 10 | die $usage unless -s $commandList; 11 | 12 | # 创建一个文件, 表示当前脚本正在运行,不允许两个脚本同时运行 13 | my $flagFile = "TimeJobRunning.CJ.Worker"; 14 | if(-e $flagFile){ 15 | print $year_month_day."One Job is already running...\nQuite and wait for next check time..."; 16 | exit; 17 | }else{ 18 | open OUT,'>',$flagFile or die "Can't create RunStatus Checking File\n"; 19 | close OUT; 20 | } 21 | my $finishedCommondFile = "FinishedCommands.txt.CJ.Worker"; 22 | my @finishedCommands; 23 | if(-s $finishedCommondFile){ 24 | open ALFINISHED,'<',$finishedCommondFile or die "Can't read input finishedCommondFile\n"; 25 | while(){ 26 | chomp; 27 | push @finishedCommands,$_; 28 | } 29 | close ALFINISHED; 30 | } 31 | my $failCommandFile = "FailCommands.txt.CJ.Worker"; 32 | my @failedCommands; 33 | if(-s $failCommandFile){ 34 | open ALFAIL,'<',$failCommandFile or die "Can't read input failCommandFile"; 35 | while(){ 36 | chomp; 37 | push @failedCommands,$_; 38 | } 39 | close ALFAIL; 40 | } 41 | 42 | 43 | open IN,'<',$commandList or die "Can't read input $commandList\n"; 44 | my $commandToRun = ""; 45 | while(){ 46 | chomp; 47 | my $curCommand = $_; 48 | # 跳过成功了的 ,或者是失败了的 49 | next if grep {$curCommand eq $_} (@finishedCommands,@failedCommands); 50 | $commandToRun = $curCommand; 51 | last; 52 | } 53 | close IN; 54 | if(!$commandToRun){ 55 | print $year_month_day."No Command To Execute...\n"; 56 | }else{ 57 | my $ret = system $commandToRun; 58 | if($ret){ 59 | print $year_month_day."Execute command [$commandToRun] Failed...\n"; 60 | push @failedCommands,$commandToRun; 61 | }else{ 62 | print $year_month_day."Execute command [$commandToRun] Finished...\n"; 63 | push @finishedCommands,$commandToRun; 64 | } 65 | # 将完成的 或者是 失败了的 命令保存 66 | open FINISHED,'>',$finishedCommondFile or die "Can't write input finishedCommondFile"; 67 | print FINISHED $_,"\n" 
for @finishedCommands; 68 | close FINISHED; 69 | open FAILED,'>',$failCommandFile or die "Can't write into failedCommandsFile"; 70 | print FAILED $_,"\n" for @failedCommands; 71 | close FAILED; 72 | } 73 | # 删除掉Flag文件 74 | unlink $flagFile; -------------------------------------------------------------------------------- /BioScripts/PerlScript/bowtieOut2RegionCounts.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | my $usage = " 4 | perl $0 input.collasped.bowtie.out region.Range region.RPTM 5 | USAGE: 6 | perl $0 input.collasped.bowtie.out region.Range region.RPTM 7 | 8 | region.Range: 9 | scaffold886:82000-97000 10 | ... 11 | "; 12 | my $bowtie = shift; 13 | my $region = shift; 14 | my $rptm = shift; 15 | 16 | # my $readMappedTime = shift; 17 | 18 | die $usage unless -s $bowtie and -s $region and $rptm; 19 | 20 | # my ($scaffold,$start,$end) = map {/(\S+):(\d+)-(\d+)/} $region; 21 | my %region; 22 | open REGION,'<',$region or die "Can't read input region file"; 23 | while(){ 24 | chomp; 25 | my ($scaffold,$start,$end) = map {/(\S+):(\d+)-(\d+)/} $_; 26 | # print ($scaffold,$start,$end); 27 | push @{$region{$scaffold}},[$start,$end]; 28 | } 29 | 30 | close REGION; 31 | 32 | 33 | open BOWTIE,'<',$bowtie or die "Can't read bowtie output file $bowtie\n"; 34 | # open OUT,'>',$out or die "Can't write into file:$out\n"; 35 | 36 | # totalMapped Read Counts 37 | my %seenRead = (); 38 | my $libSize = 0; 39 | my %regionCount; 40 | while(){ 41 | # chomp; 42 | my ($readAndCount,$strand,$curScaffold,$curStart,$readSeq,$readQual,$tallyCount) = split /\t/,$_; 43 | my ($readId,$readCount) = map {/^(\S+)-(\d+)$/} $readAndCount; 44 | $libSize += $readCount unless $seenRead{$readAndCount}++; 45 | 46 | if($region{$curScaffold}){ 47 | # print ($readAndCount,$strand,$curScaffold,$curStart,$readSeq,$readQual,$tallyCount); 48 | for(@{$region{$curScaffold}}){ 49 | my ($start,$end) = @{$_}; 50 | 51 | next if 
$curStart > $end; 52 | next if $curStart+length($readSeq) < $start; 53 | 54 | push @{$regionCount{"$curScaffold:$start-$end"}},[$readAndCount,$strand,$curScaffold,$curStart,$readSeq,$readQual,$tallyCount]; 55 | } 56 | } 57 | 58 | } 59 | # print STDERR $libSize,"\n"; 60 | close BOWTIE; 61 | 62 | open RPTM,'>',$rptm or die "Can't write into $rptm file\n"; 63 | for (sort keys %regionCount){ 64 | my $region = $_; 65 | 66 | my $averageCounts = 0; 67 | my $totalCounts = 0; 68 | 69 | for(@{$regionCount{$_}}){ 70 | my ($readAndCount,$strand,$curScaffold,$curStart,$readSeq,$readQual,$tallyCount) = @{$_}; 71 | my ($readId,$readCount) = map {/^(\S+)-(\d+)$/} $readAndCount; 72 | # 对于多匹配的结果,直接平均分配 73 | $totalCounts+=$readCount; 74 | $averageCounts+=($readCount/$seenRead{$readAndCount}); 75 | } 76 | # 输出总的计数(包含多匹配),平均计数(多匹配平均分配),RPTM 77 | # print $_,"\t",$totalCounts,"\t",$averageCounts,"\t",$averageCounts*10000000/$libSize,"\n"; 78 | 79 | # 80 | print RPTM $_,"\t",$averageCounts*10000000/$libSize,"\n"; 81 | } 82 | close RPTM; 83 | 84 | __END__ 85 | # 86 | ls *.genome_mapping.txt|parallel -j 40 perl bowtieOut2RegionCount.pl 87 | 88 | -------------------------------------------------------------------------------- /BioScripts/PerlScript/checkMIRstructure.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use RNA; # Vienna RNA package perl Interface 4 | my $usage = " 5 | perl $0 possible.percursor.info out 6 | "; 7 | 8 | my $info = shift; 9 | my $out = shift; 10 | die $usage unless -s $info and $out; 11 | open OUT,'>',$out or die "Can't write into $out"; 12 | open INFO,'<',$info or die "Can't read input info file:$info"; 13 | while(){ 14 | chomp; 15 | my ($seqId,$percursor,$tag) = split /\t/,$_; 16 | my $stru_check = &stru_screen($percursor, $tag); 17 | print OUT $_,"\n" if $stru_check; 18 | } 19 | close INFO; 20 | close OUT; 21 | 22 | 23 | 24 | 25 | ########### 26 | # 使用夏老师 miRso.pl的一些子函数 27 | 
############ 28 | 29 | 30 | # define which arm tag reside, return (5p/3p/-) 31 | sub which_arm { 32 | my $substruct=shift; 33 | my $arm; 34 | if ($substruct=~/\(/ && $substruct=~/\)/) { 35 | $arm="-"; 36 | } 37 | elsif ($substruct=~/\(/) { 38 | $arm="5p"; 39 | } 40 | else { 41 | $arm="3p"; 42 | } 43 | return $arm; 44 | } 45 | # compute biggest bulge size 46 | sub biggest_bulge { 47 | my $struct=shift; 48 | my $bulge_size=0; 49 | my $max_bulge=0; 50 | while ($struct=~/(\.+)/g) { 51 | $bulge_size=length $1; 52 | if ($bulge_size > $max_bulge) { 53 | $max_bulge=$bulge_size; 54 | } 55 | } 56 | return $max_bulge; 57 | } 58 | 59 | # compute asymmetry 60 | sub get_asy { 61 | my($table,$a1,$a2)=@_; 62 | my ($pre_i,$pre_j); 63 | my $asymmetry=0; 64 | foreach my $i ($a1..$a2) { 65 | if (defined $table->{$i}) { 66 | my $j=$table->{$i}; 67 | if (defined $pre_i && defined $pre_j) { 68 | my $diff=($i-$pre_i)+($j-$pre_j); 69 | $asymmetry += abs($diff); 70 | } 71 | $pre_i=$i; 72 | $pre_j=$j; 73 | } 74 | } 75 | return $asymmetry; 76 | } 77 | sub stru_screen { 78 | my $MAX_ENERGY = -18; 79 | my $MIN_SPACE= 5; # minimal space size between miRNA and miRNA* 80 | my $MAX_SPACE = 200; # maximal space size between miRNA and miRNA* 81 | my $MIN_PAIR = 14; # minimal pair of miRNA and miRNA* 82 | my $MAX_BULGE = 4; # maximal bulge of miRNA and miRNA* 83 | my $MAX_SIZEDIFF = 4; # maximal size different of miRNA and miRNA* 84 | my $MAX_ASY = 5; # maximal asymmetry of miRNA/miRNA* duplex 85 | my ($seq, $tag) = @_; 86 | if ($seq !~ /$tag/) { 87 | print "$seq => $tag####\n"; 88 | } 89 | my ($struct,$mfe)=RNA::fold($seq); 90 | my $tag_beg=index($seq,$tag,0)+1; 91 | my $tag_end=$tag_beg+length($tag)-1; 92 | my $tag_length = length $tag; 93 | my $pass=0; 94 | if ($mfe > $MAX_ENERGY) { #fold energy filter; 95 | return $pass; 96 | } 97 | # check for mature; 98 | my $tag_struct=substr($struct,$tag_beg-1,$tag_length); 99 | my $tag_arm=which_arm($tag_struct); 100 | my $tag_unpair=$tag_struct=~tr/.//; 101 
| my $tag_pair=$tag_length-$tag_unpair; 102 | my $tag_max_bulge=biggest_bulge($tag_struct); 103 | if ($tag_arm eq "-") {return $pass} #not a single arm; 104 | if ($tag_pair < $MIN_PAIR) {return $pass} 105 | if ($tag_max_bulge > $MAX_BULGE) {return $pass} 106 | 107 | # build base pairing table 108 | my %pairtable; 109 | &parse_struct($struct,\%pairtable); # coords count from 1 110 | #print "$seq => $struct => $tag\n"; 111 | # check for star 112 | my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end); 113 | my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1); 114 | my $star_length=$star_end-$star_beg+1; 115 | my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1); 116 | my $star_arm=which_arm($star_struct); 117 | my $star_unpair=$star_struct=~tr/.//; 118 | my $star_pair=$star_length-$star_unpair; 119 | my $star_max_bulge=biggest_bulge($star_struct); 120 | if ($star_arm eq "-") {return $pass} 121 | if ($star_pair < $MIN_PAIR) {return $pass} 122 | if ($star_max_bulge > $MAX_BULGE) {return $pass} 123 | 124 | # space size between miR and miR* 125 | my $space; 126 | if ($tag_beg < $star_beg) { 127 | $space=$star_beg-$tag_end-1; 128 | } 129 | else { 130 | $space=$tag_beg-$star_end-1; 131 | } 132 | if ($space < $MIN_SPACE) {return $pass} 133 | if ($space > $MAX_SPACE) {return $pass} 134 | 135 | # size diff 136 | my $size_diff=abs($tag_length-$star_length); 137 | if ($size_diff > $MAX_SIZEDIFF) {return $pass} 138 | 139 | # asymmetry 140 | my $asy=get_asy(\%pairtable,$tag_beg,$tag_end); 141 | if ($asy > $MAX_ASY) {return $pass} 142 | 143 | $pass=1; 144 | return $pass; 145 | } 146 | 147 | # build base pair table, coors count from 1 148 | sub parse_struct { 149 | my $struct=shift; 150 | my $table=shift; 151 | 152 | my @t=split('',$struct); 153 | my @lbs; # left brackets 154 | foreach my $k (0..$#t) { 155 | if ($t[$k] eq "(") { 156 | push @lbs, $k+1; 157 | } 158 | elsif ($t[$k] eq ")") { 159 | my $lb=pop @lbs; 160 | my $rb=$k+1; 161 | $table->{$lb}=$rb; 
162 | $table->{$rb}=$lb; 163 | } 164 | } 165 | if (@lbs) { 166 | warn "unbalanced RNA struct.\n"; 167 | } 168 | } 169 | # given a sub-region, get its star region, 2 nt 3' overhang 170 | sub get_star { 171 | my($table,$beg,$end)=@_; 172 | 173 | my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2 174 | foreach my $i ($beg..$end) { 175 | if (defined $table->{$i}) { 176 | my $j=$table->{$i}; 177 | $s1=$i; 178 | $s2=$j; 179 | last; 180 | } 181 | } 182 | foreach my $i (reverse ($beg..$end)) { 183 | if (defined $table->{$i}) { 184 | my $j=$table->{$i}; 185 | $e1=$i; 186 | $e2=$j; 187 | last; 188 | } 189 | } 190 | # print "$s1,$e1 $s2,$e2\n"; 191 | 192 | # correct terminus 193 | my $off1=$s1-$beg; 194 | my $off2=$end-$e1; 195 | $s2+=$off1; 196 | $s2+=2; # 081009 197 | $e2-=$off2; 198 | $e2=1 if $e2 < 1; 199 | $e2+=2; 200 | $e2=1 if $e2 < 1; # 081009 201 | ($s2,$e2)=($e2,$s2) if ($s2 > $e2); 202 | return ($s2,$e2); 203 | } 204 | -------------------------------------------------------------------------------- /BioScripts/PerlScript/downloadSRA.pl: -------------------------------------------------------------------------------- 1 | use LWP::Simple; 2 | 3 | my $usage = " 4 | perl $0 startDate endDate 5 | Note: 6 | Download sra.sampleInfo.xml file as specific time region: 7 | eg. 8 | perl $0 2017/01/01 2017/12/31 9 | "; 10 | my $startDate = shift; 11 | my $endDate = shift; 12 | die $usage unless $startDate and $endDate; 13 | my $timeStamp = $startDate."_".$endDate; 14 | $timeStamp =~ tr/\//_/; 15 | $query='(("green plants"[Organism]) AND ("biomol rna"[Properties])) AND (("rna seq"[Strategy]) NOT ("mirna seq"[Strategy])) AND (("bgiseq"[Platform]) OR ("illumina"[Platform])) AND ("'.$startDate.'"[Publication Date] : "'.$endDate.'"[Publication Date])'; 16 | #assemble the esearch URL 17 | $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'; 18 | $url = $base . 
"esearch.fcgi?db=sra&term=$query&usehistory=y"; 19 | # $url=~s/\s+/+/g; 20 | # print "$url\n"; 21 | #post the esearch URL 22 | $output = get($url); 23 | # print $output,"\n"; 24 | #parse WebEnv, QueryKey and Count (# records retrieved) 25 | $web = $1 if ($output =~ /(\S+)<\/WebEnv>/); 26 | $key = $1 if ($output =~ /(\d+)<\/QueryKey>/); 27 | $count = $1 if ($output =~ /(\d+)<\/Count>/); 28 | #retrieve data in batches of 500 29 | $retmax = 500; 30 | for ($retstart = 0; $retstart < $count; $retstart += $retmax) { 31 | $efetch_url = $base ."efetch.fcgi?db=sra&WebEnv=$web"; 32 | $efetch_url .= "&query_key=$key&retstart=$retstart"; 33 | $efetch_url .= "&retmax=$retmax&rettype=xml&retmode=full"; 34 | $efetch_out = get($efetch_url); 35 | open(OUT, ">sra.".$timeStamp.".".$retstart.".xml") || die "Can't open file!\n"; 36 | binmode OUT,":utf8"; 37 | print OUT "$efetch_out"; 38 | close OUT; 39 | # 40 | sleep int rand(10)+5; 41 | } 42 | -------------------------------------------------------------------------------- /BioScripts/PerlScript/generateCommandFromSRATabTable.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | my $usage = " 4 | perl $0 in.sra.table.tab.xls 5 | Note: 6 | in.sra.table.tab.xls might be transformet with R/Excel from sra.table.csv down load from NCBI 7 | "; 8 | my $sraTable = shift; 9 | 10 | my $TRINITI_MAIN = "/tools/trinityrnaseq-Trinity-v2.4.0/Trinity"; 11 | my $aspe_MAIN = "~/.aspera/cli/bin/ascp"; 12 | my $ascp_sshKey = "~/.aspera/cli/etc/asperaweb_id_dsa.openssh"; 13 | my $trimmomaticAdapters = "/tools/Trimmomatic-0.33/adapters/merged_adapted.fa"; 14 | 15 | 16 | die $usage unless -s $sraTable; 17 | open IN,'<',$sraTable or die "Can't read sra table in tab format"; 18 | my %maxDataSize; 19 | while(){ 20 | my 
($Run,$ReleaseDate,$LoadDate,$spots,$bases,$spots_with_mates,$avgLength,$size_MB,$AssemblyName,$download_path,$Experiment,$LibraryName,$LibraryStrategy,$LibrarySelection,$LibrarySource,$LibraryLayout,$InsertSize,$InsertDev,$Platform,$Model,$SRAStudy,$BioProject,$Study_Pubmed_id,$ProjectID,$Sample,$BioSample,$SampleType,$TaxID,$ScientificName,$SampleName,$g1k_pop_code,$source,$g1k_analysis_group,$Subject_ID,$Sex,$Disease,$Tumor,$Affection_Status,$Analyte_Type,$Histological_Type,$Body_Site,$CenterName,$Submission,$dbgap_study_accession,$Consent,$RunHash,$ReadHash) = split /\t/,$_; 21 | if($maxDataSize{$ScientificName}<$size_MB){ 22 | $maxDataSize{$ScientificName} = $size_MB; 23 | } 24 | } 25 | seek IN,0,0; 26 | my %processedSpe; 27 | while(){ 28 | my ($Run,$ReleaseDate,$LoadDate,$spots,$bases,$spots_with_mates,$avgLength,$size_MB,$AssemblyName,$download_path,$Experiment,$LibraryName,$LibraryStrategy,$LibrarySelection,$LibrarySource,$LibraryLayout,$InsertSize,$InsertDev,$Platform,$Model,$SRAStudy,$BioProject,$Study_Pubmed_id,$ProjectID,$Sample,$BioSample,$SampleType,$TaxID,$ScientificName,$SampleName,$g1k_pop_code,$source,$g1k_analysis_group,$Subject_ID,$Sex,$Disease,$Tumor,$Affection_Status,$Analyte_Type,$Histological_Type,$Body_Site,$CenterName,$Submission,$dbgap_study_accession,$Consent,$RunHash,$ReadHash) = split /\t/,$_; 29 | next unless $size_MB>=$maxDataSize{$ScientificName}; 30 | next if $processedSpe{$ScientificName}++; 31 | # 制备shell文件 32 | my $prefix1 = substr($Run,0,3); 33 | my $prefix2 = substr($Run,0,6); 34 | $ScientificName=~s/[^a-zA-Z]/_/g; 35 | if($LibraryLayout eq "PAIRED"){ 36 | # 37 | print join ";",split /\r?\n/,<<"COMMAND" 38 | echo $Run $ScientificName 39 | $aspe_MAIN -i $ascp_sshKey -T anonftp\@ftp-private.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/$prefix1/$prefix2/$Run/$Run.sra . 
40 | fastq-dump --defline-seq '\@\$sn/\$ri' --split-files $Run.sra 41 | /bin/rm $Run.sra 42 | $TRINITI_MAIN --seqType fq --left ${Run}_1.fastq --right ${Run}_2.fastq --CPU 20 --normalize_reads --output $ScientificName.${Run}.trinity --full_cleanup --max_memory 80G --trimmomatic --quality_trimming_params "ILLUMINACLIP:$trimmomaticAdapters:2:30:10 MAXINFO:76:0.8 MINLEN:36" 43 | /bin/rm ${Run}_1.fastq ${Run}_2.fastq 44 | COMMAND 45 | }else{ 46 | # [ \$?!=0 ] && exit \$?; 47 | print join ";",split /\r?\n/,<<"COMMAND" 48 | echo $Run $ScientificName 49 | $aspe_MAIN -i $ascp_sshKey -T anonftp\@ftp-private.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/$prefix1/$prefix2/$Run/$Run.sra . 50 | fastq-dump --defline-seq '\@\$sn/\$ri' --split-files $Run.sra 51 | /bin/rm $Run.sra 52 | $TRINITI_MAIN --seqType fq --single ${Run}_1.fastq --CPU 20 --normalize_reads --output $ScientificName.${Run}.trinity --full_cleanup --max_memory 80G --trimmomatic --quality_trimming_params "ILLUMINACLIP:$trimmomaticAdapters:2:30:10 MAXINFO:76:0.8 MINLEN:36" 53 | /bin/rm ${Run}_1.fastq 54 | COMMAND 55 | } 56 | print "\n"; 57 | } 58 | close IN; 59 | 60 | 61 | __END__ 62 | echo SRR3627997 Litchi_chinensis;~/.aspera/cli/bin/ascp -i ~/.aspera/cli/etc/asperaweb_id_dsa.openssh -T anonftp@ftp-private.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/SRR/SRR362/SRR3627997/SRR3627997.sra .;fastq-dump --defline-seq '@$sn[_$rn]/$ri' --split-files SRR3627997.sra;/bin/rm SRR3627997.sra;/tools/trinityrnaseq-Trinity-v2.4.0/Trinity --seqType fq --left SRR3627997_1.fastq --right SRR3627997_2.fastq --CPU 20 --normalize_reads --output Litchi_chinensis.SRR3627997.trinity --full_cleanup --max_memory 80G --trimmomatic --quality_trimming_params "ILLUMINACLIP:/tools/Trimmomatic-0.33/adapters/merged_adapted.fa:2:30:10 MAXINFO:76:0.8 MINLEN:36";/bin/rm SRR3627997_1.fastq SRR3627997_2.fastq 63 | -------------------------------------------------------------------------------- /BioScripts/PerlScript/mergeRegionForQZ.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | my $usage = " 4 | perl $0 tableTomerge merged.pos.sorted.info merged.table 5 | Note: 6 | 7 | "; 8 | my $flankingLen = 5; 9 | my $table = shift; 10 | my $info = shift; 11 | my $result = shift; 12 | die $usage unless -s $table && $info && $result; 13 | open TAB,'<',$table or die "Can't read input table\n"; 14 | my %posAnno; 15 | while(){ 16 | s/\r?\n$//; 17 | my @cols = split /\t/,$_; 18 | my $hit = 0; 19 | my $chr = $cols[0]; 20 | my ($start,$end) = sort {$a<=>$b} (@cols[1,2]); 21 | if($posAnno{$chr}){ 22 | my @keys = sort keys %{$posAnno{$chr}}; 23 | for (@keys){ 24 | my @curRegion = split /-/,$_; 25 | if(!($start-$flankingLen>$curRegion[1]||$end+$flankingLen<$curRegion[0])){ 26 | my ($curStart,$curEnd) = (sort {$a<=>$b} ($start,$end,$curRegion[0],$curRegion[1]))[0,-1]; 27 | my @tmpArr = @{$posAnno{$chr}->{$_}}; 28 | # might delete same region... 29 | delete $posAnno{$chr}->{$_}; 30 | push @{$posAnno{$chr}->{"$curStart-$curEnd"}},(@tmpArr,[@cols]); 31 | $hit = 1; 32 | last; 33 | } 34 | } 35 | if(!$hit){ 36 | push @{$posAnno{$chr}->{"$start-$end"}},[@cols]; 37 | } 38 | }else{ 39 | push @{$posAnno{$chr}->{"$start-$end"}},[@cols]; 40 | } 41 | } 42 | close TAB; 43 | open INFO,'>',$info or die "Can't write into $info\n"; 44 | open RESULT,'>',$result or die "Can't write into $result\n"; 45 | for my $chr (keys %posAnno){ 46 | for (sort keys %{$posAnno{$chr}}){ 47 | # print $chr,"\t",$_,"\n"; 48 | print INFO "\n"; 49 | my $region = $_; 50 | print INFO join "\t",$chr,$region,@{$_},+"\n" for @{$posAnno{$chr}->{$_}}; 51 | # 52 | my @groupRecord = @{$posAnno{$chr}->{$_}}; 53 | @groupRecord = sort {$a->[4] <=> $b->[4]} @groupRecord; 54 | print RESULT "\t",@{$groupRecord[-1]}; 55 | print RESULT "\n"; 56 | 57 | } 58 | } 59 | close INFO; 60 | close RESULT; 61 | -------------------------------------------------------------------------------- 
/BioScripts/PerlScript/regionGrouper.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | my $usage = " 4 | perl $0 in.table chrColIndex StrandColIndex StartPosIndex EndPosIndex RegionExtendLen 5 | #Note: 6 | In English: 7 | Read a table and indexes of four columns and an optional extendLen to Group lines, 8 | The groupID will be appeded to all lines. 9 | 10 | extendLen means to extends length of all records 11 | 12 | <===regionStart...regionEnd===> 13 | 14 | In Chinese: 15 | 输入一个表格,输出结果会在最后一列添加一个GroupID 16 | "; 17 | my $table = shift; 18 | my $chrIndex = shift; 19 | my $strandIndex = shift; 20 | my $startPos = shift; 21 | my $endPos = shift; 22 | my $extendRegionLen = shift; 23 | # 24 | $extendRegionLen = 0 unless defined $extendRegionLen; 25 | 26 | die $usage unless -s $table; 27 | open TABLE,'<',$table or die "Can't read input table:$table\n"; 28 | 29 | my $groupCounts = 0; 30 | my @group = map {chomp;[split /\t/,$_]} ; 31 | 32 | my $preChr = ""; 33 | my $preStrand = ""; 34 | my $preStart = 0; 35 | my $preEnd = 0; 36 | 37 | my %grouper; 38 | 39 | for(sort { 40 | $a->[$chrIndex] cmp $b->[$chrIndex] 41 | || 42 | $a->[$strandIndex] cmp $b->[$strandIndex] 43 | || 44 | $a->[$startPos] <=> $b->[$startPos] 45 | || 46 | $a->[$endPos] <=> $b->[$endPos] 47 | } 48 | @group){ 49 | # print join ("\t",@{$_}),"\n"; 50 | # my @curCols = @{$_}; 51 | 52 | my $curChr = $_->[$chrIndex]; 53 | my $curStrand = $_->[$strandIndex]; 54 | my $curStart = $_->[$startPos]; 55 | my $curEnd = $_->[$endPos]; 56 | 57 | if($preChr eq $curChr && $preStrand eq $curStrand && $preEnd+$extendRegionLen >= $curStart-$extendRegionLen){ 58 | # already sorted, only need to compare preEnd and curStart 59 | }else{ 60 | $groupCounts++; 61 | } 62 | 63 | $preChr = $curChr; 64 | $preStrand = $curStrand; 65 | $preStart = $curStart; 66 | $preEnd = $curEnd; 67 | 68 | # push @{$grouper{$groupCounts}},$_; 69 | print join 
("\t",@{$_},$groupCounts),"\n"; 70 | 71 | } 72 | 73 | close TABLE; -------------------------------------------------------------------------------- /BioScripts/PerlScript/tableTomerge.txt: -------------------------------------------------------------------------------- 1 | chrX 307359 307880 circexplorer2 1 - 2 | chrX 361404 362332 circexplorer2 1 - 3 | chrX 2392353 2425304 circexplorer2 1 - 4 | chrX 2719660 2726373 circexplorer2 1 + 5 | chrX 2719660 2738256 circexplorer2 1 + 6 | chrX 9890976 9898290 circexplorer2 1 + 7 | chrX 10063444 10079804 circexplorer2 3 + 8 | chrX 10063444 10094281 circexplorer2 2 + 9 | chrX 10063444 10098579 circexplorer2 4 + 10 | chrX 10063444 10116507 circexplorer2 1 + 11 | chrX 10090770 10098579 circexplorer2 2 + 12 | chrX 11156528 11196996 circexplorer2 2 - 13 | chrX 11169504 11188984 circexplorer2 1 - 14 | chrX 13666316 13668383 circexplorer2 1 + 15 | chrX 13666316 13680598 circexplorer2 4 + 16 | chrX 13735247 13763855 circexplorer2 2 + 17 | chrX 13744414 13762444 circexplorer2 1 + 18 | chrX 14850504 14859334 circexplorer2 2 - 19 | chrX 14908859 14911413 circexplorer2 1 + 20 | chrX 15845378 15846011 circexplorer2 1 - 21 | chrX 16693140 16698336 circexplorer2 1 - 22 | chrX 16693140 16702941 circexplorer2 1 - 23 | chrX 16698235 16702941 circexplorer2 2 - 24 | chrX 16735226 16743840 circexplorer2 1 + 25 | chrX 17006220 17029623 circexplorer2 1 + 26 | chrX 17025058 17052445 circexplorer2 1 + 27 | chrX 17025061 17103779 circexplorer2 1 + 28 | chrX 17029525 17062532 circexplorer2 1 + 29 | chrX 17103717 17138961 circexplorer2 6 + 30 | chrX 19965043 19970298 circexplorer2 2 - 31 | chrX 20162963 20186366 circexplorer2 1 - 32 | chrX 21516484 21563452 circexplorer2 3 + 33 | chrX 22076387 22133624 circexplorer2 1 + 34 | chrX 24065997 24068108 circexplorer2 1 + 35 | chrX 24161714 24179770 circexplorer2 1 + 36 | chrX 24172714 24179770 circexplorer2 3 + 37 | chrX 24494741 24534068 circexplorer2 1 + 38 | chrX 24809897 24821583 circexplorer2 1 + 39 | 
chrX 31134101 31172413 circexplorer2 1 - 40 | chrX 31478105 31479103 circexplorer2 1 - 41 | chrX 32287528 32310276 circexplorer2 1 - 42 | chrX 41144521 41151057 circexplorer2 1 + 43 | chrX 41214567 41219231 circexplorer2 1 + 44 | chrX 41217219 41223402 circexplorer2 2 + 45 | chrX 41636577 41671530 circexplorer2 1 - 46 | chrX 41660438 41671530 circexplorer2 1 - 47 | chrX 43683512 43712796 circexplorer2 1 + 48 | chrX 45082575 45090864 circexplorer2 1 + 49 | chrX 47024723 47033788 circexplorer2 1 + 50 | chrX 47846104 47895940 circexplorer2 1 + 51 | chrX 48598957 48599352 circexplorer2 3 + 52 | chrX 53399588 53403893 circexplorer2 1 - 53 | chrX 53557381 53559060 circexplorer2 1 - 54 | chrX 53561755 53562929 circexplorer2 2 - 55 | chrX 53595186 53607699 circexplorer2 1 - 56 | chrX 53614533 53614745 circexplorer2 2 - 57 | chrX 53614533 53615835 circexplorer2 21 - 58 | chrX 53645310 53654131 circexplorer2 6 - 59 | chrX 53645310 53680186 circexplorer2 1 - 60 | chrX 53987077 53987944 circexplorer2 2 - 61 | chrX 54232808 54239099 circexplorer2 1 - 62 | chrX 54455447 54465852 circexplorer2 1 - 63 | chrX 63655493 63678572 circexplorer2 1 - 64 | chrX 63665885 63678572 circexplorer2 1 - 65 | chrX 63665885 63724711 circexplorer2 2 - 66 | chrX 68063853 68073299 circexplorer2 2 - 67 | chrX 68096869 68194498 circexplorer2 1 - 68 | chrX 68511848 68522917 circexplorer2 1 + 69 | chrX 71381742 71387461 circexplorer2 1 + 70 | chrX 71423116 71424238 circexplorer2 3 + 71 | chrX 73577406 73584572 circexplorer2 1 + 72 | chrX 75379119 75478985 circexplorer2 2 - 73 | chrX 77557450 77664778 circexplorer2 2 - 74 | chrX 77599410 77600564 circexplorer2 3 - 75 | chrX 77616612 77664778 circexplorer2 1 - 76 | chrX 77633206 77656653 circexplorer2 3 - 77 | chrX 77633206 77664778 circexplorer2 1 - 78 | chrX 77652113 77656653 circexplorer2 6 - 79 | chrX 77681519 77685006 circexplorer2 2 - 80 | chrX 77988241 77989958 circexplorer2 1 + 81 | chrX 77988241 78003236 circexplorer2 1 + 82 | chrX 78014661 
78020398 circexplorer2 1 + 83 | chrX 78042788 78043313 circexplorer2 1 + 84 | chrX 80288905 80310233 circexplorer2 8 - 85 | chrX 80289663 80310233 circexplorer2 1 - 86 | chrX 80707426 80719656 circexplorer2 2 - 87 | chrX 85067126 85074391 circexplorer2 4 + 88 | chrX 85255258 85268341 circexplorer2 1 + 89 | chrX 85303405 85308216 circexplorer2 4 - 90 | chrX 85345859 85367766 circexplorer2 2 - 91 | chrX 97072940 97114965 circexplorer2 2 + 92 | chrX 97099693 97114965 circexplorer2 1 + 93 | chrX 101020486 101021315 circexplorer2 1 - 94 | chrX 101041316 101042312 circexplorer2 4 - 95 | chrX 106853520 106854296 circexplorer2 1 + 96 | chrX 107840669 107854704 circexplorer2 1 + 97 | chrX 115122526 115126893 circexplorer2 1 - 98 | chrX 115633999 115643508 circexplorer2 1 + 99 | chrX 118378418 118404444 circexplorer2 1 + 100 | chrX 118387339 118398470 circexplorer2 2 + 101 | chrX 118396969 118398470 circexplorer2 1 + 102 | chrX 118542724 118546116 circexplorer2 1 + 103 | chrX 118572322 118574018 circexplorer2 1 + 104 | chrX 118584734 118590302 circexplorer2 1 + 105 | chrX 118654601 118654961 circexplorer2 1 + 106 | chrX 118766529 118776511 circexplorer2 1 + 107 | chrX 119640691 119653040 circexplorer2 1 - 108 | chrX 119837756 119838527 circexplorer2 1 - 109 | chrX 119930015 119932216 circexplorer2 3 - 110 | chrX 123665641 123686714 circexplorer2 1 - 111 | chrX 123671668 123697751 circexplorer2 1 - 112 | chrX 123686547 123697751 circexplorer2 1 - 113 | chrX 123686547 123712908 circexplorer2 1 - 114 | chrX 123885630 123892773 circexplorer2 1 + 115 | chrX 124021366 124031125 circexplorer2 2 + 116 | chrX 124068563 124078058 circexplorer2 1 + 117 | chrX 129557324 129576552 circexplorer2 1 + 118 | chrX 131749305 131794466 circexplorer2 1 - 119 | chrX 134473358 134498684 circexplorer2 1 + 120 | chrX 135545422 135549805 circexplorer2 1 + 121 | chrX 135545422 135556300 circexplorer2 5 + 122 | chrX 136222392 136244795 circexplorer2 1 - 123 | chrX 136241159 136244795 circexplorer2 1 - 
124 | chrX 136344391 136351541 circexplorer2 1 + 125 | chrX 139814877 139826823 circexplorer2 1 - 126 | chrX 140783175 140784660 circexplorer2 236 + 127 | chrX 148661907 148662768 circexplorer2 1 + 128 | chrX 150638942 150663609 circexplorer2 2 + 129 | chrX 150731619 150732537 circexplorer2 2 + 130 | chrX 150814861 150816078 circexplorer2 1 - 131 | chrX 154446414 154449297 circexplorer2 1 + 132 | chrX 155506897 155514265 circexplorer2 1 - 133 | -------------------------------------------------------------------------------- /BioScripts/PythonScript/Fasta_stat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | 5 | def getGCNum(inSeq): 6 | count = 0 7 | for i in inSeq.upper(): 8 | if i == 'G' or i=='C': 9 | count+=1 10 | return count 11 | 12 | def getLowerBase(inSeq): 13 | count = 0 14 | for i in inSeq: 15 | if not i.isupper(): 16 | count+=1 17 | return count 18 | 19 | def getNCount(inSeq): 20 | count = 0 21 | for i in inSeq: 22 | if i == 'N' or i=='n': 23 | count+=1 24 | return count 25 | 26 | inFaFile = sys.argv[1] 27 | 28 | totalLen = 0 29 | lenArr = [] 30 | GCcount = 0 31 | seqCounts = 0 32 | nBaseCounts = 0 33 | lowerBaseCounts = 0 34 | 35 | curId = "" 36 | curSeq = "" 37 | firstRecordflag = True 38 | faFh = open(inFaFile,'r') 39 | for line in faFh: 40 | line = line.rstrip() 41 | if line[0] == '>': 42 | if not firstRecordflag: 43 | totalLen += len(curSeq) 44 | lenArr.append(len(curSeq)) 45 | GCcount += getGCNum(curSeq) 46 | nBaseCounts += getNCount(curSeq) 47 | lowerBaseCounts += getLowerBase(curSeq) 48 | # 49 | firstRecordflag = False 50 | seqCounts += 1 51 | curSeq="" 52 | else: 53 | curSeq = curSeq + line 54 | # collect the last seq 55 | # print(totalLen) 56 | totalLen += len(curSeq) 57 | lenArr.append(len(curSeq)) 58 | GCcount += getGCNum(curSeq) 59 | nBaseCounts += getNCount(curSeq) 60 | lowerBaseCounts += getLowerBase(curSeq) 61 | # 62 | 63 | faFh.close() 64 | 65 | # 
getMaxLen MinLen MeanLen N50 66 | lenArr.sort() 67 | # 68 | maxLen = lenArr[-1] 69 | minLen = lenArr[0] 70 | meanLen = totalLen/seqCounts 71 | if seqCounts%2: 72 | medianLen = (lenArr[seqCounts//2]+lenArr[seqCounts//2+1])/2 73 | else: 74 | medianLen = lenArr[seqCounts//2] 75 | 76 | lenArr.reverse() 77 | halfTotalLen = totalLen/2 78 | cumLen = 0 79 | for curLen in lenArr: 80 | cumLen += curLen 81 | if cumLen >= halfTotalLen: 82 | N50 = curLen 83 | break 84 | 85 | print("Total Len is:\t",totalLen) 86 | print("Total Seq Num is:\t",seqCounts) 87 | print("Total N count is:\t",nBaseCounts) 88 | print("Total lowBase is:\t",lowerBaseCounts) 89 | print("Total GC contene is:\t",GCcount/totalLen) 90 | print("maxLen is:\t",maxLen) 91 | print("minLen is:\t",minLen) 92 | print("meanLen is:\t",meanLen) 93 | print("medianLen is:\t",medianLen) 94 | print("N50 is:\t",N50) 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /BioScripts/PythonScript/GC_window_scan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | def getGC(inSeq): 4 | GCcount = 0 5 | for i in inSeq: 6 | if i=='G' or i=='C': 7 | GCcount+=1 8 | return GCcount/len(inSeq) 9 | 10 | seq = "AATGCATCGATCGATGTCGATCAGCTAGCTACGATCAGT" 11 | window_size =5 12 | step_size = 5 13 | is_skip_last = True 14 | GCList = [] 15 | kmerList = [] 16 | for curPos in range(0,len(seq),step_size): 17 | curSeq = seq[curPos:curPos+window_size] 18 | if is_skip_last and len(curSeq)[\w.]+)") 23 | fafh = open(fafile,'r') 24 | for line in fafh: 25 | line = line.strip() 26 | if line[0] == '>': 27 | print (pattern.match(line).group(1),file=faOut) 28 | else: 29 | print(line,file=faOut) 30 | fafh.close() 31 | faOut.close() 32 | # blastP 33 | blastOutFile = blastInFile+".blastOut" 34 | executeObj = subprocess.Popen("blastp -db "+targetBlastDb+ 35 | " -query "+blastInFile+ 36 | " 
-outfmt 7 -evalue 1e-5 -num_threads 10 -max_target_seqs 20 -out "+ 37 | blastOutFile,shell=True) 38 | executeObj.wait() 39 | # extractIDs and etract Sequences 40 | 41 | ### extractIDs 42 | tabFile = blastOutFile 43 | HitsIDOutFile = blastOutFile+".ids" 44 | hitfh = open (HitsIDOutFile,'w') 45 | # print("What?:\t"+tabFile) 46 | tabfh = open(tabFile,'r') 47 | uniq =set() 48 | for line in tabfh: 49 | if line[0] == '#':continue 50 | cols = line.split("\t"); 51 | if float(cols[2])>=50.0 and float(cols[10])<=1e-5: 52 | uniq.add(cols[1]) 53 | tabfh.close() 54 | 55 | for curId in uniq: 56 | print(curId,file=hitfh) 57 | 58 | hitfh.close() 59 | 60 | ### extract Seqs: 61 | fafile = targetBlastDb 62 | idfile = HitsIDOutFile 63 | extractSeqOutFile = idfile+".fas" 64 | 65 | extractSeqOutFh = open(extractSeqOutFile,"w") 66 | uniqIDs = set() 67 | fh = open (idfile,'r') 68 | for line in fh: 69 | line = line.rstrip() 70 | uniqIDs.add(line) 71 | fh.close() 72 | 73 | # print(uniqIDs) 74 | flag = False 75 | # 76 | fafh = open (fafile,'r') 77 | for line in fafh: 78 | line = line.rstrip() 79 | if line[0] == '>': 80 | flag = False 81 | if line[1:] in uniqIDs: 82 | print (line,file=extractSeqOutFh) 83 | flag = True 84 | else: 85 | if flag: 86 | print(line,file=extractSeqOutFh) 87 | 88 | fafh.close() 89 | extractSeqOutFh.close() 90 | 91 | # merge Seq 92 | mergeOutFile = extractSeqOutFile+".merged.fa" 93 | executeObj = subprocess.Popen("cat "+extractSeqOutFile+" "+blastInFile+" > "+mergeOutFile,shell=True) 94 | executeObj.wait() 95 | # muscle 96 | # extractSeqOutFile 97 | muscleOutFile = mergeOutFile+".aln" 98 | # print(mergeOutFile) 99 | executeObj = subprocess.Popen("muscle -in "+mergeOutFile+" -out "+muscleOutFile+" -maxiters 1000",shell=True) 100 | executeObj.wait() 101 | 102 | # trimal 103 | # print("Now Trimal...") 104 | trimalOutFile = muscleOutFile+".trimal.fa" 105 | executeObj = subprocess.Popen("trimal -in "+muscleOutFile+" -out "+trimalOutFile+" -automated1",shell=True) 106 | 
executeObj.wait() 107 | # FastTree 108 | treeOut = trimalOutFile+".nwk" 109 | executeObj = subprocess.Popen("FastTree "+trimalOutFile+" > "+treeOut,shell=True) 110 | executeObj.wait() 111 | 112 | 113 | -------------------------------------------------------------------------------- /BioScripts/PythonScript/Predict_miRNA_from_All_Plant_Genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | import subprocess 6 | 7 | TBtoolsPath = "/tools/TBtools_JRE1.6.jar" 8 | VARNAPath = "/tools/VARNAv3-93.jar" 9 | genomeFaPath = "/data/data2/XiaLab/allPlantGenome" 10 | 11 | def checkPresetFile(): 12 | 13 | if not (os.path.exists(TBtoolsPath) and os.path.isfile(TBtoolsPath)): 14 | print("TBtools.jar is not found as "+TBtoolsPath) 15 | sys.exit(1) 16 | if not (os.path.exists(VARNAPath) and os.path.isfile(VARNAPath)): 17 | print("VARNA.jar is not found as "+VARNAPath) 18 | sys.exit(1) 19 | if not (os.path.exists(genomeFaPath) and os.path.isdir(genomeFaPath)): 20 | print("Genome File Set is not found as "+genomeFaPath) 21 | sys.exit(1) 22 | 23 | checkPresetFile() 24 | 25 | parser = argparse.ArgumentParser() 26 | 27 | parser.add_argument("--threadNum",type=int,help="Set threadNum to be used",default=10) 28 | parser.add_argument("--ARM",help="Restrict mature miRNA ARM",choices=["BOTH","FIVE","THREE"],default="BOTH") 29 | parser.add_argument("--maxAsy",type=int,help="set maximum asymmetry",default=2) 30 | parser.add_argument("--maxMatureAsy",type=int,help="set maximum Mature asymmetry ",default=2) 31 | parser.add_argument("--maxStarAsy",type=int,help="set maximum STAR asymmetry",default=2) 32 | parser.add_argument("--minMatureAsy",type=int,help="set maximum Mature asymmetry",default=0) 33 | parser.add_argument("--minStarAsy",type=int,help="set maximum STAR asymmetry",default=0) 34 | 35 | parser.add_argument("miRNAfa") 36 | 37 | args = parser.parse_args() 38 | print(args) 
39 | 40 | # if(opt in ("-")) 41 | commands = """ 42 | TBtoolsPath=%s 43 | VARNAPath=%s 44 | genomePath=%s 45 | miRNAPaht=%s 46 | threadNum=%s 47 | checkARM=%s # BOTH|FIVE|THREE 48 | maxAsy=%s 49 | maxMatureAsy=%s 50 | maxStarAsy=%s 51 | minMatureAsy=%s 52 | minStarAsy=%s 53 | 54 | cp $TBtoolsPath . 55 | cp $VARNAPath . 56 | 57 | for file in $genomePath/*.fna;do ln -s $file;done 58 | for genome in `ls *.fna|perl -pe 's/.fna//'`;do 59 | echo $genome 60 | java -cp TBtools_JRE1.6.jar biocjava.bioDoer.miRNA.TargetSoPipe --inMIRfa $miRNAPaht --inGenomeFa $genome.fna --outTable $genome.target --isFragment true --maxThreadNum $threadNum; 61 | perl -F'\\t' -lane 'next if $seen{"$F[1],$F[2],$F[3],$F[4]"}++;print if length($F[7])==22 and index($F[7],"-")==-1' $genome.target > $genome.target.mod 62 | java -Xmx100G -cp TBtools_JRE1.6.jar biocjava.bioDoer.miRNA.MIRidentifierBasedOnTargetSoResult --inGenomeFile $genome.fna --inTargetSoFile $genome.target.mod --outPredict $genome.predict --outChecklog $genome.check --threadNum $threadNum --checkARM $checkARM --maxAsy $maxAsy --maxMatureAsy $maxMatureAsy --maxStarAsy $maxStarAsy --minMatureAsy $minMatureAsy --minStarAsy $minStarAsy 63 | cat $genome.predict|perl -lane '$start=index($F[-1],$F[2])+1;$end=$start+length($F[2]);$F[0]=~s/[^0-9a-zA-Z]/_/g;$foldSeq =$F[-1];@fold=split /\s/,`echo -n $foldSeq|/usr/local/bin/RNAfold`;$cmd=qq{java -cp VARNAv3-93.jar fr.orsay.lri.varna.applications.VARNAcmd -sequenceDBN "$fold[0]" -structureDBN "$fold[1]" -basesStyle1 "fill=#FF0000,outline=#FF0000,label=#000000,number=#FF0000" -applyBasesStyle1on }.join (q{,},$start..($end-1)).qq{ -o $F[0].$F[2].$F[3].$F[5].$F[6].jpg -border "20x30" -resolution 10};`$cmd`' 64 | done 65 | 66 | """ % (TBtoolsPath, 67 | VARNAPath, 68 | genomeFaPath, 69 | args.miRNAfa, 70 | args.threadNum, 71 | args.ARM, 72 | args.maxAsy, 73 | args.maxMatureAsy, 74 | args.maxStarAsy, 75 | args.minMatureAsy, 76 | args.minStarAsy 77 | ) 78 | shellscript=open("tmp.sh","w") 79 | 
shellscript.write(commands) 80 | shellscript.close() 81 | 82 | subprocess.call("sh tmp.sh",shell=True) 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /BioScripts/PythonScript/extract_fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import re 5 | fafile = sys.argv[1] 6 | idfile = sys.argv[2] 7 | 8 | # store all ids 9 | idfh = open(idfile,'r') 10 | idSet = set() 11 | for line in idfh: 12 | idSet.add(line.rstrip()) 13 | idfh.close() 14 | 15 | pattern = re.compile(">([\w.]+)") 16 | fafh = open(fafile,'r') 17 | hitFlag = False 18 | for line in fafh: 19 | line = line.strip() 20 | if line[0] == '>': 21 | hitFlag = False 22 | curId = pattern.match(line).group(1) 23 | if curId in idSet: 24 | hitFlag = True 25 | print (line) 26 | else: 27 | if hitFlag: 28 | print(line) 29 | fafh.close() 30 | 31 | -------------------------------------------------------------------------------- /BioScripts/PythonScript/fasta_ID_simplifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import re 5 | fafile = sys.argv[1] 6 | pattern = re.compile("(>[\w.]+)") 7 | fafh = open(fafile,'r') 8 | for line in fafh: 9 | line = line.strip() 10 | if line[0] == '>': 11 | print (pattern.match(line).group(1)) 12 | else: 13 | print(line) 14 | fafh.close() -------------------------------------------------------------------------------- /BioScripts/PythonScript/filter_BlastP_report_ID.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import re 4 | tabFile = sys.argv[1] 5 | 6 | tabfh = open(tabFile,'r') 7 | uniq =set() 8 | for line in tabfh: 9 | if line[0] == '#':continue 10 | cols = line.split("\t"); 11 | if float(cols[2])>=50.0 and float(cols[10])<=1e-5: 12 | uniq.add(cols[1]) 13 | tabfh.close() 
14 | 15 | for curId in uniq: 16 | print(curId) 17 | -------------------------------------------------------------------------------- /BioScripts/PythonScript/filter_fasta_by_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import re 5 | fafile = sys.argv[1] 6 | idfile = sys.argv[2] 7 | 8 | # store all ids 9 | idfh = open(idfile,'r') 10 | idSet = set() 11 | for line in idfh: 12 | idSet.add(line.rstrip()) 13 | idfh.close() 14 | 15 | pattern = re.compile(">([\w.]+)") 16 | fafh = open(fafile,'r') 17 | hitFlag = False 18 | for line in fafh: 19 | line = line.strip() 20 | if line[0] == '>': 21 | hitFlag = False 22 | curId = pattern.match(line).group(1) 23 | if curId not in idSet: 24 | hitFlag = True 25 | print (line) 26 | else: 27 | if hitFlag: 28 | print(line) 29 | fafh.close() -------------------------------------------------------------------------------- /BioScripts/Rscript/Boxplot.violinPlot.Point.with.Ttest.for.TwoSet.Comparison.Visulization.r: -------------------------------------------------------------------------------- 1 | custom.theme <- theme_bw() + 2 | theme( 3 | 4 | panel.border = element_blank(), 5 | panel.background = element_blank(), 6 | panel.grid.major = element_blank(), 7 | panel.grid.minor = element_blank(), 8 | axis.line.x = element_line(colour = "black", size = 0.8), 9 | axis.line.y = element_line(colour = "black", size = 0.8), 10 | axis.ticks.x = element_line(size = 0.8), 11 | axis.ticks.y = element_line(size = 0.8), 12 | axis.text.x = element_text( 13 | angle = 30, hjust = 1, vjust = 1 14 | ), 15 | legend.position = "none", 16 | legend.key = element_blank(), 17 | legend.title = element_blank(), 18 | legend.text = element_text(size = 12, face = "bold"), 19 | legend.background = element_rect(fill = "transparent"), 20 | strip.background = element_rect( 21 | colour = "white", fill = "white", 22 | size = 0.2 23 | ), 24 | strip.text.x = element_text(size = 14), 25 
| strip.text.y = element_text(size = 14), 26 | 27 | text = element_text( 28 | size = 14, 29 | #family = "arial", 30 | face = "bold" 31 | ), 32 | plot.title = element_text( 33 | size = 16, 34 | #family = "arial", 35 | face = "bold" 36 | ) 37 | ) 38 | 39 | 40 | # 绘制小提琴图,查看有miR2275的物种 中 靶位点 和 没有 miR2275 的 物种中 靶位点的数目差异 41 | violinData<-containSpe[Target4Counts$Species%in%containSpe$Species,] 42 | # violinData<-violinData[violinData$TargetSiteRatio!=0,] 43 | violinData<-merge(violinData,Target4Counts) 44 | violinData<-violinData[violinData$GenomeSizeMb>=100,] 45 | 46 | head(violinData) 47 | 48 | p.value<-t.test(violinData[violinData$isContained==1,]$TargetSiteRatio,violinData[violinData$isContained!=1,]$TargetSiteRatio)$p.value 49 | p.value 50 | 51 | library(ggthemes) 52 | 53 | ggplot(violinData)+ 54 | geom_violin(aes(x=factor(isContained),fill=factor(isContained),y=TargetSiteRatio),color=I("grey70"),size=1.3,alpha=I(0.7)) + 55 | # geom_jitter(aes(x=factor(isContained),y=TargetSiteRatio),color=I("black"),size=4) + 56 | geom_boxplot(aes(x=factor(isContained),y=TargetSiteRatio,fill=factor(isContained)),size=1.5,width=0.2)+ 57 | geom_jitter(aes(x=factor(isContained),y=TargetSiteRatio,fill=factor(isContained),color=I("grey50")),size=2.5,shape=21,stroke=1.2,alpha=I(0.7))+ 58 | 59 | 60 | geom_text(x=1.5,y=max(violinData$TargetSiteRatio)*1.05,label=format(p.value,scientific=TRUE,digit=3))+ 61 | geom_segment(x=0.8,xend=2.2,y=max(violinData$TargetSiteRatio)*1.08,yend=max(violinData$TargetSiteRatio)*1.08,size=1.3)+ 62 | geom_text(x=1.5,y=max(violinData$TargetSiteRatio)*1.11,label="**",size=10)+ 63 | ylim(0,max(violinData$TargetSiteRatio)*1.18)+ 64 | xlab("")+ 65 | scale_x_discrete(labels=c("NotContained","Contained"))+ 66 | scale_fill_manual(values=c("#db625e","#78a1c7"))+custom.theme 67 | 68 | ggsave("violin.pdf",w=4.95,h=8.24) -------------------------------------------------------------------------------- /BioScripts/Rscript/DEseq2 有重复数据.r: 
-------------------------------------------------------------------------------- 1 | ############# 2 | # DEseq2 有重复数据 3 | ############# 4 | 5 | # 设定工作目录 -- 随后所有的文件从这个 6 | setwd(".") 7 | ######## 8 | # 从RSEM结果整理出Counts数据 9 | ########## 10 | targetFileLists<-list.files(pattern = "*.genes.results") # 这个模式可以设定为1个参数 11 | targetFileLists 12 | counts<-read.table(targetFileLists[1],header=T)[,1]; 13 | for(i in 1:length(targetFileLists)){ 14 | counts<-data.frame(counts,read.table(targetFileLists[i],header=T,sep="\t")[,5]); 15 | } 16 | names(counts)<-c("ID",targetFileLists) # 这里可以设定为1个参数 17 | rownames(counts)<-counts[,1] 18 | counts<-counts[,-1] 19 | 20 | head(counts) 21 | # FZX-DY1.genes.results FZX-DY2.genes.results FZX-DY3.genes.results 22 | # Unigene_0 4.00 13.00 1.00 23 | # Unigene_1 2740.00 3008.00 1993.00 24 | # Unigene_10 841.00 996.00 815.00 25 | # Unigene_100 108.88 91.22 43.03 26 | # Unigene_1000 1123.55 1490.51 1033.10 27 | # Unigene_10000 0.00 20.43 0.00 28 | 29 | 30 | # 修饰列名 31 | colnames(counts)<-gsub(".genes.results","",colnames(counts)) 32 | head(counts) 33 | # FZX-DY1 FZX-DY2 FZX-DY3 FZX-leaves1 FZX-leaves2 FZX-leaves3 34 | # Unigene_0 4.00 13.00 1.00 8.00 19.00 13.00 35 | # Unigene_1 2740.00 3008.00 1993.00 3969.00 2693.00 3197.00 36 | # Unigene_10 841.00 996.00 815.00 342.00 303.00 438.00 37 | # Unigene_100 108.88 91.22 43.03 1088.57 723.97 611.06 38 | # Unigene_1000 1123.55 1490.51 1033.10 852.18 608.65 846.42 39 | # Unigene_10000 0.00 20.43 0.00 0.00 0.00 0.00 40 | # ZNX-DY1 ZNX-DY2 ZNX-DY3 ZNX-leaves1 ZNX-leaves2 ZNX-leaves3 41 | # Unigene_0 7.00 11.00 7.00 14.00 3394.00 10.00 42 | # Unigene_1 1606.00 2533.00 1801.00 3179.00 718.00 3398.00 43 | # Unigene_10 539.00 428.00 531.00 404.00 198.00 347.00 44 | # Unigene_100 17.63 4.36 0.00 0.00 0.00 27.17 45 | # Unigene_1000 938.31 1026.63 1013.52 721.19 23.70 757.79 46 | # Unigene_10000 101.19 72.66 128.27 573.59 219.28 322.41 47 | 48 | 49 | # 50 | # 载入响应的R包 51 | library(DESeq2) 52 | 53 | # 整理对象 54 | # 
必须先转换为DESeq对象 55 | colData<-data.frame(colnames(counts),rep("pair-end",length(colnames(counts)))) 56 | names(colData)<-c("sample","type") 57 | # !!!!!!!!!!!!!!! 这里要视具体样本情况而定 58 | # 去掉末尾的数值,取得样本条件 59 | condition<-rep(unique(gsub("\\d+$","",colData$sample,perl=T)),each=3) 60 | condition 61 | # [1] "FZX-DY" "FZX-DY" "FZX-DY" "FZX-leaves" "FZX-leaves" 62 | # [6] "FZX-leaves" "ZNX-DY" "ZNX-DY" "ZNX-DY" "ZNX-leaves" 63 | # [11] "ZNX-leaves" "ZNX-leaves" 64 | colData<-data.frame(colData,condition) 65 | colData 66 | # sample type condition 67 | # 1 FZX-DY1 pair-end FZX-DY 68 | # 2 FZX-DY2 pair-end FZX-DY 69 | # 3 FZX-DY3 pair-end FZX-DY 70 | # 4 FZX-leaves1 pair-end FZX-leaves 71 | # 5 FZX-leaves2 pair-end FZX-leaves 72 | # 6 FZX-leaves3 pair-end FZX-leaves 73 | # 7 ZNX-DY1 pair-end ZNX-DY 74 | # 8 ZNX-DY2 pair-end ZNX-DY 75 | # 9 ZNX-DY3 pair-end ZNX-DY 76 | # 10 ZNX-leaves1 pair-end ZNX-leaves 77 | # 11 ZNX-leaves2 pair-end ZNX-leaves 78 | # 12 ZNX-leaves3 pair-end ZNX-leaves 79 | 80 | # 使用循环,针对每个condition进行差异表达分析 81 | 82 | getPaitList<-function(vector){ 83 | list<-c() 84 | for(i in 1:(length(vector)-1)){ 85 | for(j in (i+1):(length(vector))){ 86 | list<-rbind(list,c(i,j)) 87 | } 88 | } 89 | list 90 | } 91 | 92 | uniqCondition<-unique(condition) 93 | Pairindex<-getPaitList(uniqCondition) 94 | 95 | for(i in 1:nrow(Pairindex)){ 96 | # 获取所有对应index组合 97 | j<-Pairindex[i,][1] 98 | k<-Pairindex[i,][2] 99 | print(paste(uniqCondition[j],uniqCondition[k],sep=" vs ")) 100 | 101 | sample_1 = uniqCondition[j] 102 | sample_2 = uniqCondition[k] 103 | 104 | curColData<-colData[colData$condition==sample_1 | colData$condition==sample_2,] 105 | colnames(curColData)<-c("sample","type","curCondition") 106 | row.names(curColData)<-curColData$sample 107 | # curCountData<-counts[,which(colnames(counts) %in% curColData$sample)] # 不能用=号,因为会缺失 108 | curCountData<-counts[,which(colnames(counts) %in% curColData$sample)] 109 | curCondition<-rep(c(sample_1,sample_2),each=3) 110 | 111 | dds <- 
DESeqDataSetFromMatrix(countData = ceiling(curCountData), 112 | colData = curColData, 113 | design =~ curCondition) # factor levels were dropped which had no samples 不影响,因为没有自行去除而已 114 | # 过滤掉没表达或者几乎不表达的 115 | # 所有counts 加和 不足10 的 过滤掉 116 | dds <- dds[rowSums(counts(dds)) > 10,] 117 | dds 118 | # 开始做差异表达分析 119 | dds <- DESeq(dds) 120 | res <- results(dds) 121 | resOrdered <- res[order(res$padj),] 122 | summary(resOrdered) 123 | # 查看有显著差异的基因的个数 124 | print(sum(resOrdered$padj < 0.01 & abs(resOrdered$log2FoldChange)>1, na.rm=TRUE)) 125 | # 126 | 127 | # 输出MAplot... 128 | pdf(paste0(sample_1,"_vs_",sample_2,"_MAplot.pdf")) 129 | # plotMA(res, main="DESeq2", ylim=c(-2,2)) # 不同处理之间差异太大,大量超过 -2 2 这个foldchange 范围 130 | plotMA(res, main=paste0(sample_1,"_vs_",sample_2,("_MAplot"), ylim=c(-10,10))) 131 | dev.off() 132 | 133 | # 保存结果 134 | write.table(as.data.frame(resOrdered), 135 | file=paste0(sample_1,"_vs_",sample_2,"_results.all.xls"),quote=F,sep="\t") 136 | # 保存符合规定的结果 137 | write.table(as.data.frame(resOrdered[(!is.na(resOrdered$padj)) & resOrdered$padj < 0.01 & abs(resOrdered$log2FoldChange)>1,]), 138 | file=paste0(sample_1,"_vs_",sample_2,"_results.padj01.logFC2.xls"),quote=F,sep="\t") 139 | 140 | } 141 | 142 | ######### 143 | # 差异表达分析结束,输出一些信息 144 | ########## 145 | 146 | 147 | row.names(colData)<-colData$sample 148 | dds <- DESeqDataSetFromMatrix(countData = ceiling(counts), 149 | colData = colData, 150 | design =~ condition) 151 | # # 无重复的条件下...都是零 152 | # summary(res) 153 | # sum(res$padj < 0.1, na.rm=TRUE) 154 | # # 调整adj-pvalue 155 | # res05 <- results(dds, alpha=0.05) 156 | # summary(res05) 157 | # sum(res05$padj < 0.05, na.rm=TRUE) 158 | # # 159 | vsd <- varianceStabilizingTransformation(dds) 160 | 161 | library("pheatmap") 162 | sampleDists <- dist(t(assay(vsd))) 163 | library("RColorBrewer") 164 | sampleDistMatrix <- as.matrix(sampleDists) 165 | # rownames(sampleDistMatrix) <- paste(vsd$curCondition, vsd$type, sep="-") 166 | rownames(sampleDistMatrix) <- 
colnames(vsd) 167 | colnames(sampleDistMatrix) <- NULL 168 | colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) 169 | pheatmap(file="sample-vsd-counts-cor.heatmap.pdf",sampleDistMatrix, 170 | clustering_distance_rows=sampleDists, 171 | clustering_distance_cols=sampleDists, 172 | col=colors) 173 | 174 | ############ 175 | # PCA分析 176 | ############### 177 | # plotPCA(vsd, intgroup=c("condition", "type")) 178 | # dev.off() 179 | # 180 | 181 | # data <- plotPCA(vsd, intgroup=c("condition", "type"), returnData=TRUE) 182 | library(ggplot2) 183 | data <- plotPCA(vsd, intgroup=c("condition"), returnData=TRUE) 184 | percentVar <- round(100 * attr(data, "percentVar")) 185 | 186 | ggplot(data, aes(PC1, PC2, color=condition)) + 187 | geom_point(size=3) + 188 | xlab(paste0("PC1: ",percentVar[1],"% variance")) + 189 | ylab(paste0("PC2: ",percentVar[2],"% variance")) 190 | # 191 | ggsave("sample-vsd-counts-PCA.pdf") -------------------------------------------------------------------------------- /BioScripts/Rscript/Draw_TBtools_KEGG_Pathway_Enrichment_Graph.r: -------------------------------------------------------------------------------- 1 | initial.options <- commandArgs(trailingOnly = FALSE) 2 | file.arg.name <- "--file=" 3 | script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) 4 | script.basename <- dirname(script.name) 5 | other.name <- paste(sep="/", script.basename, script.name) 6 | 7 | 8 | argv <- commandArgs(TRUE) 9 | 10 | if(length(argv)<1){ 11 | print("[Usage]:",quote=F) 12 | print(paste0(" Rscript ",other.name," enrichment.txt"),quote=F) 13 | q(save="no") 14 | } 15 | 16 | KeggEnrichment <- argv[1] 17 | enrichmentInfo <- read.delim(file=KeggEnrichment,header=T); 18 | head(enrichmentInfo) 19 | library(ggplot2) 20 | enrichmentInfoMod<-enrichmentInfo[enrichmentInfo$p.value<=5e-2 & enrichmentInfo$GeneHitsInSelectedSet>=5,] 21 | # enrichmentInfoMod<-enrichmentInfo[enrichmentInfo$p_adjust<=5e-2 & 
enrichmentInfo$seletedWhiteBalls>=5,] 22 | # enrichmentInfoMod<-enrichmentInfo 23 | if(dim(enrichmentInfoMod)[1]==0) quit(save="no") 24 | 25 | head(enrichmentInfoMod) 26 | 27 | enrichmentInfoMod$Term.Name<-gsub("^\\s*\\d{5}\\s+","",enrichmentInfoMod$Term.Name,perl=T) 28 | 29 | 30 | # 按照EnrichementFactor排序 31 | enrichmentInfoMod<-enrichmentInfoMod[order(enrichmentInfoMod$enrichFactor,decreasing=F),] 32 | enrichmentInfoMod$Term.Name<-factor(enrichmentInfoMod$Term.Name,unique(as.character(enrichmentInfoMod$Term.Name))) 33 | 34 | p<-ggplot(enrichmentInfoMod) 35 | # 因为q值可能直接不显著 36 | q<-p+geom_point(aes(x=enrichFactor,y=Term.Name,size=GeneHitsInSelectedSet,color=p.value))+scale_colour_gradient(low="red",high="blue")+scale_size_area()+theme_bw()+ 37 | # q<-p+geom_point(aes(x=EnrichmentFactor,y=Term.Name,size=NumberOfGeneInSelectedSet,color=p_adjust))+scale_colour_gradient(low="red",high="blue") +theme_bw()+ 38 | theme( 39 | plot.title=element_text(face="bold",vjust=1.0), 40 | axis.title.x=element_text(face="bold",vjust=-0.2), 41 | axis.title.y=element_text(face="bold"), 42 | axis.text.y=element_text(hjust=1.0,colour="black"), 43 | axis.text.x=element_text(angle=0,colour="black") 44 | )+ggtitle("Statistics of KEGG Pathway Enrichment")+ 45 | ylab("Pathway")+ 46 | xlab("Enrichment Factor"); 47 | q 48 | tiff(filename = paste0(KeggEnrichment,".KEGGPathwayEnrichment.tiff"),res=300,width=(666*4.17),height=(615*4.17),compression="lzw"); 49 | q 50 | dev.off() 51 | ggsave(paste0(KeggEnrichment,".KEGGPathwayEnrichment.pdf"),width=8.5,height=11) -------------------------------------------------------------------------------- /BioScripts/Rscript/Revigo本地可视化代码.r: -------------------------------------------------------------------------------- 1 | library(ggrepel) 2 | 3 | p1 <- ggplot( data = one.data ); 4 | p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = plot_size), alpha = I(0.6) ) + scale_size_area(); 5 | p1 <- p1 + scale_colour_gradientn( colours = 
c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) ); 6 | p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = plot_size), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area(); 7 | p1 <- p1 + scale_size( range=c(5, 30)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) ); 8 | ex <- one.data [ one.data$dispensability < 0.15 9 | | grepl("meiotic",one.data$description) 10 | | grepl("flower",one.data$description) 11 | | grepl("floral",one.data$description) 12 | | grepl("ovary",one.data$description) 13 | | grepl("ovule",one.data$description) 14 | | grepl("pollen",one.data$description) 15 | | grepl("carpel",one.data$description) 16 | | grepl("ethylene",one.data$description) 17 | | grepl("siRNA",one.data$description) 18 | | grepl("pistil",one.data$description) 19 | | grepl("stamen",one.data$description) 20 | | grepl("androecium",one.data$description) 21 | | grepl("anther",one.data$description) 22 | ,]; 23 | ex <- ex [ !grepl("animal",ex$description) 24 | & !grepl("muscle",ex$description) 25 | & !grepl("BMP",ex$description) 26 | & !grepl("cancer",ex$description) 27 | & !grepl("liver",ex$description) 28 | & !grepl("MHC",ex$description) 29 | & !grepl("T cell",ex$description) 30 | & !grepl("renal system",ex$description) 31 | ,]; 32 | # ADDDDDDDDDDDDDDDDDDD 33 | p1 <- p1 + geom_text_repel( data = ex, aes(plot_X, plot_Y, label = description), colour = I(alpha("red", 0.85)), size = 5 ); 34 | p1 <- p1 + labs (y = "semantic space x", x = "semantic space y"); 35 | p1 <- p1 + theme(legend.key = element_blank()) ; 36 | one.x_range = max(one.data$plot_X) - min(one.data$plot_X); 37 | one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y); 38 | p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10); 39 | p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10); 40 | 41 | 42 | 43 | # 
-------------------------------------------------------------------------- 44 | # Output the plot to screen 45 | 46 | p1; 47 | ggsave("white.GO.enrichment.pdf",w=11,h=9) -------------------------------------------------------------------------------- /BioScripts/Rscript/R语言物种分布.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJ-Chen/TBtools-Manual/7c582a84ad6c1bfd65d31f4368c8dd6802743aef/BioScripts/Rscript/R语言物种分布.txt -------------------------------------------------------------------------------- /BioScripts/Rscript/pheatmap缺失值上热图.txt: -------------------------------------------------------------------------------- 1 | CJ 13:41:17 2 | 3 | library(pheatmap) 4 | expZhao<-read.table(file="clipboard",header=T,row.names = 1) 5 | expZhao<-t(scale(t(expZhao))) 6 | expZhao[is.na(expZhao)]<-0 7 | pheatmap(expZhao,scale="none") -------------------------------------------------------------------------------- /BioScripts/Rscript/合并展示多组富集结构.sh: -------------------------------------------------------------------------------- 1 | ########## 2 | # 做kegg富集分析 3 | ########## 4 | 5 | for file in Module*.txt;do 6 | java -cp /C:/Users/CJ/Desktop/TBtools_JRE1.6.jar biocjava.bioDoer.Kegg.AdvancedForEnrichment.KeggEnrichment --inKegRef TBtools.Plant.20180203.KeggBackEnd --Kannotation gene2ko --selectedSet $file --outFile $file.Pathway.EnrichMent.xls 7 | done 8 | # 9 | perl -F'\t' -lane 'print $F[0] unless $F[5]>0.05 or $F[1]<5 or $seen{$F[0]}++' ModuleGID_*.Pathway.EnrichMent.xls > Show.Term 10 | # 11 | 12 | for file in ModuleGID_*.Pathway.EnrichMent.xls;do 13 | perl -F'\t' -lane 'print join qq{\t},@F[0,5] if $seen{$F[0]}++' Show.Term $file > $file.selected 14 | done 15 | 16 | wc -l *.selected 17 | 23 ModuleGID_cyan.txt.Pathway.EnrichMent.xls.selected 18 | 26 ModuleGID_green.txt.Pathway.EnrichMent.xls.selected 19 | 26 ModuleGID_greenyellow.txt.Pathway.EnrichMent.xls.selected 20 | 25 
ModuleGID_lightcyan.txt.Pathway.EnrichMent.xls.selected 21 | 100 total 22 | 23 | # 合并表格的时候 24 | perl -e '@AllFile=@ARGV;print qq{GeneID\t};print qq{$_\t} for @AllFile;print qq{\n};while(){chomp;${$ARGV}{(split qq{\t},$_)[0]}=(split qq{\t},$_)[1];$uniqID{(split qq{\t},$_)[0]}++};for $id (keys %uniqID){print qq{$id\t};for(@AllFile){if(${$_}{$id}){print ${$_}{$id}}else{print qq{0}};print qq{\t}};print "\n"}' *.xls.selected|perl -pe 's/\t$//'|perl -pe 's/ModuleGID_(.*?).txt.Pathway.EnrichMent.xls.selected/$1/g' > Merged.Qvalue.txt 25 | -------------------------------------------------------------------------------- /BioScripts/Rscript/基于RSEM结果直接调用DEseq进行差异表达分析.r: -------------------------------------------------------------------------------- 1 | #### 2 | # 加载所需的R包 3 | ####### 4 | # source("http://bioconductor.org/biocLite.R") 5 | # biocLite("edgeR") 6 | # biocLite("DESeq") 7 | library(DESeq) 8 | library(edgeR) 9 | 10 | 11 | library(ggplot2) # 绘制火山图 12 | 13 | 14 | # 设定工作目录 -- 随后所有的文件从这个 15 | setwd(".") 16 | ######## 17 | # 基因层面 18 | ########## 19 | targetFileLists<-list.files(pattern = "*.genes.results") # 这个模式可以设定为1个参数 20 | targetFileLists 21 | # merged_gene_Counts 22 | # counts 23 | counts<-read.table(targetFileLists[1],header=T)[,1]; 24 | for(i in 1:length(targetFileLists)){ 25 | counts<-data.frame(counts,read.table(targetFileLists[i],header=T,sep="\t")[,5]); 26 | } 27 | names(counts)<-c("ID",targetFileLists) # 这里可以设定为1个参数 28 | rownames(counts)<-counts[,1] 29 | counts<-counts[,-1] 30 | 31 | head(counts) 32 | 33 | keep <- rowSums(cpm(counts)>1) >= 1 34 | counts <- counts[keep,]+1 35 | # conds <- c("Control","Case1","Case2") 36 | conds<-colnames(counts) 37 | # 由于edgeR要求counts是整数,所以对齐进行向上取整,如何取整看心情 38 | cds <- newCountDataSet(ceiling(counts), conds) 39 | 40 | getPaitList<-function(vector){ 41 | list<-c() 42 | for(i in 1:(length(vector)-1)){ 43 | for(j in (i+1):(length(vector))){ 44 | list<-rbind(list,c(i,j)) 45 | } 46 | } 47 | list 48 | } 49 | c1<-c("a","b","c") 50 | # 
getPaitList(c1) 51 | 52 | 53 | # 直接使用变量连接,之后可以形成pipelines 54 | Pairindex<-getPaitList(colnames(counts)) 55 | for(i in 1:nrow(Pairindex)){ 56 | # 获取所有对应index组合 57 | j<-Pairindex[i,][1] 58 | k<-Pairindex[i,][2] 59 | print(paste(j,k)) 60 | 61 | 62 | # 这个图没什么用 63 | cds1 = cds[,c(j,k)] 64 | cds1 <- estimateSizeFactors(cds1) 65 | cds1 <- estimateDispersions(cds1, method="blind", sharingMode="fit-only",fitType="local") 66 | pdf(file=paste0(colnames(counts)[j],"_",colnames(counts)[k],"_Dispersion.pdf")) 67 | plotDispEsts(cds1) 68 | dev.off() 69 | 70 | res1 <- nbinomTest(cds1, colnames(counts)[j], colnames(counts)[k]) 71 | r1 <- res1[!is.na(res1$padj),] 72 | r1<-(r1[order(r1[,7],decreasing=F),]) 73 | write.table(r1,file=paste0(colnames(counts)[j],"_",colnames(counts)[k],"_DEbyDESeqAll.xls"),row.names=F,sep="\t",quote=FALSE) 74 | 75 | 76 | 77 | vocanoP<-(data.frame(r1$log2FoldChange,log2(r1$pval))) 78 | names(vocanoP)<-c("log2FoldChange","log2Pvalue") 79 | 80 | p<-ggplot(vocanoP)+geom_point(aes(x=log2FoldChange,y=abs(log2Pvalue),color=!(abs(log2FoldChange)>=1& log2Pvalue<=log2(0.05))),size=1)+theme_bw()+ 81 | theme( 82 | legend.position = "none", 83 | plot.background = element_blank(), 84 | # axis.line = element_line(color = 'black'), 85 | panel.border = element_rect(linetype = "solid", color="black"), 86 | # panel.border = element_blank(), 87 | panel.grid.major = element_blank(), 88 | panel.grid.minor = element_blank() 89 | ) 90 | ggsave(p,file=paste0(colnames(counts)[j],"_",colnames(counts)[k],"_vocanoPlot.pdf")) 91 | #dev.off() 92 | 93 | 94 | pdf(file=paste0(colnames(counts)[j],"_",colnames(counts)[k],"_MAPlot.pdf")) 95 | DESeq::plotMA(res1) 96 | dev.off() 97 | # 一般无重复进行BH矫正,会过分严格,干脆不矫正,直接限制pvalue 98 | r1 <- r1[abs(r1$log2FoldChange)>1 & r1$pval<0.05,] 99 | write.table(r1,file=paste0(colnames(counts)[j],"_",colnames(counts)[k],"_DEbyDESeq_log2FC1_pval0.05.xls"),row.names=F,sep="\t",quote=FALSE) 100 | 101 | 102 | } 103 | 104 | 105 | 106 | 107 | # res1 <- nbinomTest(cds1, 
colnames(counts)[j], colnames(counts)[k]) 108 | # r1 <- res1[!is.na(res1$padj),] 109 | # r1<-(r1[order(r1[,7],decreasing=F),]) 110 | # write.table(r1,file=paste0(colnames(counts)[j],"_",colnames(counts)[k],"_DEbyDESeqAll.xls"),row.names=F,sep="\t",quote=FALSE) 111 | 112 | 113 | # dev.off() 114 | 115 | 116 | -------------------------------------------------------------------------------- /BioScripts/Rscript/绘制GO富集柱形图R码: -------------------------------------------------------------------------------- 1 | 2 | #### 3 | # 绘制GO富集的柱形图 4 | ### 5 | 6 | perl -F'\t' -lane '$name=$ARGV;$name=~s/.*txt_(.*?)_.*/$1/;print qq{$1\t$_} if $F[5]>=5 and $F[3]<=0.05' Up.A_vs_B_Up.and.A_vs_C_Up.and.A_vs_D_Up.common.797.txt_*_EnrichResult.xls.sorted.padjust.xls > BCD.tab 7 | 8 | # 9 | 10 | setwd("C:\\Users\\CJ\\Desktop\\其他人的项目\\王秀荣\\最终GO数据") 11 | 12 | tbtoolsCounts<-read.delim("BCD.tab",header=F) 13 | library(ggplot2) 14 | head(tbtoolsCounts) 15 | 16 | tbtoolsCounts$V5<-(-log(tbtoolsCounts$V5,10)) 17 | # tbtoolsCounts$V2<-substr(tbtoolsCounts$V2,0,60) 18 | 19 | # GeneCount<-11578 # 所有有GO注释的基因个数 20 | # GeneCount<-tbtoolsCounts$TotalGeneAnnotated[1] # 所有有GO注释的基因个数 21 | # 先对Counts进行一次排序 22 | 23 | tbtoolsCounts<-tbtoolsCounts[order(tbtoolsCounts$V5,decreasing=F),] 24 | tbtoolsCounts<-tbtoolsCounts[order(tbtoolsCounts$V1,decreasing=T),] 25 | 26 | 27 | # tbtoolsCounts<-tbtoolsCounts[order(tbtoolsCounts$Class,tbtoolsCounts$Counts,decreasing=F),] 28 | str(tbtoolsCounts) 29 | tbtoolsCounts$V2<-factor(tbtoolsCounts$V2,unique(as.character(tbtoolsCounts$V2))) 30 | names(table(tbtoolsCounts$V1)) 31 | 32 | 33 | 34 | ########### 35 | # 横向图 36 | ######### 37 | library(ggsci) 38 | p<-ggplot(tbtoolsCounts) 39 | p+geom_bar(aes(x=V2,y=V5,fill=V1),stat="identity")+ 40 | # geom_text(aes(x=Description,y=(Counts+max(tbtoolsCounts$Counts)*0.06)/GeneCount*100,label=Counts))+ 41 | # ylim(0,max(tbtoolsCounts$Counts)*1.08)+ 42 | # 
geom_line(aes(x=Description,group=1,color=Class),size=5,y=max(tbtoolsCounts$Counts*1.2))+ 43 | 44 | #geom_text(aes(x=sum(tbtoolsCounts$Class==Class)/4*1,label=Class),y=max(tbtoolsCounts$Counts*1.2))+ 45 | geom_text(aes(x=V2,y=(V5+max(tbtoolsCounts$V5)*0.08),label=round(V5,2)))+ 46 | 47 | coord_flip()+theme_bw()+theme( 48 | # legend.position="none", 49 | # axis.text.x = element_text(size=10), 50 | # axis.line=element_line(size=0.5,linetype="dashed"), 51 | # axis.ticks=element_blank(), 52 | legend.position = c(.7, .1), 53 | panel.grid.major=element_blank() , 54 | panel.grid.minor=element_blank() , 55 | panel.background=element_blank() # , 56 | # panel.border=element_rect(linetype="dashed") 57 | )+ylab("-log10(Fisher'exact test p value)")+xlab("Gene Ontology Term")+scale_fill_npg() 58 | ggsave("GO.pdf") 59 | 60 | -------------------------------------------------------------------------------- /BioScripts/ShellScript/OneKP数据下载: -------------------------------------------------------------------------------- 1 | ############# 2 | # OneKP数据下载 3 | ############### 4 | cd /home/XiaLab/OneKP 5 | # 6 | . 
7 | ├── 1kP-Sample-List.csv # http://www.onekp.com/samples/list.php 底部下载 8 | └── view-source_www.onekp.com_public_data.html # http://www.onekp.com/public_data.html 查看网页源码 9 | 10 | # 11 | wc -l 1kP-Sample-List.csv 12 | # 1329 1kP-Sample-List.csv 13 | 14 | 15 | perl -lne 'print for map {/"(http:.*?)"/g} $_' view-source_www.onekp.com_public_data.html|perl -pe 's/http:\/\/206.12.96.204\/onekp\/(\w{4}).*/$1/'|uniq|sort|uniq|wc -l 16 | # 1441 17 | 18 | # 也就是实际数据数目比表格中的还多 19 | perl -lne 'print for map {/"(http:.*?)"/g} $_' ../view-source_www.onekp.com_public_data.html > Download.links 20 | 21 | # 22 | for link in `cat Download.links`;do echo $link;aria2c "$link";done -------------------------------------------------------------------------------- /BioScripts/ShellScript/smallRNAadapterRM.sh: -------------------------------------------------------------------------------- 1 | # 查看数据 2 | tree . 3 | # 4 | . 5 | ├── WT-12h-rep1.fq.gz 6 | ├── WT-12h-rep2.fq.gz 7 | ├── WT-1h-rep2.fq.gz 8 | ├── WT-1h-rep3.fq.gz 9 | ├── WT-6h-rep1.fq.gz 10 | └── WT-6h-rep2.fq.gz 11 | # 将fq.gz 转换为 fa文件 12 | # 对数据进行质控 13 | head -n 100000 WT-1h-rep2.fq|perl -lane 'print if $.%4==2'|sort|uniq -c|sort -n|tail|perl -lane 'print qq{>},$count++,qq{_},$F[0],qq{\n},$F[1]'|muscle -clw -quiet 14 | # 15 | WT-12h-rep1.fq CTGTAGGCAC 16 | WT-12h-rep2.fq CTGTAGGCAC 17 | WT-1h-rep2.fq CTGTAGGCAC 18 | ... 
19 | # 人工查看,几乎都一样 20 | 21 | ls *.fq|parallel -j 10 'echo -ne {}"\t";head -n 40000 {}|grep -c CTGTAGGCAC' 22 | 23 | # WT-12h-rep1.fq 9793 24 | # WT-12h-rep2.fq 9702 25 | # WT-1h-rep2.fq 9809 26 | # WT-1h-rep3.fq 9803 27 | # WT-6h-rep1.fq 9780 28 | # WT-6h-rep2.fq 9799 29 | 30 | # head *.fq 31 | # 查看质量值之后发现,都是phred+33的,可以直接转换成fa格式 32 | 33 | # 转换成fa 34 | ls *.fq|parallel -j 20 "fastq_to_fasta -v -r -i {} -o {=s/.fq$//=}.fa -Q 33 1>{=s/.fq$//=}.fq2fa.log 2>{=s/.fq$//=}.fq2fa.error" 35 | 36 | # 由于接头是一致的,直接去除接头 37 | ls *.fa|parallel -j 20 'fastx_clipper -v -c -l 15 -a "CTGTAGGCAC" -i {} -o {=s/.fa$//=}.trimmed.fa 1>{=s/.fa$//=}.faClip.log 2>{=s/.fa$//=}.faClip.error' 38 | # 只是回帖 fastq文件,所以不需要 collaspe 39 | ls *.trimmed.fa|parallel -j 18 'fastx_collapser -i {} -o {=s/.trimmed.fa//=}.mc.fasta 1>{=s/.trimmed.fa//=}.collapse.log 2>{=s/.trimmed.fa//=}.collapse.error' 40 | 41 | -------------------------------------------------------------------------------- /BioScripts/ShellScript/使用genBlast从所有植物基因组中提取目标基因家族成员.sh: -------------------------------------------------------------------------------- 1 | ############## 2 | # 使用genBlast从所有植物基因组中提取目标基因家族成员 3 | ############## 4 | if [ ! -n "$1" ] || [ ! -s $1 ];then 5 | echo "Please provide a valid query protein set [sh $0 query.pro.fa]" 6 | exit 7 | fi 8 | # 9 | # Dicer.fa 为查询的基因家族的 蛋白序列文件 , fasta格式 10 | # 11 | targetFa=$1 12 | cp -r /tools/genBlast/genBlast_v138_linux_x86_64/* . 
13 | for file in /data/data2/XiaLab/allPlantGenome/*.fna;do ln -s $file;done 14 | # 15 | for genome in *.fna;do 16 | ./genblast_v138_linux_x86_64 -p genblastg -q $targetFa -t $genome -e 1e-2 -g T -f F -a 0.5 -d 100000 -r 10 -c 0.5 -s 0 -i 15 -x 20 -n 20 -v 2 -h 0 -j 3 -gff -cdna -pro -o $genome 17 | # 18 | perl -lane 'next unless $F[2] eq qq{transcript};($chr,$start,$end,$score,$strand,$_)=(@F[0,3,4,5,6],$_);push @pre,[$chr,$start,$end,$score,$strand,$_];END{$count=0;for $cur (sort {$a->[0] cmp $b->[0]||$a->[1]<=>$b->[1]} @pre){if(grep {$cur->[0] eq $_->[0] and $cur->[4] eq $_->[4] and !($_->[1]>$cur->[2]||$_->[2]<$cur->[1])} @{$group{$count}}){push @{$group{$count}},$cur}else{push @{$group{++$count}},$cur}}for $gc (sort keys %group){print join qq{\t},$gc,@{$_} for @{$group{$gc}}}}' ${genome}_1.1c_2.3_s2_tdshift0_tddis0_tcls3.0_m2_score_i0_d16_1.gff > $genome.group.gff 19 | # 20 | perl -e '$file=shift;$key=shift;$value=shift;open IN,$file;while(){chomp;@F=split /\t/,$_;$seen{$F[$key]}=$F[$value] if ($seen{$F[$key]}<$F[$value]);}seek IN,0,0;while(){chomp;@F=split /\t/,$_;print qq{$_\n} if ($F[$value]>=$seen{$F[$key]})}' $genome.group.gff 0 4 > $genome.maxScore.gff 21 | # 22 | cat $genome.maxScore.gff|perl -lane '($chr,$start,$end,$score,$strand,$ID)=($F[14]=~s/^ID=(.*?);.*$/$1/r,@F[1,2,3,4,5]);print join qq{\t},(($chr,$start,$end,$score,$strand,$ID))' > $genome.ids 23 | # 24 | perl -lne 'if($switch){if(/^>/){$flag=0;m/^>?(\S+).*?$/;$flag=1 if $need{$1};}print if $flag}else{m/^>?(\S+).*?$/;$need{$1}++}$switch=1 if eof(ARGV)' ${genome}.ids ${genome}_1.1c_2.3_s2_tdshift0_tddis0_tcls3.0_m2_score_i0_d16_1.pro > ${genome}.potential.$targetFa 25 | done -------------------------------------------------------------------------------- /BioScripts/UsageOfSomeCliSoftware/VARNA/Color.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CJ-Chen/TBtools-Manual/7c582a84ad6c1bfd65d31f4368c8dd6802743aef/BioScripts/UsageOfSomeCliSoftware/VARNA/Color.jpg -------------------------------------------------------------------------------- /BioScripts/UsageOfSomeCliSoftware/VARNA/default.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CJ-Chen/TBtools-Manual/7c582a84ad6c1bfd65d31f4368c8dd6802743aef/BioScripts/UsageOfSomeCliSoftware/VARNA/default.png -------------------------------------------------------------------------------- /BioScripts/UsageOfSomeCliSoftware/VARNA/命令行使用VARNA可视化miRNA结构.md: -------------------------------------------------------------------------------- 1 | # 命令行使用VARNA可视化miRNA结构 2 | 3 | ​ [VARNA](http://varna.lri.fr/index.php?lang=en&page=command&css=varna),非常强大的一个RNA结构可视化工具,同时支持GUI和command line 操作... 4 | 5 | ​ 因为GUI版本操作起来比较慢,也不方便,故整理命令行一例如下 6 | 7 | --- 8 | 9 | ​ 基本命令 10 | 11 | ```shell 12 | java -cp VARNAvX-Y.jar fr.orsay.lri.varna.applications.VARNAcmd 13 | [-i inputFile|-sequenceDBN XXX -structureDBN YYY] -o outFile [opts] 14 | ``` 15 | 16 | ​ 其中,各参数信息如下: 17 | 18 | ```shell 19 | inFile: An input file using one of the supported formats (Vienna, CT, BPSeq or RNAML). 20 | XXX: An RNA sequence. #是可以直接贴上RNAseq 21 | YYY: A well-parenthesized expression with dots whose size matches that of the input sequence. #是可以直接贴上Structure 22 | outFile: An output file whose format is guessed from the extension. 
#会依据 后缀 自动识别输出文件格式 23 | ``` 24 | 25 | ​ 调试命令 26 | 27 | ```shell 28 | # 如果在windows上直接运行这个命令,那么会弹出操作界面,没啥意义 29 | java -cp VARNAv3-93.jar fr.orsay.lri.varna.applications.VARNAcmd 30 | # 速度极其快,真的,秒出 31 | java -cp VARNAv3-93.jar fr.orsay.lri.varna.applications.VARNAcmd -sequenceDBN GAAAUAUUUGAGCAUUUGAGAGUUGUAUGUAAGAACUGGAAAAAUCCAAAUAGAUUAUUUUGUUAUUAAUGUAUUCUGUUUGGUUUCCUCCUGUAUCUUAUCUCCAACUUCUAAACUACAAAAAUUCCCUCCAGUCUUGUACUUUGAGCAAUCUGCCGAGAGUCACACCUUCCUCAGACUUGUACUUGAGUAACUCACCGA -structureDBN ((..((((((((..((((.((((((...((((((((.(((..((.(((((((((.(((.(((....))).))).))))))))).))..))).)).))))))...)))))).)))).................(((((((((((((..(((...)))..))))).)))........)))))....))))))))..))..... -o default.png -border "20x30" -resolution 10 32 | ``` 33 | 34 | ​ 基本结果 35 | 36 | ![](default.png ) 37 | 38 | 39 | 40 | 41 | 42 | ​ 设置某个区段的颜色(**miRNA的 mature seq 和 star seq**) 43 | 44 | ```shell 45 | # 我们知道miRNA mature对应的是79到100的bases 46 | perl -le "print join qq{,},79..100" 47 | 79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100 48 | # star链对应的是 30到50 49 | perl -le "print join qq{,},30..50" 50 | 30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50 51 | # 基于已知的序列 和 fold的结果,以及上面的位置,直接标记 52 | java -cp VARNAv3-93.jar fr.orsay.lri.varna.applications.VARNAcmd -sequenceDBN GAAAUAUUUGAGCAUUUGAGAGUUGUAUGUAAGAACUGGAAAAAUCCAAAUAGAUUAUUUUGUUAUUAAUGUAUUCUGUUUGGUUUCCUCCUGUAUCUUAUCUCCAACUUCUAAACUACAAAAAUUCCCUCCAGUCUUGUACUUUGAGCAAUCUGCCGAGAGUCACACCUUCCUCAGACUUGUACUUGAGUAACUCACCGA -structureDBN ((..((((((((..((((.((((((...((((((((.(((..((.(((((((((.(((.(((....))).))).))))))))).))..))).)).))))))...)))))).)))).................(((((((((((((..(((...)))..))))).)))........)))))....))))))))..))..... 
-basesStyle1 "fill=#FF0000,outline=#FF0000,label=#000000,number=#FF0000" -applyBasesStyle1on "79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100" -basesStyle2 "fill=#FF6600,outline=#FF6600,label=#FFFFFF,number=#FF6600" -applyBasesStyle2on "30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50" -o Color.jpg -border "20x30" -resolution 10 53 | # 可以直接得到图片,如果需要矢量图,那么修改 输出文件 后缀 为 .svg 即可 54 | # 也可以直接 55 | ``` 56 | 57 | ![](Color.jpg) 58 | 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 CJ-Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TBtools-Manual 2 | Start a project to make a useful and readable manual for TBtools. 3 | 4 | ### Functions:what TBtools can do? 5 | TBtools, short for **"Tools for Biologist"**, is a Toolset and also a project. 6 | From the very beginning, I just want to develop a Toolset, which can be useful for my self in Command LIne mode, and for my colleage in Graphics User Interface mode. 7 | However, some friends saw this work and said TBtools would also be useful to them. So, I post it on web. And consequently, more and more friends send me feature calls and more and more functions were added into TBtools. 8 | Thus, Tills now(2017/06/29), TBtools *as less* contains functions as bellowed: 9 | ##### In CLI Mode: 10 | 11 | To run TBtools in CLI mode, 12 | * if the running environment has *no* graphics device, like under terminal to server, just type `java -jar TBtools_vX.XX.jar`, and then all available tools will be shown. Copy the command for specific tools and use it. 13 | * if the running environment has graphics device, like under windows, mac or X-windows, just type `java -jar TBtools_vX.XX.jar anyString`... If no string after, then TBtools will try to run in GUI mode and in many cases, a TBtools main windows will shown. 14 | 15 | **List of Tools under CLI Mode** 16 | will add in the furture... 17 | 18 | 19 | ##### In GUI Mode: 20 | To runu TBtools in GUI mode, 21 | If the .jar file has been linked to java, then just **Double-click the .jar file**. the TBtools main window will show up. 22 | If user install TBtools from .exe file **under windows**, just run TBtools like many other software. 23 | If user want to run TBtools in GUI mode with useful debug information, just type `java -jar TBtools_vX.XX.jar debug`. 
24 | 25 | **List of Tools under GUI Mode** 26 | 27 | * Sequence Toolkits 28 | + Amazing Fasta **Extractor** 29 | Transformat Fasta file, make a Fasta-Index and extract Fasta Record in a **very** quick way. 30 | + Fasta **Stater** 31 | Stat a Fasta file, which can generate a summary information (record counts, total Len, N50, GC contents ...) of all Fasta record and a file contain sequence features (simplified ID, length, GC content ...) of each fasta record. 32 | + Fasta **Extractor** 33 | Directly extract Fasta record from fasta file, which might be slower than *Amazing Fasta Extractor*... This function will be decrepeted or updated... Recommand use **Amazing Fasta Extractor** instead. 34 | + Fasta **Subseq** 35 | Similar to *Fasta Extractor*, however, this function was developed for extract **Sequence Region** of specific Fasta records... This function will ne decrepeted or updated... Recommand Use **Amazing Fasta Extractor** instead. 36 | + Fasta **Merge and Split** 37 | Merge several fasta file into one fasta file Or Split one fasta file into several fasta files. 38 | + Sequence **Manipulator** 39 | Tranformat, Reverse, Complement of sequence 40 | + NCBI **Seq Downloader** 41 | Batch download Sequnces from NCBI according to GI or Accession Number list. 42 | + Get Completes **ORF**(Open Reading Frame) 43 | Predict complete ORF from input sequence. At present, this function only detect **complete ORF** and only the classic codon usage table. That is, a **complete ORF** here only refer to sequence region starts from *ATG* and ends with *TGA*,*TAA* and *TAG*. 44 | + Check **Primers**(Simple PCR) 45 | Directly check the primers sequence location to simply stat the specifity of input primer in the specific sequnce database, such as the transcriptome of one species. 46 | + GFF/GTF Sequence Extractor 47 | Extractor Sequences from genome according to gene structure annotation file(.gff/.gtf). 
48 | * Blast 49 | + **Remote** Blast(No Need for Preinstalled Blast) 50 | Conduct remote blast through NCBI Blast API...This funtion is *not stable* now, because *JRE1.6* doesn't support *SSL protocol* and I do not want to add third party library for this. Will be updated later. 51 | + Auto **Blast** Several Sequences To a Big File[Commonly Used] 52 | Invoke Blast *in the environment* to compare several sequences to a big in Fasta format. 53 | + Auto **Blast** Two Sequence Sets 54 | Like above. 55 | + Auto **Blast** Two Sequence Sets -Big File- 56 | Like above. *This three functions will be merged into one in the feature*. 57 | + **Blast** Several Seq To FastQ 58 | Build a blast database from input fastQ file and Blast several Seq onto it. 59 | + **Reciprocal Blast** 60 | Conduct reciprocal Blast between two input file in fasta format. 61 | + **Blast** XML Alignment Shower 62 | Make a alignment graph of blast result, which can be used to check the coverage of query sequence or subject sequence. 63 | + **Blast** XML Dotpot 64 | Make a dot plot graph of blast result. 65 | + **Blast** Pileup Grapher 66 | Make a pileup graph of blast result, which is similar to NCBI blast web serveive result page. 67 | + TransFormat **Blast**.xml to TBtools.table 68 | TBtools has definded a tab-seperated format to store and descript the blast result. This table contains some useful staticstic info, like **weighted-Cov**. 69 | + TransFormat **Blast**.xml to Blast Table 70 | This function just transformat the blast xml file to a tab-delimed file also the same as the default blast+ outfmt-6. 71 | + **e-GenomeWalkiing or e-Race** 72 | Use a sequnce to **FISH** ovelapping reads and assemble into a *long* sequence. This function might be useful to conduction Genome-walking on Re-sequencing data or 5'RACE or 3'RACE on RNAsequencing data *in silco*. 73 | 74 | * GO and KEGG 75 | + **GO** Annotation 76 | Conduct a gene-ontology annotation. 
This function is only a ID-mapping tools, which map the input GI,Accession Number of NCBI,Uniprot or Tremble to GO ID. 77 | + **GO** Enrichment 78 | Conduct gene-ontology term enrichment analysis basing on hypergeometrix distribution. 79 | + **GO** Level Counter 80 | Count gene number at specific GO level, output a statistics table and an optional graph. 81 | + **GO** Level Compare 82 | Compare two GO annotation result on specific GO level and output a graph. This function *do not* work well now. Will fix it. 83 | + **GO** Term Parser 84 | Parse GO annotation, from which user can know every gene2GO and GO2gene mapping information. 85 | + Prepare **GO** Annotation for BiNGO in cytoscape 86 | Transformat *many* gene-ontology annotation into a formated file for BiNGO. 87 | + **KEGG** Enrichment Analysis 88 | Conduct KEGG pathway analysis basing on hypergeometrix distribution. 89 | + **KEGG** Pathwat Map Drawer 90 | Hightlight genes on KEGG pathway map. 91 | * Other 92 | + **C**o**l**o**r** **P**i**c**k**e**r 93 | Pick color code freely and save color code graph as needed. 94 | + **Table** ID Manipulator 95 | Extract, Filter or Rank table *Row* 96 | + **Table** Column Manipulator 97 | Rank or keep only *Column* selected... Extract or Filter funstion **WILL BE ADDED**. 98 | + **Big Text** Viewer 99 | Take a glance of text file with very **Big Size** in a **very quick** way. 100 | + **Big Table** Viewer 101 | Take a glance of table file with very **Big Size** in a **very quick** way. 102 | + **Text Block** Extractor 103 | Extract Text Block with specific ID list and record seperate string. This function was developed for extracting synteny block. 104 | + **Expression** Shower 105 | Visulize expression trend of one gene or several gene. 106 | + **Expression** Calculator 107 | Calculate expression value (**RPKM** or **TPM** value) according gene.counts and gene.len 108 | + **Wonderful Venn** (Up to Six Sets) 109 | Conduct Venn analysis in an interative way. 
110 | + **Map** Genes On Genome From Sequence Files 111 | Conduct Blast to get the positon region of input gene on genome and then output a graph 112 | + **Map** Genes On Genome From Position Info File 113 | Draw a gene on genome file basing on input gene and genome info. 114 | + **Dual Synteny Plot** from MCScanX output 115 | Visulize result from MCScanX in a interactive way. 116 | + **Domain/Motif** Pattern Drawers 117 | Visulzie result from MEME suite, NCBI Batch-CD search, pfam-search or GFF/GTF. 118 | * About 119 | + About TBtools 120 | Show some information about TBtools. 121 | + Debug Dialog 122 | Show Debug dialog, which could be used for bugs tracking. 123 | + Resize JVM Memory 124 | Some functions need high memory. So, this function could be used to get a new TBtools Windows wich higher memory. It may be failed under some environment. 125 | 126 | 127 | ### Usage:how to use TBtools? 128 | A very very long way to go... 129 | 130 | 131 | 132 | ### FAQs:why TBtools fails? 133 | 1. Sequence extraction failed. 134 | Most of the time, if TBtools fail to extract sequences from fasta file, the reason should the *Input ID List* is not match in the *Subject Fasta File*. Test it. 135 | 2. Blast-related Functions failed. 136 | Thest function has been test for a many many times by many many friends, they should have been statble and robust for quite a long time. If TBtools faided in these function, please check whether the *blast* could be normally run in Command line environment. 137 | ... 138 | ### Bugs:where to report the bugs and help developing TBtools? 139 | There are many ways to *report bugs* and send *feature calls* for TBtools: 140 | - Send Me an email: ccj0410@gmail.com 141 | - Through QQ chatting group: TBools使用交流群(553679029) 142 | ... 
143 | ### To Updates: 144 | - [ ] Fasta Extractor 145 | - [ ] Fasta Subseq 146 | - [ ] Fasta Split 147 | 148 | ### To DO: 149 | - [ ] Seven set and Eight set Venn 150 | 151 | --------------------------------------------------------------------------------