├── workflow.png ├── mergebam.pl ├── README.md ├── qcreads.pl ├── align-end2end.pl ├── align-local.pl ├── qcbam.pl └── scBS-map.pl /workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wupengomics/scBS-map/HEAD/workflow.png -------------------------------------------------------------------------------- /mergebam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # scBSmap - mergebam.pl 4 | # 5 | # Copyright (C) Peng Wu 6 | # Contact: Peng Wu 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 
use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;

=pod

=head1 DESCRIPTION

    Merge alignments from end-to-end and local mapping

=head1 USAGE

    mergebam [options] -e <.end2end.bam> -l <.local.bam> -o <output.bam>

    Options:
    -e  File name of end2end alignment, .bam format.
    -l  File name of local alignment, .bam format.
    -o  Output file name, .bam format.
    -p  Number of threads. [default: 12].
    -s  Path to samtools eg. /home/user/bin/samtools
        - By default, we try to search samtools in system PATH.
    -h  Help message.

=head1 AUTHOR

    Contact: Peng Wu; wupeng1@ihcams.ac.cn
    Last update: 2018-10-24

=cut

## Parsing arguments from command line
my ($end2end, $local, $samtools, $out, $threads, $help);

GetOptions(
    'e:s'    => \$end2end,
    'l:s'    => \$local,
    's:s'    => \$samtools,
    'o:s'    => \$out,
    'p:i'    => \$threads,
    'h|help' => \$help
);

## Print usage
pod2usage( { -verbose => 2, -output => \*STDERR } ) if ( $help );
( $end2end and $local and $out ) or pod2usage();


## Set default
$samtools ||= `which samtools`;
chomp $samtools;
$samtools
    or pod2usage("samtools was not found in PATH; give its location with -s.");

$threads ||= 12;


## Step5. mergebam
# The samtools location is interpolated exactly as given; every other argument
# is single-quoted so file names with spaces or shell metacharacters survive.
my $merge_cmd = join ' ', $samtools, 'merge', '-@', shell_quote($threads), '-f',
    map { shell_quote($_) } ( $out, $end2end, $local );

`$merge_cmd`;    # stdout is captured and dropped, as the original did

# A failed merge must not pass silently: report it and exit non-zero.
if ( $? == -1 ) {
    die "Could not run '$merge_cmd': $!\n";
}
elsif ( $? != 0 ) {
    die "'$merge_cmd' failed with exit status " . ( $? >> 8 ) . "\n";
}

# Wrap VALUE in single quotes so the shell hands it to the program untouched.
sub shell_quote {
    my ($value) = @_;
    $value =~ s/'/'\\''/g;
    return "'$value'";
}

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scBS-map 2 | 3 | - **Description**: Single-cell Bisulfite Sequencing Data Mapping.
scBS-map includes 5 steps: 4 | 5 | 6 | - **Version**: 1.0.0 7 | 8 | - **System requirements** 9 | 10 | - Install [SAMtools](http://samtools.sourceforge.net/) 11 | - wget samtools-*.tar.bz2 12 | - tar -xjvf samtools-*.tar.bz2 13 | - cd samtools-* 14 | - ./configure --prefix=/samtools_install_path/ 15 | - make 16 | - make install 17 | - export PATH=/samtools_install_path/:$PATH 18 | 19 | - Install [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/) 20 | - wget bowtie2-*.zip 21 | - unzip bowtie2-*.zip 22 | - export PATH=/bowtie2_install_path/:$PATH 23 | 24 | - Install [BS-Seeker2](http://pellegrini-legacy.mcdb.ucla.edu/bs_seeker2/) 25 | - wget BSseeker2*.zip 26 | - unzip BSseeker2*.zip 27 | - export PATH=/BSseeker2_install_path/:$PATH 28 | 29 | - **Usage**: `perl scBS-map.pl [options] [-f <.fastq>] [-g <genome.fa>] [-o <output.bam>]` 30 | 31 | ``` 32 | -l Length of trimming bases from the 5' end of the read [default: 10]. 33 | -p Number of threads. [default: 12]. 34 | -s Path to samtools eg. /home/user/bin/samtools 35 | - By default, we try to search samtools in system PATH. 36 | -a Path to bs_seeker2-align eg. /home/user/bin/bs_seeker2-align.py 37 | - By default, we try to search bs_seeker2-align in system PATH. 38 | -b Path to bs_seeker2-build eg. /home/user/bin/bs_seeker2-build.py 39 | - By default, we try to search bs_seeker2-build in system PATH. 40 | -w Logical to determine if the genome index needs to be built or not [default: FALSE]. 41 | -n Length of removing microhomology regions from bam files [default: 10]. 42 | -k Logical to determine whether to keep temporary files [default: FALSE]. 43 | -f File name for sequencing reads, .fastq format. 44 | - a compressed file (.fastq.gz) is also supported. 45 | -g Genome file name, fasta format. 46 | -o Output file name, bam format. 47 | -h Help message.
48 | ``` 49 | 50 | - **Example**: `perl scBS-map.pl -l 9 -p 40 -n 10 -f Sample1.R1.fastq.gz -g hg38.fa -o Sample1.R1.bam` 51 | 52 | - **Output files**: 53 | 54 | - **.end2end.bam**: `Output file by end-to-end mode, .bam format.` 55 | 56 | - **.local.bam**: `Output file by local mode, .bam format.` 57 | 58 | - **.unaligned.fq**: `Unaligned reads, .fq format.` 59 | 60 | - **.multihits.fq**: `Multiple-hits reads, .fq format.` 61 | 62 | - **.out.bam**: `Final alignment file, .bam format` 63 | 64 | - **.scBS-map.report**: `Report for the alignment results.` 65 | ``` 66 | -------------------------------------------------- 67 | scBS-map report for test sample 68 | -------------------------------------------------- 69 | Number of reads: 10000 70 | Number of bases: 1312217 71 | Number of reads after quality control: 9852 72 | Number of bases after quality control: 1219137 73 | 74 | Number of mapped reads using the end-to-end mode: 2827 75 | Number of mapped bases using the end-to-end mode: 356285 76 | 77 | Number of mapped reads using the local mode: 1427 78 | Number of mapped bases using the local mode: 98277 79 | 80 | Number of unmapped reads: 4688 81 | Number of multi-hits reads: 910 82 | 83 | Number of mapped reads in total: 4254 84 | Mappability in total: 42.54% 85 | 86 | Number of mapped bases in total: 454562 87 | Mappability at base level in total: 34.64% 88 | -------------------------------------------------- 89 | ``` 90 | 91 | - **Note**: `scBS-map includes 5 subcommands. You can directly run the scBS-map.pl command for the entire pipeline or select one of the following subcommands according to your own needs.` 92 | 93 | - **Subcommands**: 94 | 95 | 1. **qcreads**: 96 | 97 | - **Description**: Trim low quality sequences. 98 | 99 | - **Usage**: `perl qcreads.pl [-f <.fastq>] [-l length] [-o output]` 100 | 101 | ``` 102 | -f FILE File name for sequencing data, fastq format. 103 | -l INT Length of removed bases from the 5' end of the read [default: 10].
104 | -o OUTFILE Output file name, .fastq.gz format. 105 | ``` 106 | 107 | - **Example**: `perl qcreads.pl -f Sample.R1.fastq.gz -l 10 -o Sample.R1.trim.fastq.gz` 108 | 109 | 2. **align-end2end**: 110 | 111 | - **Description**: Perform end-to-end alignment on clean reads. 112 | 113 | - **Usage**: `perl align-end2end.pl [-f input<.fastq>] [-g genome<.fa>] [-p threads] [-u unmappedreads] [-o output]` 114 | 115 | ``` 116 | -f FILE File name for clean data, fastq format. 117 | -g FILE Genome file name, fasta format. 118 | -p INT Number of launching threads [default: 12]. 119 | -u OUTFILE File name for unmapped reads if needed. 120 | -o OUTFILE Output file name, bam format. 121 | ``` 122 | 123 | - **Example**: `perl align-end2end.pl -f Sample.R1.trim.fastq.gz -g hg38.genome.fa -p 40 -u Sample.R1.unmapped.bam -o Sample.R1.end2end.bam` 124 | 125 | 3. **align-local**: 126 | 127 | - **Description**: Perform local alignment on clean reads. 128 | 129 | - **Usage**: `perl align-local.pl [-f input<.fastq>] [-g genome<.fa>] [-p threads] [-o output]` 130 | 131 | ``` 132 | -f FILE File name for clean data, fastq format. 133 | -g FILE Genome file name, fasta format. 134 | -p INT Number of launching threads [default: 12]. 135 | -o OUTFILE Output file name, bam format. 136 | ``` 137 | 138 | - **Example**: `perl align-local.pl -f Sample.R1.clean.fastq.gz -g hg38.genome.fa -p 40 -o Sample.R1.local.bam` 139 | 140 | 4. **qcbam**: 141 | 142 | - **Description**: Remove the low confidence alignments within microhomology regions 143 | 144 | - **Usage**: `perl qcbam.pl [-f ] [-n number] [-p threads] [-o ]` 145 | 146 | ``` 147 | -f FILE File name for local alignment, bam format. 148 | -n INT Number of trimming bases [default: 10] 149 | -p INT Number of launching threads [default: 12]. 150 | -o OUTFILE Output file name, bam format. 151 | ``` 152 | 153 | - **Example**: `perl qcbam.pl -f Sample.R1.local.bam -n 10 -o Sample.R1.local.hc.bam` 154 | 155 | 5. 
**mergebam**: 156 | 157 | - **Description**: Merge alignments from end-to-end and local mapping if available 158 | 159 | - **Usage**: `perl mergebam.pl [-e <.end2end.bam>] [-l <.local.bam>] [-p threads] [-o output]` 160 | 161 | ``` 162 | -e FILE File name of end2end alignment, .bam format. 163 | -l FILE File name of local alignment, .bam format. 164 | -p INT Number of launching threads [default: 12]. 165 | -o OUTFILE Output file name, bam format. 166 | ``` 167 | 168 | - **Example**: `perl mergebam.pl -e Sample.R1.end2end.bam -l Sample.R1.local.hc.bam -p 40 -o Sample.R1.merge.bam` 169 | 170 | - **Contact**: 171 | 172 | Peng Wu; wupeng1@ihcams.ac.cn 173 | -------------------------------------------------------------------------------- /qcreads.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # scBSmap - qcreads.pl 4 | # 5 | # Copyright (C) Peng Wu 6 | # Contact: Peng Wu 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;

=pod

=head1 DESCRIPTION

    Trim low quality sequences

=head1 USAGE

    qcreads [-l <length>] -f <.fastq> -o <.trim.fastq.gz>

    Options:
    -f  File name for sequencing reads, .fastq format.
        - a compressed file (.fastq.gz) is also supported
    -l  Length of trimming bases from the 5' end of the read [default: 10].
    -o  Output file name, .fastq.gz format.
    -h  Help message.

=head1 AUTHOR

    Contact: Peng Wu; wupeng1@ihcams.ac.cn
    Last update: 2018-10-24

=cut

## Parsing arguments from command line
my ($reads, $length, $out, $help);

GetOptions(
    'f:s'    => \$reads,
    'l:i'    => \$length,
    'o:s'    => \$out,
    'h|help' => \$help
);

## Print usage
pod2usage( { -verbose => 2, -output => \*STDERR } ) if ( $help );
# -l is not required here: it has a documented default that is applied below.
( $reads and $out ) or pod2usage();


## Set default
$length ||= 10;
$length >= 1 or pod2usage("-l must be a positive integer.");

# Reads shorter than this after trimming are dropped.
my $min_kept = 35;

# Number of leading characters actually removed from sequence and quality.
# NOTE(review): this is $length - 1, so "-l 10" removes 9 bases. The offset is
# kept exactly as it was so existing outputs stay reproducible; confirm
# whether removing $length bases was the intent.
my $skip = $length - 1;


## Step1. qcreads
my $in_fh;
if ( $reads =~ /\.gz$/ ) {
    # List form: the file name never passes through a shell.
    open( $in_fh, '-|', 'gzip', '-dc', $reads )
        or die "Cannot run gzip -dc on $reads: $!\n";
}
else {
    open( $in_fh, '<', $reads ) or die "Cannot open $reads: $!\n";
}

# The redirection needs a shell, so the output name is quoted explicitly.
open( my $out_fh, '|-', 'gzip -c > ' . shell_quote($out) )
    or die "Cannot start gzip for $out: $!\n";

while ( my $header = <$in_fh> ) {
    my $seq  = <$in_fh>;
    my $plus = <$in_fh>;
    my $qual = <$in_fh>;

    # A FASTQ record is four lines; stop cleanly on a truncated last record.
    if ( !defined $qual ) {
        warn "Ignoring incomplete FASTQ record at the end of $reads\n";
        last;
    }
    chomp( $header, $seq, $plus, $qual );

    # Same keep/drop rule as before: the trimmed read must have >= 35 bases.
    next if length($seq) - $skip < $min_kept;

    my $seq_trimmed  = substr( $seq,  $skip );
    my $qual_trimmed = substr( $qual, $skip );
    print {$out_fh} "$header\n$seq_trimmed\n$plus\n$qual_trimmed\n";
}

close $in_fh or warn "Problem while closing input $reads\n";
# Buffered data and gzip's exit status only surface when the pipe is closed.
close $out_fh or die "gzip failed while writing $out\n";

# Wrap VALUE in single quotes so the shell hands it to the program untouched.
sub shell_quote {
    my ($value) = @_;
    $value =~ s/'/'\\''/g;
    return "'$value'";
}

-------------------------------------------------------------------------------- /align-end2end.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # scBSmap - align-end2end.pl 4 | # 5 | # Copyright (C) Peng Wu 6 | # Contact: Peng Wu 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;

=pod

=head1 DESCRIPTION

    Perform end-to-end alignment on clean reads.

=head1 USAGE

    align-end2end [options] -f <.fastq> -g <genome.fa> -o <output.bam> -u <unmapped.fastq>

    Options:
    -f  File name for sequencing reads, .fastq format.
        - a compressed file (.fastq.gz) is also supported
    -g  Genome file name, fasta format.
    -o  Output file name, bam format.
    -u  File name for unmapped reads if needed.
    -p  Number of threads. [default: 12].
    -a  Path to bs_seeker2-align eg. /home/user/bin/bs_seeker2-align.py
        - By default, we try to search bs_seeker2-align in system PATH.
    -b  Path to bs_seeker2-build eg. /home/user/bin/bs_seeker2-build.py
        - By default, we try to search bs_seeker2-build in system PATH.
    -w  Logical to determine if the genome index needs to be built or not [default: FALSE].
    -h  Help message.

=head1 AUTHOR

    Contact: Peng Wu; wupeng1@ihcams.ac.cn
    Last update: 2018-10-24

=cut

## Parsing arguments from command line
my ($reads, $bs_seeker2_align, $bs_seeker2_build, $buildornot, $genome, $out, $threads, $unmappedout, $help);

GetOptions(
    'f:s'    => \$reads,
    'a:s'    => \$bs_seeker2_align,
    'b:s'    => \$bs_seeker2_build,
    'w:s'    => \$buildornot,
    'g:s'    => \$genome,
    'o:s'    => \$out,
    'p:i'    => \$threads,
    'u:s'    => \$unmappedout,
    'h|help' => \$help
);

## Print usage
pod2usage( { -verbose => 2, -output => \*STDERR } ) if ( $help );
( $reads and $genome and $out ) or pod2usage();


## Set default
$buildornot  ||= "FALSE";
$unmappedout ||= "unmapped.fastq";
$threads     ||= 12;

$bs_seeker2_align = locate_tool( $bs_seeker2_align, 'bs_seeker2-align.py', '-a' );

# The index builder is only needed when an index build was requested, so a
# missing bs_seeker2-build.py no longer blocks a plain alignment run.
if ( $buildornot eq "TRUE" ) {
    $bs_seeker2_build = locate_tool( $bs_seeker2_build, 'bs_seeker2-build.py', '-b' );
}


## Step2. align-end2end
if ( $buildornot eq "TRUE" ) {
    # A failed build is reported but not fatal here: the alignment below
    # fails on its own if no usable index exists.
    my $build_error = run_tool( $bs_seeker2_build, '-f', $genome, '--aligner=bowtie2' );
    warn "$build_error\n" if $build_error;
}

my $align_error = run_tool(
    $bs_seeker2_align,
    '-i', $reads, '-g', $genome, '-t', 'Y', '-m', '0.04',
    '-o', $out, '-u', $unmappedout,
    '--bt2-p', $threads, '--bt2--end-to-end', '--aligner=bowtie2'
);
die "$align_error\n" if $align_error;

# Return the location of PROGRAM: the user-supplied PATH if given, otherwise
# whatever `which` finds. Prints usage and exits when neither is available.
sub locate_tool {
    my ( $path, $program, $option ) = @_;
    $path ||= `which $program`;
    chomp $path;
    $path
        or pod2usage("$program was not found in PATH; give its location with $option.");
    return $path;
}

# Wrap VALUE in single quotes so the shell hands it to the program untouched.
sub shell_quote {
    my ($value) = @_;
    $value =~ s/'/'\\''/g;
    return "'$value'";
}

# Run TOOL with ARGS through the shell, dropping its stdout as the original
# backticks did. TOOL is interpolated as given; every argument is quoted.
# Returns an empty string on success or a description of the failure.
sub run_tool {
    my ( $tool, @args ) = @_;
    my $command = join ' ', $tool, map { shell_quote($_) } @args;
    `$command`;
    return '' if $? == 0;
    return "Could not run '$command': $!" if $? == -1;
    return "'$command' failed with exit status " . ( $? >> 8 );
}

-------------------------------------------------------------------------------- /align-local.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # scBSmap - align-local.pl 4 | # 5 | # Copyright (C) Peng Wu 6 | # Contact: Peng Wu 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 25 | 26 | use strict; 27 | use Getopt::Long qw(:config no_ignore_case); 28 | use Pod::Usage; 29 | 30 | =pod 31 | 32 | =head1 DESCRIPTION 33 | 34 | Perform local alignment on clean reads.
=head1 USAGE

    align-local [options] -f <.fastq> -g <genome.fa> -o <output.bam> -u <unmapped.fastq>

    Options:
    -f  File name for sequencing reads, .fastq format.
        - a compressed file (.fastq.gz) is also supported
    -g  Genome file name, fasta format.
    -o  Output file name, bam format.
    -u  File name for unmapped reads if needed.
    -p  Number of threads. [default: 12].
    -s  Path to samtools eg. /home/user/bin/samtools
        - Accepted for compatibility; this step does not call samtools.
    -a  Path to bs_seeker2-align eg. /home/user/bin/bs_seeker2-align.py
        - By default, we try to search bs_seeker2-align in system PATH.
    -b  Path to bs_seeker2-build eg. /home/user/bin/bs_seeker2-build.py
        - By default, we try to search bs_seeker2-build in system PATH.
    -w  Logical to determine if the genome index needs to be built or not [default: FALSE].
    -h  Help message.

=head1 AUTHOR

    Contact: Peng Wu; wupeng1@ihcams.ac.cn
    Last update: 2018-10-24

=cut

## Parsing arguments from command line
my ($reads, $bs_seeker2_align, $bs_seeker2_build, $buildornot, $genome, $out, $threads, $unmappedout, $help);

# -s is documented for this script but was never parsed, so passing it made
# GetOptions complain about an unknown option. It is now accepted and ignored.
my $samtools_ignored;

GetOptions(
    'f:s'    => \$reads,
    'a:s'    => \$bs_seeker2_align,
    'b:s'    => \$bs_seeker2_build,
    'w:s'    => \$buildornot,
    'g:s'    => \$genome,
    'o:s'    => \$out,
    'p:i'    => \$threads,
    'u:s'    => \$unmappedout,
    's:s'    => \$samtools_ignored,
    'h|help' => \$help
);

## Print usage
pod2usage( { -verbose => 2, -output => \*STDERR } ) if ( $help );
( $reads and $genome and $out ) or pod2usage();


## Set default
$buildornot  ||= "FALSE";
$unmappedout ||= "unmapped.fastq";
$threads     ||= 12;

$bs_seeker2_align = locate_tool( $bs_seeker2_align, 'bs_seeker2-align.py', '-a' );

# The index builder is only needed when an index build was requested, so a
# missing bs_seeker2-build.py no longer blocks a plain alignment run.
if ( $buildornot eq "TRUE" ) {
    $bs_seeker2_build = locate_tool( $bs_seeker2_build, 'bs_seeker2-build.py', '-b' );
}


## Step3. local
if ( $buildornot eq "TRUE" ) {
    # A failed build is reported but not fatal here: the alignment below
    # fails on its own if no usable index exists.
    my $build_error = run_tool( $bs_seeker2_build, '-f', $genome, '--aligner=bowtie2' );
    warn "$build_error\n" if $build_error;
}

my $align_error = run_tool(
    $bs_seeker2_align,
    '-i', $reads, '-g', $genome, '-t', 'Y', '-m', '0.04',
    '-o', $out, '-u', $unmappedout,
    '--bt2-p', $threads, '--bt2--local', '--aligner=bowtie2'
);
die "$align_error\n" if $align_error;

# Return the location of PROGRAM: the user-supplied PATH if given, otherwise
# whatever `which` finds. Prints usage and exits when neither is available.
sub locate_tool {
    my ( $path, $program, $option ) = @_;
    $path ||= `which $program`;
    chomp $path;
    $path
        or pod2usage("$program was not found in PATH; give its location with $option.");
    return $path;
}

# Wrap VALUE in single quotes so the shell hands it to the program untouched.
sub shell_quote {
    my ($value) = @_;
    $value =~ s/'/'\\''/g;
    return "'$value'";
}

# Run TOOL with ARGS through the shell, dropping its stdout as the original
# backticks did. TOOL is interpolated as given; every argument is quoted.
# Returns an empty string on success or a description of the failure.
sub run_tool {
    my ( $tool, @args ) = @_;
    my $command = join ' ', $tool, map { shell_quote($_) } @args;
    `$command`;
    return '' if $? == 0;
    return "Could not run '$command': $!" if $? == -1;
    return "'$command' failed with exit status " . ( $? >> 8 );
}

-------------------------------------------------------------------------------- /qcbam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # scBSmap - qcbam.pl 4 | # 5 | # Copyright (C) Peng Wu 6 | # Contact: Peng Wu 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE.
use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;

=pod

=head1 DESCRIPTION

    Remove the low confidence alignments within microhomology regions

=head1 USAGE

    qcbam [options] -f <input.bam> -o <output.bam>

    Options:
    -f  File name for alignment data, .bam format.
    -o  Output file name, bam format.
    -p  Number of threads. [default: 12].
    -s  Path to samtools eg. /home/user/bin/samtools
        - By default, we try to search samtools in system PATH.
    -n  Length of removing microhomology regions from bam files [default: 10].
    -h  Help message.

=head1 AUTHOR

    Contact: Peng Wu; wupeng1@ihcams.ac.cn
    Last update: 2021-5-19

=cut

## Parsing arguments from command line
my ($inbam, $samtools, $out, $threads, $num, $help);

GetOptions(
    'f:s'    => \$inbam,
    's:s'    => \$samtools,
    'o:s'    => \$out,
    'p:i'    => \$threads,
    'n:i'    => \$num,
    'h|help' => \$help
);

## Print usage
pod2usage( { -verbose => 2, -output => \*STDERR } ) if ( $help );
( $inbam and $out ) or pod2usage();


## Set default
$samtools ||= `which samtools`;
chomp $samtools;
$samtools
    or pod2usage("samtools was not found in PATH; give its location with -s.");

$threads ||= 12;
$num     ||= 10;


## Step4. qcbam
my $fileprefix = $inbam;
$fileprefix =~ s/\.bam$//;    # literal dot: only a real ".bam" suffix is removed

my $kept_sam = "$fileprefix.tmp.sam";
my $low_sam  = "$fileprefix.local.lowquality.sam";
my $low_bam  = "$fileprefix.local.lowquality.bam";

open( my $in_fh, '-|', "$samtools view -h " . shell_quote($inbam) )
    or die "Cannot run $samtools view on $inbam: $!\n";
open( my $kept_fh, '>', $kept_sam ) or die "Cannot write $kept_sam: $!\n";
open( my $low_fh,  '>', $low_sam )  or die "Cannot write $low_sam: $!\n";

while ( my $record = <$in_fh> ) {
    chomp $record;

    # Header lines go to both outputs.
    if ( $record =~ /^@/ ) {
        print {$kept_fh} "$record\n";
        print {$low_fh}  "$record\n";
        next;
    }

    my @field = split /\t/, $record;
    my $cigar = $field[5];

    # Alignments without soft clipping are kept untouched.
    if ( $cigar !~ /S/ ) {
        print {$kept_fh} "$record\n";
        next;
    }

    # Work out which side(s) of the aligned part border a soft clip.
    # $lead_clip is the number of clipped bases in front of the aligned part,
    # $matched the aligned length, $cut_start/$cut_end flag the sides to trim.
    my ( $lead_clip, $matched, $cut_start, $cut_end );
    if ( $cigar =~ /^(\d+)S(\d+)M$/ ) {
        ( $lead_clip, $matched, $cut_start, $cut_end ) = ( $1, $2, 1, 0 );
    }
    elsif ( $cigar =~ /^(\d+)M(\d+)S$/ ) {
        ( $lead_clip, $matched, $cut_start, $cut_end ) = ( 0, $1, 0, 1 );
    }
    elsif ( $cigar =~ /^(\d+)S(\d+)M(\d+)S$/ ) {
        ( $lead_clip, $matched, $cut_start, $cut_end ) = ( $1, $2, 1, 1 );
    }
    else {
        # Any other clipped CIGAR shape is set aside as low quality.
        print {$low_fh} "$record\n";
        next;
    }

    # $num bases are dropped next to every soft clip, on top of the clip itself.
    my $len = $matched - $num * ( $cut_start + $cut_end );

    # Nothing would be left of the alignment (the CIGAR would become "0M" or
    # negative), so treat the record as low quality instead of emitting it.
    if ( $len <= 0 ) {
        print {$low_fh} "$record\n";
        next;
    }

    my $offset      = $lead_clip + ( $cut_start ? $num : 0 );
    my $read_length = length $field[9];

    $field[9] = substr( $field[9], $offset, $len );

    # Keep QUAL the same length as SEQ when real qualities are present.
    if (   defined $field[10]
        && $field[10] ne '*'
        && length( $field[10] ) == $read_length )
    {
        $field[10] = substr( $field[10], $offset, $len );
    }

    $field[5] = $len . "M";
    $field[3] += $num if $cut_start;    # the alignment now starts $num bases later

    # Columns 15 and 16 are trimmed as TAG:TYPE:VALUE strings. Which end of
    # their value is cut depends on the orientation tag in column 12.
    # NOTE(review): the positions assume the tag order XO, .., .., XM, XG as
    # written by BS-Seeker2 - confirm against the aligner version in use.
    if ( defined $field[15] ) {
        my $plus_strand = defined $field[11]
            && ( $field[11] eq "XO:Z:+FW" || $field[11] eq "XO:Z:+RC" );

        my $tag_side;
        if ( $cut_start && $cut_end ) {
            $tag_side = 'both';
        }
        elsif ($plus_strand) {
            $tag_side = $cut_start ? 'start' : 'end';
        }
        else {
            $tag_side = $cut_start ? 'end' : 'start';
        }
        trim_tags( \@field, $tag_side, $num, $len );
    }

    # Join with real tabs. The previous join on ';' followed by s/;/\t/g also
    # turned every ';' inside a field into a tab.
    print {$kept_fh} join( "\t", @field ), "\n";
}

close $in_fh   or die "$samtools view failed while reading $inbam\n";
close $low_fh  or die "Cannot finish writing $low_sam: $!\n";
close $kept_fh or die "Cannot finish writing $kept_sam: $!\n";

sam_to_bam( $samtools, $threads, $kept_sam, $out );
sam_to_bam( $samtools, $threads, $low_sam,  $low_bam );

# Trim the values of columns 15 and 16 of FIELD in place.
#   SIDE 'start' - drop NUM characters from the front of the value
#   SIDE 'end'   - keep only the first LEN characters
#   SIDE 'both'  - drop NUM characters from the front and keep LEN
# The column-16 value is laid out as 2 flanking characters, '_', the sequence,
# '_', 2 flanking characters; the flanks are rebuilt from the removed part.
sub trim_tags {
    my ( $field, $side, $num, $len ) = @_;

    my @cginfo = split /:/, $field->[14];
    my @refseq = split /:/, $field->[15];

    if ( $side eq 'end' ) {
        $cginfo[2] = substr( $cginfo[2], 0, $len );
        $refseq[2] = substr( $refseq[2], 0, $len + 3 ) . "_"
                   . substr( $refseq[2], $len + 3, 2 );
    }
    elsif ( $side eq 'start' ) {
        $cginfo[2] = substr( $cginfo[2], $num, $len );
        $refseq[2] = substr( $refseq[2], $num + 1, 2 ) . "_"
                   . substr( $refseq[2], $num + 3, $len + 3 );
    }
    else {
        $cginfo[2] = substr( $cginfo[2], $num, $len );
        $refseq[2] = substr( $refseq[2], $num + 1, 2 ) . "_"
                   . substr( $refseq[2], $num + 3, $len ) . "_"
                   . substr( $refseq[2], $len + $num + 3, 2 );
    }

    $field->[14] = join ":", @cginfo;
    $field->[15] = join ":", @refseq;
    return;
}

# Convert SAM to BAM with the configured samtools and remove the SAM file
# afterwards. Dies (leaving the SAM in place) when the conversion fails.
sub sam_to_bam {
    my ( $tool, $cpus, $sam, $bam ) = @_;

    # -b requests BAM output explicitly; -S is kept for old samtools releases.
    my $command = join ' ', $tool, 'view', '-b', '-S', '-@', shell_quote($cpus),
        '-o', shell_quote($bam), shell_quote($sam);
    `$command`;

    die "Could not run '$command': $!\n" if $? == -1;
    die "'$command' failed with exit status " . ( $? >> 8 ) . "\n" if $? != 0;

    unlink $sam or warn "Cannot remove $sam: $!\n";
    return;
}

# Wrap VALUE in single quotes so the shell hands it to the program untouched.
sub shell_quote {
    my ($value) = @_;
    $value =~ s/'/'\\''/g;
    return "'$value'";
}

-------------------------------------------------------------------------------- /scBS-map.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # scBSmap - scBSmap.pl 4 | # 5 | # Copyright (C) Peng Wu 6 | # Contact: Peng Wu 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in
the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 25 | 26 | use strict; 27 | use Getopt::Long qw(:config no_ignore_case); 28 | use Pod::Usage; 29 | 30 | =pod 31 | 32 | =head1 DESCRIPTION 33 | 34 | Single-cell Bisulfite Sequencing Data Mapping 35 | 36 | =head1 USAGE 37 | 38 | perl scBS-map.pl [options] -f <.fastq> -g -o 39 | 40 | Options: 41 | -f File name for sequencing reads, .fastq format. 42 | - a compressed file (.fastq.gz) is also supported 43 | -g Genome file name, fasta format. 44 | -o Output file name, bam format. 45 | -l Length of trimming bases from the 5' end of the read [default: 10]. 46 | -p Number of threads. [default: 12]. 47 | -s Path to samtools eg. /home/user/bin/samtools 48 | - By default, we try to search samtools in system PATH. 49 | -a Path to bs_seeker2-align eg. /home/user/bin/bs_seeker2-align.py 50 | - By default, we try to search bs_seeker2-ailgn in system PATH. 51 | -b Path to bs_seeker2-build eg. /home/user/bin/bs_seeker2-build.py 52 | - By default, we try to search bs_seeker2-build in system PATH. 
53 | -w Logical to determine if the genome index needs to be built or not [default: FALSE]. 54 | -n Length of removing microhomology regions from bam files [default: 10]. 55 | -k Logical to determine whether to keep temporary files [default: FALSE]. 56 | -h Help message. 57 | 58 | =head1 AUTHOR 59 | 60 | Contact: Peng Wu; wupeng1@ihcams.ac.cn 61 | Last update: 2018-10-24 62 | 63 | =cut 64 | 65 | ## Parsing arguments from command line 66 | my ($reads, $length, $samtools, $bs_seeker2_align, $bs_seeker2_build, $buildornot, $genome, $out, $threads, $num, $keeptmp, $help); 67 | 68 | GetOptions( 69 | 'f:s' => \$reads, 70 | 'l:i' => \$length, 71 | 's:s' => \$samtools, 72 | 'a:s' => \$bs_seeker2_align, 73 | 'b:s' => \$bs_seeker2_build, 74 | 'w:s' => \$buildornot, 75 | 'g:s' => \$genome, 76 | 'o:s' => \$out, 77 | 'p:i' => \$threads, 78 | 'n:i' => \$num, 79 | 'k:s' => \$keeptmp, 80 | 'h|help' => \$help 81 | ); 82 | 83 | ## Print usage 84 | pod2usage( { -verbose => 2, -output => \*STDERR } ) if ( $help ); 85 | ( $reads and $genome and $out ) or pod2usage(); 86 | 87 | 88 | ## Set default 89 | $samtools ||= `which samtools`; 90 | chomp $samtools; 91 | $samtools or pod2usage(); 92 | 93 | $bs_seeker2_align ||= `which bs_seeker2-align.py`; 94 | chomp $bs_seeker2_align; 95 | $bs_seeker2_align or pod2usage(); 96 | 97 | $bs_seeker2_build ||= `which bs_seeker2-build.py`; 98 | chomp $bs_seeker2_build; 99 | $bs_seeker2_build or pod2usage(); 100 | 101 | 102 | $out ||= "output.bam"; 103 | $buildornot ||= "FALSE"; 104 | $keeptmp ||= "FALSE"; 105 | $length ||= 10; 106 | $threads ||= 12; 107 | $num ||= 10; 108 | 109 | ## Step1. 
qcreads 110 | my $datestring = localtime(); 111 | print "[$datestring] ------------------ scBS-map BEGIN -------------------\n"; 112 | print "[$datestring] Input file: $reads\n"; 113 | print "[$datestring] Start trimming the input reads..."; 114 | if($reads=~/gz/){ 115 | open IN, "gzip -dc $reads |" or die $!; 116 | }else{ 117 | open IN, $reads or die $!; 118 | } 119 | $reads=~s/.gz//; 120 | $reads=~s/.fastq//; 121 | $reads=~s/.fq//; 122 | open OUT, "| gzip -c > $reads.trim.fastq.gz" or die $!; 123 | 124 | open REP, "> $reads.scBS-map.report"; 125 | print REP "--------------------------------------------------\n"; 126 | print REP "scBS-map report for $reads\n"; 127 | print REP "--------------------------------------------------\n"; 128 | 129 | my $readsnumber; 130 | my $basesnumber; 131 | my $readsnumber_qc; 132 | my $basesnumber_qc; 133 | while (){ 134 | chomp (my $line0 = $_); 135 | chomp (my $line1 = ); 136 | chomp (my $line2 = ); 137 | chomp (my $line3 = ); 138 | $readsnumber++; 139 | $basesnumber+=length($line1); 140 | my $line1_new=substr($line1, $length-1, length($line1)); 141 | my $line3_new=substr($line3, $length-1, length($line3)); 142 | 143 | if(length($line1_new)>=35){ 144 | print OUT "$line0\n$line1_new\n$line2\n$line3_new\n"; 145 | $readsnumber_qc++; 146 | $basesnumber_qc+=length($line1_new); 147 | } 148 | } 149 | 150 | close IN; 151 | close OUT; 152 | 153 | print REP "Number of reads: $readsnumber\n"; 154 | print REP "Number of bases: $basesnumber\n\n"; 155 | print REP "Number of reads after quality control: $readsnumber_qc\n"; 156 | print REP "Number of bases after quality control: $basesnumber_qc\n\n"; 157 | 158 | ## Step2. 
## align-end2end: map the trimmed reads with the aligner in end-to-end mode

## Run one external command through the shell and stop the pipeline when it
## fails. Standard output is captured and discarded, exactly as the previous
## void-context backticks did. The command stays a single shell string so a
## tool path that carries its own interpreter or arguments keeps working.
##   $command - complete shell command line
sub run_mapping_step {
    my ($command) = @_;
    my $ignored_output = `$command`;
    if ( $? != 0 ) {
        my $reason = $? == -1 ? "could not be started: $!" : "exit status " . ( $? >> 8 );
        die "\nCommand failed ($reason): $command\n";
    }
    return;
}

$datestring = localtime();
print "Finish!\n[$datestring] Start mapping using the end-to-end mode...";
if ( $buildornot eq "TRUE" ) {
    run_mapping_step("$bs_seeker2_build -f $genome --aligner=bowtie2");
}

run_mapping_step("$bs_seeker2_align -i $reads.trim.fastq.gz -g $genome -t Y -m 0.04 -o $reads.end2end.bam -M $reads.multihits.fq -u $reads.unaligned.fq --bt2-p $threads --bt2--end-to-end --aligner=bowtie2");
unlink "$reads.end2end.bam.bs_seeker2_log";    # the aligner log is not kept

# Count the aligned reads and bases; soft-clipped bases at either end of the
# CIGAR string (field 6) are part of SEQ (field 10) but are not aligned.
my $end2end_basesnumber = 0;
my $end2end_readsnumber = 0;
open my $end2end_view, '-|', "$samtools view $reads.end2end.bam"
    or die "Cannot run $samtools view on $reads.end2end.bam: $!\n";
while ( my $record = <$end2end_view> ) {
    chomp $record;
    $end2end_readsnumber++;
    my ( $cigar, $sequence ) = ( split /\t/, $record )[ 5, 9 ];
    next unless defined $sequence;
    $end2end_basesnumber += length $sequence;
    $end2end_basesnumber -= $1 if $cigar =~ /^(\d+)S/;
    $end2end_basesnumber -= $1 if $cigar =~ /(\d+)S$/;
}
close $end2end_view
    or warn "\nWarning: $samtools view $reads.end2end.bam did not finish cleanly.\n";

print REP "Number of mapped reads using the end-to-end mode: $end2end_readsnumber\n";
print REP "Number of mapped bases using the end-to-end mode: $end2end_basesnumber\n\n";

## Step3. align-local
# The reads left unaligned by the end-to-end pass are mapped again without
# the --bt2--end-to-end switch.
$datestring = localtime();
print "Finish!\n[$datestring] Start mapping using the local mode...";
run_mapping_step("$bs_seeker2_align -i $reads.unaligned.fq -g $genome -t Y -m 0.04 -o $reads.local.tmp.bam -M $reads.multihits.local.fq -u $reads.unaligned.local.fq --bt2-p $threads --aligner=bowtie2");
unlink "$reads.local.tmp.bam.bs_seeker2_log";

# Append the multi-hit reads of the local pass to those of the end-to-end
# pass, then drop the per-pass file.
if ( open my $local_multihits, '<', "$reads.multihits.local.fq" ) {
    open my $all_multihits, '>>', "$reads.multihits.fq"
        or die "Cannot append to $reads.multihits.fq: $!\n";
    print {$all_multihits} $_ while <$local_multihits>;
    close $all_multihits or die "Writing $reads.multihits.fq failed: $!\n";
    close $local_multihits;
    unlink "$reads.multihits.local.fq";
}
else {
    warn "\nWarning: cannot read $reads.multihits.local.fq: $!\n";
}

# The reads still unaligned after the local pass replace the earlier list.
rename "$reads.unaligned.local.fq", "$reads.unaligned.fq"
    or warn "\nWarning: cannot rename $reads.unaligned.local.fq: $!\n";

## Step4.
## qcbam: remove the microhomology regions next to soft-clipped ends of the local alignments

## Run one samtools command through the shell and stop the pipeline when it
## fails; standard output is captured and discarded.
##   $command - complete shell command line
sub run_samtools_step {
    my ($command) = @_;
    my $ignored_output = `$command`;
    if ( $? != 0 ) {
        my $reason = $? == -1 ? "could not be started: $!" : "exit status " . ( $? >> 8 );
        die "\nCommand failed ($reason): $command\n";
    }
    return;
}

## Trim $margin aligned bases next to every soft-clipped end of one SAM record.
##   $fields - reference to the tab-split record; modified in place
##   $margin - number of aligned bases to drop beside each clipped end
## Handles the CIGAR shapes <n>S<m>M, <m>M<n>S and <n>S<m>M<n>S. The clipped
## bases and the margin are removed from SEQ (and from QUAL when present), the
## CIGAR becomes a plain <k>M, and POS moves right by the bases trimmed at the
## start. Optional tag fields (12 onwards) are passed through unchanged.
## Returns true when the record is usable, false when the CIGAR has another
## shape or nothing would remain after trimming.
sub trim_microhomology {
    my ( $fields, $margin ) = @_;
    my $cigar = $fields->[5];
    return 1 if $cigar !~ /S/;    # no soft clipping: nothing to trim

    my ( $lead_clip, $matched, $tail_clip );
    if ( $cigar =~ /^(?:(\d+)S)?(\d+)M(?:(\d+)S)?$/ ) {
        ( $lead_clip, $matched, $tail_clip ) = ( $1, $2, $3 );
    }
    else {
        return 0;
    }

    my $head_trim = defined $lead_clip ? $margin : 0;
    my $tail_trim = defined $tail_clip ? $margin : 0;
    my $kept      = $matched - $head_trim - $tail_trim;
    return 0 if $kept <= 0;       # would yield an invalid, non-positive CIGAR length

    my $start           = ( defined $lead_clip ? $lead_clip : 0 ) + $head_trim;
    my $original_length = length $fields->[9];

    # QUAL must stay the same length as SEQ unless it is the "*" placeholder.
    if (    defined $fields->[10]
        and $fields->[10] ne '*'
        and length( $fields->[10] ) == $original_length )
    {
        $fields->[10] = substr $fields->[10], $start, $kept;
    }
    $fields->[9] = substr( $fields->[9], $start, $kept ) if $fields->[9] ne '*';
    $fields->[5] = $kept . "M";
    $fields->[3] += $head_trim;
    return 1;
}

## Count the alignments in a BAM file and the bases they align.
##   $samtools_path - samtools executable
##   $bam_file      - file to read with "samtools view"
## Returns (number of records, number of bases); soft-clipped bases at either
## end of the CIGAR are subtracted from the SEQ length.
sub count_mapped_reads_and_bases {
    my ( $samtools_path, $bam_file ) = @_;
    my ( $read_count, $base_count ) = ( 0, 0 );

    open my $view, '-|', "$samtools_path view $bam_file"
        or die "Cannot run $samtools_path view on $bam_file: $!\n";
    while ( my $record = <$view> ) {
        chomp $record;
        $read_count++;
        my ( $cigar, $sequence ) = ( split /\t/, $record )[ 5, 9 ];
        next unless defined $sequence;
        $base_count += length $sequence;
        $base_count -= $1 if $cigar =~ /^(\d+)S/;
        $base_count -= $1 if $cigar =~ /(\d+)S$/;
    }
    close $view
        or warn "\nWarning: $samtools_path view $bam_file did not finish cleanly.\n";

    return ( $read_count, $base_count );
}

$datestring = localtime();
print "Finish!\n[$datestring] Start removing the microhomology regions for local alignment...";

open my $local_view, '-|', "$samtools view -h $reads.local.tmp.bam"
    or die "Cannot run $samtools view on $reads.local.tmp.bam: $!\n";
open my $clean_sam, '>', "$reads.local.sam"
    or die "Cannot write $reads.local.sam: $!\n";
open my $lowquality_sam, '>', "$reads.local.lowquality.sam"
    or die "Cannot write $reads.local.lowquality.sam: $!\n";

while ( my $record = <$local_view> ) {
    chomp $record;

    # Header lines are copied verbatim.
    if ( $record =~ /^@/ ) {
        print {$clean_sam} "$record\n";
        next;
    }

    my @fields = split /\t/, $record;
    if ( !defined $fields[5] ) {
        print {$lowquality_sam} "$record\n";    # not a complete SAM record
    }
    elsif ( $fields[5] !~ /S/ ) {
        print {$clean_sam} "$record\n";         # unclipped: keep untouched
    }
    elsif ( trim_microhomology( \@fields, $num ) ) {
        # Join with tabs directly so that ';' inside a field is preserved.
        print {$clean_sam} join( "\t", @fields ), "\n";
    }
    else {
        print {$lowquality_sam} "$record\n";
    }
}

close $local_view
    or warn "\nWarning: $samtools view $reads.local.tmp.bam did not finish cleanly.\n";
close $lowquality_sam or die "Writing $reads.local.lowquality.sam failed: $!\n";
close $clean_sam      or die "Writing $reads.local.sam failed: $!\n";

# -b requests BAM output explicitly, so the result does not depend on whether
# this samtools version infers the format from the -o file name.
run_samtools_step("$samtools view -b -S -@ $threads $reads.local.sam -o $reads.local.bam");
if ( $keeptmp eq "FALSE" ) {
    unlink "$reads.local.sam", "$reads.local.tmp.bam", "$reads.local.lowquality.sam";
}

my ( $local_readsnumber, $local_basesnumber ) =
    count_mapped_reads_and_bases( $samtools, "$reads.local.bam" );

print REP "Number of mapped reads using the local mode: $local_readsnumber\n";
print REP "Number of mapped bases using the local mode: $local_basesnumber\n\n";

## Step5. mergebam
$datestring = localtime();
print "Finish!\n[$datestring] Start merging the end-to-end and local alignments...";
run_samtools_step("$samtools merge -@ $threads -f $out $reads.end2end.bam $reads.local.bam");

my ( $mapped_readsnumber, $mapped_basesnumber ) =
    count_mapped_reads_and_bases( $samtools, $out );

# Multi-hit reads are counted as the lines of the multi-hits file that start
# with '>' (the same lines "grep -c '^>'" counted); counting here avoids the
# trailing newline that the grep output carried into the report.
my $multinumber = 0;
if ( open my $multihits, '<', "$reads.multihits.fq" ) {
    while ( my $line = <$multihits> ) {
        $multinumber++ if $line =~ /^>/;
    }
    close $multihits;
}
else {
    warn "\nWarning: cannot read $reads.multihits.fq: $!\n";
}

my $unmappednumber = ( $readsnumber_qc || 0 ) - $multinumber - $mapped_readsnumber;
print REP "Number of unmapped reads: $unmappednumber\n";
print REP "Number of multi-hits reads: $multinumber\n";
print REP "Number of mapped reads in total: $mapped_readsnumber\n";

# The ratios are relative to the input before trimming; an empty input is
# reported as 0 instead of dying on a division by zero.
my $mapratio = sprintf "%.4f", $readsnumber ? $mapped_readsnumber / $readsnumber : 0;
print REP "Mappability in total: ",$mapratio*100,"%\n\n";
print REP "Number of mapped bases in total: $mapped_basesnumber\n";
my $mapratio_base = sprintf "%.4f", $basesnumber ? $mapped_basesnumber / $basesnumber : 0;
print REP "Mappability at base level in total: ",$mapratio_base*100,"%\n";
print REP "--------------------------------------------------\n";

close REP or warn "\nWarning: writing the report failed: $!\n";

$datestring = localtime();
print "Finish!\n[$datestring] Output alignment file: $out\n";
print "[$datestring] ------------------- scBS-map END --------------------\n";