├── .github
└── FUNDING.yml
├── README.md
├── conf
├── parallelLastz.pl
└── testDATA
├── qsample1.fa
└── tsample.fa
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: jnarayan81 # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # parallelLastz v0.2
2 | ## Lastz with multi-threads support.
3 |
4 | [](https://anaconda.org/jnarayan81/parallellastz)
5 | [](https://anaconda.org/jnarayan81/parallellastz)
6 |
7 |
8 | Running Lastz (https://github.com/lastz/lastz) in parallel mode. This program is for single computer with multiple core processors.
9 |
10 | When the query file is in FASTA format, you can specify many threads to process it. This can reduce the run time linearly while using almost the same amount of memory as the original lastz program. This is useful when you align a big query file against a huge reference, such as the whole human genome sequence.
11 |
12 | The program is an extension on the original lastz program which was written by Bob Harris (the LASTZ guy).
13 |
14 | parallelLastz can run on Linux and Mac OS.
15 |
16 | It runs lastz in parallel mode and generates .lz (tab-delimited) files.
17 |
18 | perl parallelLastz.pl -h for more help
19 |
20 | ```
21 | Usage: parallelLastz.pl --qfile <> --tfile <> --cfile <> --length <#> --speedup <#>
22 | Options:
23 | --qfile|-q Query multifasta/fasta file
24 | --tfile|-t Target genome file
25 | --cfile|-c Config file
26 | --speedup|-s Number of cores to use
27 | --length|-l Minimum length of sequences to process
28 | --unmask|-u Unmask lowercase in target and query files
29 | --wipe|-w Wipe intermediate files
30 | --verbose|-v Enable verbose logging
31 | --retry|-r Number of retry attempts for failed jobs
32 | --output|-o Output directory for saving results
33 | --help|-h Show this help message
34 |
35 | ```
36 |
37 | ## Conda
38 |
39 | To install parallelLastz conda packages, in the terminal or an Anaconda Prompt, run:
40 |
41 | ```
42 | conda install -c jnarayan81 parallellastz
43 | ```
44 | The test data can be found at https://github.com/jnarayan81/parallelLastz/tree/master/testDATA, and the sample configuration file at https://github.com/jnarayan81/parallelLastz/blob/master/conf.
45 |
46 | ## Citation
47 | Harris, R.S. (2007) Improved pairwise alignment of genomic DNA. Ph.D. Thesis, The Pennsylvania State University.
48 |
49 | Please feel free to give this repository a few likes as encouragement. :+1: :pray: :clap:
50 |
51 | ## Help
52 | Contact me at jnarayan81@gmail.com or info@bioinformaticsonline.com
53 |
--------------------------------------------------------------------------------
/conf:
--------------------------------------------------------------------------------
1 | # Write your comments here, starting each comment line with "#"
2 | # You can write all lastz parameters here EXCEPT --output, which the script sets itself (use the script's --output option to choose the output directory)
3 | # Do not use seq.fa[multiple]; this script processes one chromosome/scaffold/contig/sequence at a time
4 | # Each output file is named after its fasta sequence name: seeALN_<sequence name>.lz
5 | # Write each option on its own line
6 | # You can find detail of parameters here http://www.bx.psu.edu/miller_lab/dist/README.lastz-1.02.00/README.lastz-1.02.00a.html
7 |
8 | #-----------------------------------------------------------------------------------------------------------------
9 | # Parameters you can use for precise genome location, set below (one flag per line)
10 | # --gapped --gap=600,150 --hspthresh=4500 --seed=12of19 --notransition --ydrop=15000 --chain
11 | #-----------------------------------------------------------------------------------------------------------------
12 |
13 | #All LASTZ parameters go here
14 |
15 | #For chain only
16 | --chain
17 |
18 | #Output file format: --format=general for a tab-delimited file.
19 | #Alignments can also be plotted using the --format=rdotplot output option and the R statistical package.
20 | --format=general-
21 | --progress
22 |
23 | #Identity here
24 | --identity=90
25 |
26 | #Coverage
27 | #--coverage=90
28 |
29 | #Ambiguous to take care of N or ambiguous=n
30 | --ambiguous=iupac
31 |
32 | #Using --step=10, we will only be looking for seeds at every 10th base
33 | #--step=10
34 |
35 | #Use --exact=20 so that a 20-base exact match is required to qualify as an HSP
36 | #--exact=20
37 |
38 | #The stricter --match=1,5 scores matching bases as +1 and mismatches as -5
39 | #--match=1,5
40 |
41 | #Choose the strand to search: --strand=minus, --strand=plus or --strand=both
42 | #--strand=both
43 |
--------------------------------------------------------------------------------
/parallelLastz.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | use strict;
4 | use warnings;
5 | use Getopt::Long qw(GetOptions);
6 | use Parallel::ForkManager;
7 | use Bio::SeqIO;
8 | use File::Temp;
9 | use File::Spec;
10 | use Term::ProgressBar;
11 | use Log::Dispatch;
12 |
13 | # Author: Jitendra Narayan / jnarayan81@gmail.com
14 | # Usage: perl parallelLastz.pl
15 | # perl parallelLastz.pl -q testDATA/qsample1.fa -t testDATA/tsample.fa -c conf -s 4 -w -l 10
16 |
# Startup banner. The heredoc is single-quoted, so the ASCII art is printed
# verbatim with no interpolation.
print <<'WELCOME';
_ _ _ __ _
_ __ __ _ _ __ __ _| | | ___| | / / __ _ ___| |_ ____
| '_ \ / _` | '__/ _` | | |/ _ \ |/ / / _` / __| __|_ /
| |_) | (_| | | | (_| | | | __/ / /__| (_| \__ \ |_ / /
| .__/ \__,_|_| \__,_|_|_|\___|_\____/\__,_|___/\__/___|v0.2
|_|
parallelLastz: Run lastz jobs in parallel
Contact: jnarayan81@gmail.com for support

WELCOME

# Variables declaration
my ($qfile, $tfile, $config, $thread, $length, $wipe, $help, $unmask, $verbose, $retry, $output_dir);
my $version = 0.2;

# Parse command line options. When GetOptions returns false it has already
# printed a diagnostic for each offending option, so point the user at --help
# and terminate with a non-zero status (usage() would exit with status 0).
GetOptions(
    'qfile|q=s'   => \$qfile,
    'tfile|t=s'   => \$tfile,
    'cfile|c=s'   => \$config,
    'speedup|s=i' => \$thread,
    'length|l=i'  => \$length,
    'wipe|w'      => \$wipe,
    'unmask|u'    => \$unmask,
    'verbose|v'   => \$verbose,
    'retry|r=i'   => \$retry,
    'output|o=s'  => \$output_dir,
    'help|h'      => \$help
) or die "Error: invalid command line options. Run '$0 --help' for usage.\n";
usage($version) if $help;

# Validate mandatory inputs. --length is tested with defined() so that an
# explicit "-l 0" is accepted instead of being reported as missing.
my @missing;
push @missing, 'query file (--qfile or -q)'  unless defined $qfile  && length $qfile;
push @missing, 'target file (--tfile or -t)' unless defined $tfile  && length $tfile;
push @missing, 'config file (--cfile or -c)' unless defined $config && length $config;
push @missing, 'length (--length or -l)'     unless defined $length;

if (@missing) {
    # Errors go to STDERR and produce a failing exit status so that calling
    # scripts and pipelines can detect the problem.
    print STDERR "Error: Missing the following required option(s):\n";
    print STDERR " - $_\n" for @missing;
    exit 1;
}

# Default the worker count to the number of CPU cores: /proc/cpuinfo on
# Linux, sysctl on macOS, and 1 when neither source yields a positive number.
if (!$thread || $thread < 1) {
    my $detected = `grep -c '^processor' /proc/cpuinfo 2>/dev/null`
        || `sysctl -n hw.ncpu 2>/dev/null`
        || '';
    $thread = ($detected =~ /(\d+)/ && $1 > 0) ? $1 : 1;
}

# Results go to the current directory unless --output names another one.
$output_dir = '.' unless defined $output_dir && length $output_dir;
unless (-d $output_dir) {
    mkdir $output_dir
        or die "Error: Cannot create output directory '$output_dir': $!\n";
}

# At least one attempt per job; verbose defaults to off.
$retry = 1 unless $retry && $retry > 0;
$verbose ||= 0;
# Initialize logging: one 'Screen' output that reports messages of level
# 'info' and above, with newline => 1 so callers do not append "\n".
my $logger = Log::Dispatch->new(
    outputs => [
        ['Screen', min_level => 'info', newline => 1]
    ]
);

# Read configuration file; the lastz options are joined into one string.
my $parameters = readConfig($config);
my $param = join(' ', @$parameters);

# Optionally convert query file to uppercase; the converted copy is written
# to <output_dir>/tmpUC and replaces the query file for the rest of the run.
if ($unmask) {
    my $qfile_corrected = File::Spec->catfile($output_dir, 'tmpUC');
    convertToUpperCase($qfile, $qfile_corrected);
    $qfile = $qfile_corrected;
}

# Locate lastz binary
my $lastZ_tool = locateLastz() or die "No lastz command found.\nTry installing it using conda: 'conda install -c bioconda lastz'\n";

# Load sequences from target file
my %sequences = loadSequences($tfile);

# Initialize parallel processing
my $pm = Parallel::ForkManager->new($thread);

# Initialize progress bar. Only sequences strictly longer than --length are
# submitted as jobs, so the bar is sized by that number; sizing it by every
# loaded sequence would leave it short of 100% whenever some are skipped.
my $jobs_expected = grep { length($sequences{$_}) > $length } keys %sequences;
my $progress = Term::ProgressBar->new({
    name  => 'Processing',
    count => $jobs_expected,
    fh    => \*STDOUT
});
102 |
# Parent-side hook, fired right after a child has been forked: announce the
# new job, but only in verbose mode.
$pm->run_on_start(
    sub {
        my ($pid, $ident) = @_;
        return unless $verbose;
        $logger->info("Process $ident started with PID $pid.");
    }
);

# Parent-side hook, fired when a child has been reaped: advance the progress
# bar by one step and, in verbose mode, report the child's exit code.
$pm->run_on_finish(
    sub {
        my ($pid, $exit_code, $ident) = @_;
        $progress->update();
        $logger->info("Process $ident finished with exit code $exit_code.") if $verbose;
    }
);
119 |
# Process each sequence in parallel: one forked lastz job per target
# sequence that is strictly longer than --length.
foreach my $id (keys %sequences) {
    next if length($sequences{$id}) <= $length;

    # In the parent start() returns the child's PID, so move on to the next
    # sequence; in the child it returns 0 and execution continues below.
    $pm->start($id) and next;

    # Child: try the job up to $retry times.
    my $success = 0;
    for my $attempt (1 .. $retry) {
        $success = eval {
            runLastz($id, $sequences{$id}, $qfile, $param, $unmask, $lastZ_tool, $output_dir);
            1;
        } ? 1 : 0;
        last if $success;

        $logger->error("Error running lastz for $id: $@");
        sleep 1 if $attempt < $retry;    # Wait before retrying
    }

    $logger->error("Failed to process $id after $retry attempts.") unless $success;

    # The child must always terminate here. Without finish() a failed child
    # would fall through into the next loop iteration and begin forking jobs
    # of its own. The exit code tells the parent whether the job succeeded.
    $pm->finish($success ? 0 : 1);
}
145 |
# Block until every forked job has been reaped.
$pm->wait_all_children;
$logger->info("All jobs completed.");

# With --wipe the intermediate files are handed to cleanUpIntermediateFiles()
# before the closing message; otherwise they are left in the output directory.
cleanUpIntermediateFiles($output_dir) if $wipe;
$logger->info(
    $wipe
    ? "Alignment completed. Final results are in 'finalAlign.tsv'."
    : "Alignment completed. Check the individual '.lz' files in $output_dir."
);
157 |
158 | # Subroutines
159 |
# Print the help text to STDOUT and terminate the program.
#   $ver - version number shown in the first line of the help text
# Never returns: the sub always ends with exit.
sub usage {
    my ($ver) = @_;

    my @help_text = (
        "\n parallelLastz v$ver\n",
        "Usage: $0 --qfile <> --tfile <> --cfile <> --speedup <#>\n",
        "Options:\n",
        " --qfile|-q Query multifasta/fasta file\n",
        " --tfile|-t Target genome file\n",
        " --cfile|-c Config file\n",
        " --speedup|-s Number of cores to use\n",
        " --length|-l Minimum length of sequences to process\n",
        " --unmask|-u Unmask lowercase in target and query files\n",
        " --wipe|-w Wipe intermediate files\n",
        " --verbose|-v Enable verbose logging\n",
        " --retry|-r Number of retry attempts for failed jobs\n",
        " --output|-o Output directory for saving results\n",
        " --help|-h Show this help message\n",
    );

    print $_ for @help_text;
    exit;
}
179 |
# Load every record of a FASTA file into memory.
#   $filename - path of the FASTA file, read through Bio::SeqIO
# Returns a hash (as a list) mapping each record's display id, with any
# whitespace replaced by underscores, to its sequence string. A later record
# with the same id replaces an earlier one.
sub loadSequences {
    my ($filename) = @_;

    my $reader = Bio::SeqIO->new(-file => $filename, -format => "fasta");

    my %seq_for;
    while (my $record = $reader->next_seq) {
        # Replace spaces in the header with underscores
        (my $key = $record->display_id) =~ s/\s/_/g;
        $seq_for{$key} = $record->seq;
    }

    return %seq_for;
}
198 |
199 |
# Run lastz for one target sequence against the query file.
#   $name       - sequence name; used as FASTA header and in the output name
#   $seq        - sequence string
#   $qfile      - path of the query FASTA file
#   $param      - lastz options as one whitespace-separated string
#   $unmask     - true: upper-case the target sequence before aligning, as the
#                 --unmask option is documented to cover target and query
#   $lastZ_tool - path of the lastz executable
#   $output_dir - directory that receives seeALN_<name>.lz
# Dies with a descriptive message when the temporary file cannot be written
# or when lastz cannot be started, is killed by a signal, or exits non-zero.
sub runLastz {
    my ($name, $seq, $qfile, $param, $unmask, $lastZ_tool, $output_dir) = @_;
    $logger->info("Processing $name...");

    # Create a temporary file for the sequence; File::Temp removes it when
    # $tmp_fh goes out of scope at the end of this sub.
    my $tmp_fh   = File::Temp->new(UNLINK => 1);
    my $tmp_path = $tmp_fh->filename;
    print {$tmp_fh} ">$name\n", ($unmask ? uc $seq : $seq), "\n"
        or die "Cannot write temporary file $tmp_path: $!\n";
    close $tmp_fh
        or die "Cannot close temporary file $tmp_path: $!\n";

    # Build the lastz command as a list. A method call is not interpolated
    # inside a string: "$tmp_fh->filename" yields "<path>->filename", which a
    # shell reads as the argument "<path>-" plus a redirect into a file called
    # "filename". The list form also bypasses the shell, so names and paths
    # containing shell metacharacters are passed through unchanged.
    # NOTE(review): options are split on whitespace, so a config value that
    # itself contains spaces is not supported.
    my $output_file = File::Spec->catfile($output_dir, "seeALN_$name.lz");
    my @command = (
        $lastZ_tool, $tmp_path, $qfile,
        "--output=$output_file",
        split(' ', defined $param ? $param : ''),
    );

    if (system(@command) != 0) {
        # $? carries the child's status; $! is only meaningful when the
        # program could not be started at all ($? == -1).
        my $reason = $? == -1 ? "could not execute '$lastZ_tool': $!"
                   : $? & 127 ? sprintf('killed by signal %d', $? & 127)
                   :            sprintf('exit status %d', $? >> 8);
        die "Error running lastz on $name: $reason\n";
    }

    $logger->info("lastz completed for $name, output saved to $output_file.");
}
216 |
# Find the lastz executable.
# Looks for a regular, executable file named 'lastz' in the current directory
# first and then in each directory of $ENV{PATH}, in order.
# Returns the path of the first match; dies with an installation hint when
# no match is found.
sub locateLastz {
    my $lastZ_tool = "lastz";

    # A copy sitting next to the caller takes precedence over the PATH.
    my $local_copy = "./$lastZ_tool";
    if (-f $local_copy && -x $local_copy) {
        $logger->info("'$lastZ_tool' found in the current directory.");
        return $local_copy;
    }

    # Otherwise take the first hit on the system's PATH.
    foreach my $path (split /:/, $ENV{PATH}) {
        my $candidate = "$path/$lastZ_tool";
        next unless -f $candidate && -x $candidate;

        $logger->info("'$lastZ_tool' found in $path");
        return $candidate;
    }

    die "No '$lastZ_tool' command found in the current directory or system's PATH.\nTry installing it using conda: 'conda install -c bioconda lastz'\n";
}
243 |
# Copy a FASTA file, converting the sequence lines to uppercase.
#   $infile  - path of the FASTA file to read
#   $outfile - path of the copy to write (created or truncated)
# Header lines (starting with '>') are copied unchanged, so sequence names
# keep their original case; every other line is upper-cased.
# Dies when either file cannot be opened or the copy cannot be written.
sub convertToUpperCase {
    my ($infile, $outfile) = @_;
    open my $in_fh, '<', $infile or die "Cannot open input file $infile: $!";
    open my $out_fh, '>', $outfile or die "Cannot open output file $outfile: $!";

    while (my $line = <$in_fh>) {
        # Upper-casing a header would rename the sequence in lastz's output.
        my $converted = $line =~ /\A>/ ? $line : uc $line;
        print {$out_fh} $converted
            or die "Cannot write to output file $outfile: $!";
    }

    close $in_fh;

    # Buffered write errors (e.g. a full disk) only surface at close.
    close $out_fh or die "Cannot close output file $outfile: $!";
}
257 |
# Read the lastz option file.
#   $file - path of the config file
# Returns a reference to an array holding every option line, in file order,
# with surrounding whitespace (including a trailing CR from CRLF files)
# removed. Blank lines and comment lines, i.e. lines whose first
# non-blank character is '#', are skipped.
# Dies when the file cannot be opened.
sub readConfig {
    my ($file) = @_;
    open my $fh, '<', $file or die "Cannot open config file $file: $!";
    my @lines;

    while (my $line = <$fh>) {
        # Trim both ends first, so that an indented "#" line is recognised as
        # a comment instead of being handed to lastz as an option.
        $line =~ s/\A\s+|\s+\z//g;
        next if $line eq '' || $line =~ /\A#/;
        push @lines, $line;
    }

    close $fh;
    return \@lines;
}
273 |
# Merge the per-sequence alignment files into one file and remove them.
#   $output_dir - directory holding the seeALN_*.lz files
# Concatenates every regular file named seeALN_*.lz, in sorted name order,
# into <output_dir>/finalAlign.tsv and then deletes the merged inputs. Other
# files in the directory, including unrelated .lz files, are left untouched.
# Dies when the directory or a file cannot be opened, or when the merged file
# cannot be written; warns when an input cannot be removed.
sub cleanUpIntermediateFiles {
    my ($output_dir) = @_;

    # Read the directory instead of calling glob(): glob splits its pattern
    # on whitespace, so a directory name containing a space matches nothing.
    opendir my $dir_fh, $output_dir
        or die "Cannot open output directory $output_dir: $!";
    my @files = grep { -f $_ }
                map  { File::Spec->catfile($output_dir, $_) }
                sort
                grep { /\AseeALN_.*\.lz\z/s } readdir $dir_fh;
    closedir $dir_fh;

    my $final_output = File::Spec->catfile($output_dir, 'finalAlign.tsv');

    open my $out_fh, '>', $final_output or die "Cannot open final output file $final_output: $!";

    for my $file (@files) {
        open my $in_fh, '<', $file or die "Cannot open file $file: $!";
        while (my $line = <$in_fh>) {
            print {$out_fh} $line
                or die "Cannot write to final output file $final_output: $!";
        }
        close $in_fh;
    }

    # Buffered write errors only surface at close, so the inputs are removed
    # only after the merged file is known to be complete.
    close $out_fh or die "Cannot close final output file $final_output: $!";

    for my $file (@files) {
        unlink $file or warn "Cannot remove intermediate file $file: $!\n";
    }
}
295 |
296 |
--------------------------------------------------------------------------------
/testDATA/qsample1.fa:
--------------------------------------------------------------------------------
1 | >seq1
2 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA
3 |
4 | >seq2
5 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA
6 |
7 | >seq3
8 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA
9 |
--------------------------------------------------------------------------------
/testDATA/tsample.fa:
--------------------------------------------------------------------------------
1 | >chr1
2 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA
3 |
4 | >chr2
5 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA
6 |
7 |
--------------------------------------------------------------------------------