├── .github └── FUNDING.yml ├── README.md ├── conf ├── parallelLastz.pl └── testDATA ├── qsample1.fa └── tsample.fa /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: jnarayan81 # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parallelLastz v0.2 2 | ## Lastz with multi-thread support. 3 | 4 | [![Conda](https://anaconda.org/jnarayan81/parallellastz/badges/installer/conda.svg)](https://anaconda.org/jnarayan81/parallellastz) 5 | [![Linux](https://anaconda.org/jnarayan81/parallellastz/badges/platforms.svg)](https://anaconda.org/jnarayan81/parallellastz) 6 | 7 | 8 | Running Lastz (https://github.com/lastz/lastz) in parallel mode. This program is for a single computer with a multi-core processor. 9 | 10 | When the query file format is fasta, you can specify many threads to process it. It can reduce run time linearly, and uses almost the same amount of memory as the original lastz program. This is useful when you lastz a big query file against a huge reference such as the human whole-genome sequence. 
11 | 12 | The program is an extension of the original lastz program, which was written by Bob Harris (the LASTZ guy). 13 | 14 | parallelLastz can run on Linux and Mac OS. 15 | 16 | It runs lastz in parallel mode and generates .lz (tab-delimited) files. 17 | 18 | Run `perl parallelLastz.pl -h` for more help. 19 | 20 | ``` 21 | Usage: parallelLastz.pl --qfile <> --tfile <> --cfile <> --speedup <#> 22 | Options: 23 | --qfile|-q Query multifasta/fasta file 24 | --tfile|-t Target genome file 25 | --cfile|-c Config file 26 | --speedup|-s Number of cores to use 27 | --length|-l Minimum length of sequences to process 28 | --unmask|-u Unmask lowercase in target and query files 29 | --wipe|-w Wipe intermediate files 30 | --verbose|-v Enable verbose logging 31 | --retry|-r Number of retry attempts for failed jobs 32 | --output|-o Output directory for saving results 33 | --help|-h Show this help message 34 | 35 | ``` 36 | 37 | ## Conda 38 | 39 | To install the parallelLastz conda package, in the terminal or an Anaconda Prompt, run: 40 | 41 | ``` 42 | conda install -c jnarayan81 parallellastz 43 | ``` 44 | The test data can be found at https://github.com/jnarayan81/parallelLastz/tree/master/testDATA, and the sample configuration file at https://github.com/jnarayan81/parallelLastz/blob/master/conf. 45 | 46 | ## Citation 47 | Harris, R.S. (2007) Improved pairwise alignment of genomic DNA. Ph.D. Thesis, The Pennsylvania State University. 48 | 49 | Please feel free to give this repository a few likes as encouragement. 
:+1: :pray: :clap: 50 | 51 | ## Help 52 | Contact me at jnarayan81@gmail.com or info@bioinformaticsonline.com 53 | -------------------------------------------------------------------------------- /conf: -------------------------------------------------------------------------------- 1 | # Write your comments here with "#" 2 | # You can write all lastz parameters here EXCEPT --output (the script sets it itself; to change it, see the runLastz subroutine in parallelLastz.pl) 3 | # Do not use seq.fa[multiple]; this script processes one chromosome/scaffold/contig/sequence at a time 4 | # The outfile will be named after each fasta sequence name: seeALN_<name>.lz 5 | # Write each parameter on a new line 6 | # You can find details of the parameters here http://www.bx.psu.edu/miller_lab/dist/README.lastz-1.02.00/README.lastz-1.02.00a.html 7 | 8 | #----------------------------------------------------------------------------------------------------------------- 9 | # Parameters you can use for precise genome location, set below (one flag per line) 10 | # --gapped --gap=600,150 --hspthresh=4500 --seed=12of19 --notransition --ydrop=15000 --chain 11 | #----------------------------------------------------------------------------------------------------------------- 12 | 13 | #All LASTZ parameters go here 14 | 15 | #For chain only 16 | --chain 17 | 18 | #Outfile format: --format=general for a tabbed file; 19 | #The output can also be plotted using the --format=rdotplot output option and the R statistical package. 20 | --format=general- 21 | --progress 22 | 23 | #Identity here 24 | --identity=90 25 | 26 | #Coverage 27 | #--coverage=90 28 | 29 | #Ambiguous nucleotides: --ambiguous=iupac, or --ambiguous=n to handle only N 30 | --ambiguous=iupac 31 | 32 | #Using --step=10, we will only be looking for seeds at every 10th base 33 | #--step=10 34 | 35 | #Use --exact=20 so that a 20-base exact match is required to qualify as an HSP 36 | #--exact=20 37 | 38 | #The stricter --match=1,5. 
This scores matching bases as +1 and mismatches as −5 39 | #‑‑match=1,5 40 | 41 | #Check the trand ‑‑strand=minus ‑‑strand=plus 42 | #‑‑strand=both 43 | -------------------------------------------------------------------------------- /parallelLastz.pl: --------------------------------------------------------------------------------
#!/usr/bin/perl

use strict;
use warnings;
use Getopt::Long qw(GetOptions);
use Parallel::ForkManager;
use Bio::SeqIO;
use File::Temp;
use File::Spec;
use Term::ProgressBar;
use Log::Dispatch;

# Author: Jitendra Narayan / jnarayan81@gmail.com
# Usage: perl parallelLastz.pl
# perl parallelLastz.pl -q testDATA/qsample1.fa -t testDATA/tsample.fa -c conf -s 4 -w -l 10
#
# Overview: every sequence of the target file that is longer than --length is
# written to its own temporary fasta file and aligned against the query file
# by a separate lastz process; up to --speedup processes run at the same time.
# Each job writes <output>/seeALN_<sequence id>.lz; with --wipe these files
# are merged into <output>/finalAlign.tsv and removed.

print <<'WELCOME';
_ _ _ __ _
_ __ __ _ _ __ __ _| | | ___| | / / __ _ ___| |_ ____
| '_ \ / _` | '__/ _` | | |/ _ \ |/ / / _` / __| __|_ /
| |_) | (_| | | | (_| | | | __/ / /__| (_| \__ \ |_ / /
| .__/ \__,_|_| \__,_|_|_|\___|_\____/\__,_|___/\__/___|v0.2
|_|
parallelLastz: Run lastz jobs in parallel
Contact: jnarayan81@gmail.com for support

WELCOME

# Variables declaration
my ($qfile, $tfile, $config, $thread, $length, $wipe, $help, $unmask, $verbose, $retry, $output_dir);
my $version = 0.2;

# Parse command line options; an unknown or malformed option is an error, so
# the usage text is followed by a non-zero exit status.
GetOptions(
    'qfile|q=s'   => \$qfile,
    'tfile|t=s'   => \$tfile,
    'cfile|c=s'   => \$config,
    'speedup|s=i' => \$thread,
    'length|l=i'  => \$length,
    'wipe|w'      => \$wipe,
    'unmask|u'    => \$unmask,
    'verbose|v'   => \$verbose,
    'retry|r=i'   => \$retry,
    'output|o=s'  => \$output_dir,
    'help|h'      => \$help
) or usage($version, 1);
usage($version) if $help;

# Validate mandatory inputs. --length is tested with defined() so that an
# explicit "-l 0" is accepted instead of being reported as missing.
my @missing;
push @missing, 'query file (--qfile or -q)'  unless defined $qfile  && length $qfile;
push @missing, 'target file (--tfile or -t)' unless defined $tfile  && length $tfile;
push @missing, 'config file (--cfile or -c)' unless defined $config && length $config;
push @missing, 'length (--length or -l)'     unless defined $length;

if (@missing) {
    print STDERR "Error: Missing the following required option(s):\n";
    print STDERR " - $_\n" for @missing;
    exit 1;
}

# Defaults: one worker per detected CPU, a single attempt per job, results in
# the current directory.
$thread = detectCpuCount() unless $thread && $thread > 0;
$retry  = 1 unless $retry && $retry > 0;
$verbose ||= 0;
$output_dir = '.' unless defined $output_dir && length $output_dir;
unless (-d $output_dir) {
    mkdir $output_dir or die "Cannot create output directory $output_dir: $!\n";
}

# Initialize logging
my $logger = Log::Dispatch->new(
    outputs => [
        ['Screen', min_level => 'info', newline => 1]
    ]
);

# Read configuration file; its lines are appended to every lastz command line.
my $parameters = readConfig($config);
my $param = join(' ', @$parameters);

# Optionally convert query file to uppercase. The converted copy lives in the
# output directory and replaces the query for all jobs.
my $tmp_query;
if ($unmask) {
    $tmp_query = File::Spec->catfile($output_dir, 'tmpUC');
    convertToUpperCase($qfile, $tmp_query);
    $qfile = $tmp_query;
}

# Locate lastz binary (locateLastz dies with an installation hint if absent)
my $lastZ_tool = locateLastz();

# Load sequences from target file
my %sequences = loadSequences($tfile);

# Only sequences strictly longer than --length are aligned. Selecting them up
# front lets the progress bar count exactly the jobs that will run.
my @job_ids = grep { length($sequences{$_}) > $length } sort keys %sequences;
unless (@job_ids) {
    $logger->info("No sequence in $tfile is longer than $length; nothing to align.");
    unlink $tmp_query if defined $tmp_query && $wipe;
    exit 0;
}

# Initialize parallel processing
my $pm = Parallel::ForkManager->new($thread);

# Initialize progress bar
my $progress = Term::ProgressBar->new({
    name  => 'Processing',
    count => scalar(@job_ids),
    fh    => \*STDOUT
});

my $finished = 0;    # jobs reaped so far (drives the progress bar)
my $failed   = 0;    # jobs whose child exited with a non-zero status

$pm->run_on_finish(
    sub {
        my ($pid, $exit_code, $ident) = @_;
        $progress->update(++$finished);    # Update progress bar
        $failed++ if $exit_code != 0;
        if ($verbose) {
            $logger->info("Process $ident finished with exit code $exit_code.");
        }
    }
);

$pm->run_on_start(
    sub {
        my ($pid, $ident) = @_;
        $logger->info("Process $ident started with PID $pid.") if $verbose;
    }
);

# Process each sequence in parallel
foreach my $id (@job_ids) {
    $pm->start($id) and next;    # parent: move on to the next sequence

    # Child process from here on.
    my $success = 0;
    for my $attempt (1 .. $retry) {
        my $ok = eval {
            runLastz($id, $sequences{$id}, $qfile, $param, $unmask, $lastZ_tool, $output_dir);
            1;
        };
        if ($ok) {
            $success = 1;
            last;
        }
        my $err = $@;
        $logger->error("Error running lastz for $id: $err");
        sleep 1 if $attempt < $retry;    # Wait before retrying
    }

    $logger->error("Failed to process $id after $retry attempts.") unless $success;

    # The child must always terminate here. Without finish() a failed child
    # would fall through and keep iterating over the parent's job list.
    $pm->finish($success ? 0 : 1);
}

# Wait for all jobs to complete
$pm->wait_all_children;
$logger->info("All jobs completed.");
if ($failed) {
    $logger->error("$failed of " . scalar(@job_ids) . " lastz job(s) failed; their output is missing or incomplete.");
}

# Optional: Clean up intermediate files
if ($wipe) {
    cleanUpIntermediateFiles($output_dir);
    unlink $tmp_query if defined $tmp_query;
    my $final_output = File::Spec->catfile($output_dir, 'finalAlign.tsv');
    $logger->info("Alignment completed. Final results are in '$final_output'.");
} else {
    $logger->info("Alignment completed. Check the individual '.lz' files in $output_dir.");
}

exit($failed ? 1 : 0);

# Subroutines

# Print the usage message and terminate the program.
#   $ver       - version number shown in the heading
#   $exit_code - optional process exit status (default 0)
sub usage {
    my ($ver, $exit_code) = @_;
    $exit_code = 0 unless defined $exit_code;
    print "\n parallelLastz v$ver\n";
    print "Usage: $0 --qfile <> --tfile <> --cfile <> --speedup <#>\n";
    print "Options:\n";
    print " --qfile|-q Query multifasta/fasta file\n";
    print " --tfile|-t Target genome file\n";
    print " --cfile|-c Config file\n";
    print " --speedup|-s Number of cores to use\n";
    print " --length|-l Minimum length of sequences to process\n";
    print " --unmask|-u Unmask lowercase in target and query files\n";
    print " --wipe|-w Wipe intermediate files\n";
    print " --verbose|-v Enable verbose logging\n";
    print " --retry|-r Number of retry attempts for failed jobs\n";
    print " --output|-o Output directory for saving results\n";
    print " --help|-h Show this help message\n";
    exit $exit_code;
}

# Count the "processor" entries of /proc/cpuinfo.
# Returns 1 when the file is unavailable or lists no processor.
sub detectCpuCount {
    my $count = 0;
    if (open my $fh, '<', '/proc/cpuinfo') {
        $count = grep { /^processor/ } <$fh>;
        close $fh;
    }
    return $count > 0 ? $count : 1;
}

# Read all sequences of a fasta file.
#   $filename - path of the fasta file
# Returns a hash (as a list) mapping sequence id => sequence string. When an
# id occurs more than once the last record wins and a warning is printed.
sub loadSequences {
    my ($filename) = @_;
    my %sequences;
    my $seqio = Bio::SeqIO->new(-file => $filename, -format => "fasta");

    while (my $seqobj = $seqio->next_seq) {
        my $id  = $seqobj->display_id;
        my $seq = $seqobj->seq;
        $seq = '' unless defined $seq;

        # Replace spaces in the header with underscores
        $id =~ s/\s/_/g;

        warn "Duplicate sequence id '$id' in $filename; keeping the last record.\n"
            if exists $sequences{$id};
        $sequences{$id} = $seq;
    }

    return %sequences;
}

# Quote a string so that /bin/sh passes it on as one literal argument.
sub shellQuote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Run lastz for a specific sequence.
#   $name       - id of the target sequence (fasta header and output name)
#   $seq        - the target sequence itself
#   $qfile      - query fasta file
#   $param      - extra lastz parameters, already joined with spaces
#   $unmask     - true to upper-case the target sequence before aligning
#   $lastZ_tool - path of the lastz executable
#   $output_dir - directory receiving seeALN_<name>.lz
# Dies when the temporary file cannot be written or lastz does not exit
# with status 0.
sub runLastz {
    my ($name, $seq, $qfile, $param, $unmask, $lastZ_tool, $output_dir) = @_;
    $logger->info("Processing $name...");

    # --unmask is documented to apply to the target as well as the query.
    $seq = uc $seq if $unmask;

    # Create a temporary file for the sequence. It is removed automatically
    # when $tmp_fh goes out of scope at the end of this subroutine.
    my $tmp_fh   = File::Temp->new(UNLINK => 1);
    my $tmp_file = $tmp_fh->filename;
    print {$tmp_fh} ">$name\n$seq\n" or die "Cannot write temporary file $tmp_file: $!\n";
    close $tmp_fh or die "Cannot close temporary file $tmp_file: $!\n";

    # A '/' in the id would be taken as a directory separator in the output path.
    (my $file_name = $name) =~ s{/}{_}g;
    my $output_file = File::Spec->catfile($output_dir, "seeALN_$file_name.lz");

    # Build and execute lastz command. The file name is fetched with a real
    # method call: inside a double-quoted string "->filename" is not
    # evaluated and would reach the shell as literal text. Paths are quoted
    # so that unusual characters in ids or directories survive the shell;
    # $param is passed through as written in the config file.
    my $myLASTZ = join ' ',
        shellQuote($lastZ_tool),
        shellQuote($tmp_file),
        shellQuote($qfile),
        '--output=' . shellQuote($output_file),
        $param;

    my $status = system($myLASTZ);
    if ($status == -1) {
        die "Error running lastz on $name: could not execute command: $!\n";
    }
    if ($status & 127) {
        die sprintf("Error running lastz on %s: killed by signal %d\n", $name, $status & 127);
    }
    if ($status != 0) {
        die sprintf("Error running lastz on %s: exit status %d\n", $name, $status >> 8);
    }

    $logger->info("lastz completed for $name, output saved to $output_file.");
    return;
}

# Locate the lastz executable.
# The current directory is checked first, then every directory of $ENV{PATH}.
# Returns the path of the executable; dies with an installation hint when
# none is found.
sub locateLastz {
    my $lastZ_tool = "lastz";

    # Check if 'lastz' exists in the current directory (./)
    if (-f "./$lastZ_tool" && -x "./$lastZ_tool") {
        $logger->info("'$lastZ_tool' found in the current directory.");
        return "./$lastZ_tool";
    }

    # If not found in the current directory, search in system's PATH
    my $search_path = defined $ENV{PATH} ? $ENV{PATH} : '';
    for my $path (split /:/, $search_path) {
        next unless length $path;
        if (-f "$path/$lastZ_tool" && -x "$path/$lastZ_tool") {
            $logger->info("'$lastZ_tool' found in $path");
            return "$path/$lastZ_tool";
        }
    }

    die "No '$lastZ_tool' command found in the current directory or system's PATH.\nTry installing it using conda: 'conda install -c bioconda lastz'\n";
}

# Copy a fasta file, converting the sequence lines to uppercase.
# Header lines (starting with '>') are copied unchanged so that sequence
# names keep their original spelling in the lastz output.
#   $infile  - fasta file to read
#   $outfile - file to create or overwrite
sub convertToUpperCase {
    my ($infile, $outfile) = @_;
    open my $in_fh, '<', $infile or die "Cannot open input file $infile: $!";
    open my $out_fh, '>', $outfile or die "Cannot open output file $outfile: $!";

    while (my $line = <$in_fh>) {
        $line = uc $line unless $line =~ /\A>/;
        print {$out_fh} $line or die "Cannot write to output file $outfile: $!";
    }

    close $in_fh;
    close $out_fh or die "Cannot close output file $outfile: $!";
    return;
}

# Read the lastz parameters from the config file.
# Blank lines and lines whose first non-blank character is '#' are skipped;
# surrounding whitespace and CR/LF line ends are removed.
#   $file - path of the config file
# Returns a reference to the array of remaining lines, in file order.
sub readConfig {
    my ($file) = @_;
    open my $fh, '<', $file or die "Cannot open config file $file: $!";
    my @lines;

    while (my $line = <$fh>) {
        $line =~ s/\A\s+//;
        $line =~ s/\s+\z//;
        next if $line eq '' || $line =~ /\A#/;
        push @lines, $line;
    }

    close $fh;
    return \@lines;
}

# Merge the per-sequence result files into <output_dir>/finalAlign.tsv and
# delete them. Only files named seeALN_*.lz, i.e. the names runLastz writes,
# are touched, so unrelated .lz files in the directory are neither merged
# nor removed.
#   $output_dir - directory holding the intermediate files
sub cleanUpIntermediateFiles {
    my ($output_dir) = @_;

    # readdir instead of glob: glob would split a directory name at spaces.
    opendir my $dh, $output_dir or die "Cannot open output directory $output_dir: $!";
    my @files = map { File::Spec->catfile($output_dir, $_) }
                sort grep { /\AseeALN_.*\.lz\z/s } readdir $dh;
    closedir $dh;

    my $final_output = File::Spec->catfile($output_dir, 'finalAlign.tsv');

    open my $out_fh, '>', $final_output or die "Cannot open final output file $final_output: $!";

    for my $file (@files) {
        open my $in_fh, '<', $file or die "Cannot open file $file: $!";
        while (my $line = <$in_fh>) {
            print {$out_fh} $line or die "Cannot write to $final_output: $!";
        }
        close $in_fh;
        unlink $file or warn "Cannot remove intermediate file $file: $!\n";
    }

    close $out_fh or die "Cannot close final output file $final_output: $!";
    return;
}

-------------------------------------------------------------------------------- /testDATA/qsample1.fa: -------------------------------------------------------------------------------- 1 | >seq1 2 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA 3 | 4 | >seq2 5 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA 6 | 7 | >seq3 8 | 
ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA 9 | -------------------------------------------------------------------------------- /testDATA/tsample.fa: -------------------------------------------------------------------------------- 1 | >chr1 2 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA 3 | 4 | >chr2 5 | ATATATGATGGTGAGGATGAGTGAGAGAGAGTAGATTGATGATGATAGTAGTAGTAGTATGATGATAGTAGTAGTA 6 | 7 | --------------------------------------------------------------------------------