├── BaseCount_sequenceCount_from_sff_file.pl ├── Calc_coverage_from_spades_assembly.pl ├── IonTorrent_SE_run.bash ├── Kraken_krona_fastq.bash ├── README.md ├── adapters.fasta ├── calc_N50_GC_genomesize.pl ├── contig_size_select.pl ├── cut_and_paste_seq.pl ├── estimate_core_genome_from_bam.pl ├── fastqMcf-bowtie2-SPAdes.bash ├── fix_evm_for_gbrowse.pl ├── gff3_2_gff3EVM.pl ├── run_IonT_SPAdes.pl ├── run_SPAdes.pl ├── run_bowtie2_and_pilon.pl ├── run_bowtie2_subtract_mapped_reads_with_picard.pl ├── run_bowtie2_subtract_unmapped_reads.pl ├── run_fastqMcf.pl ├── run_kSNP.pl └── run_kneaddata_only_human_removal.pl /BaseCount_sequenceCount_from_sff_file.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Written by Tom de Man 4 | 5 | use strict; 6 | 7 | my $usage = qq( 8 | USAGE: perl BaseCount_sequenceCount_from_sff_file.pl /path/to/sff/files 9 | ************************************************* 10 | Arguments is the path to all sff files of a particular library 11 | \n); 12 | 13 | die($usage) if (@ARGV == 0); 14 | 15 | my $sff_path = $ARGV[0]; 16 | 17 | my @sffs = &getSff; 18 | my ($base_amount2, $contig_amount2) = &main(@sffs); #put two references into variables 19 | 20 | 21 | my $count = scalar @$base_amount2; 22 | print "bases: @$base_amount2 \n"; #print dereferenced array 23 | print "reads: @$contig_amount2 \n";#print dereferenced array 24 | 25 | my $total_amount_bp = 0; 26 | my $total_amount_contig = 0; 27 | for (my $i = 0; $i < $count; $i++) { 28 | my $element = pop(@$base_amount2); 29 | my $element2 = pop(@$contig_amount2); 30 | $total_amount_bp = $element + $total_amount_bp; 31 | $total_amount_contig = $element2 + $total_amount_contig; 32 | } 33 | print "The total amount of bases is: $total_amount_bp \n"; 34 | print "The total amount of reads is: $total_amount_contig \n"; 35 | 36 | sub main { 37 | my @base_amount; 38 | my @contig_amount; 39 | foreach my $line (@_) { 40 | system("sffinfo -s 
$sff_path$line > $line.fasta");
        my $bases   = &count_base("$line");
        my $contigs = &count_read("$line");
        chomp($contigs);
        system("rm $line.fasta");
        unshift(@base_amount, $bases);
        unshift(@contig_amount, $contigs);
    }
    # Return references to the two parallel result arrays.
    return (\@base_amount, \@contig_amount);
}

# Collect the names of all *.sff files found in $sff_path.
sub getSff {
    my @sff_docs;
    opendir(DIR, $sff_path) or die "Cannot open $sff_path \n";
    my @sff_files = readdir(DIR);
    closedir DIR;

    foreach my $sff_file (@sff_files) {
        next if ($sff_file !~ /\.sff$/);
        push @sff_docs, $sff_file;
    }
    return @sff_docs;
}

# Count the total number of bases in <name>.fasta (length of every non-header line).
sub count_base {
    my ($name) = @_;    # was @_[0]: a one-element slice, not a scalar
    # 3-arg open + error check; the stripped <FASTA> read loop is restored.
    open FASTA, "<", "$name.fasta" or die "Cannot open $name.fasta \n";
    my $total_bases = 0;
    while (<FASTA>) {
        if (!(/^>/)) {
            chomp;
            s/\r//g;
            $total_bases += length;
        }
    }
    # close() must run before return; the original returned first, leaking the handle.
    close(FASTA);
    return $total_bases;
}

# Count the number of reads (fasta header lines) in <name>.fasta.
sub count_read {
    my ($name) = @_;
    my $counter = 0;
    open FASTA, "<", "$name.fasta" or die "Cannot open $name.fasta \n";
    while (<FASTA>) {
        # tr/>// counts '>' on the line without a regex; headers have exactly one.
        $counter++ if tr/>//;
    }
    close(FASTA);
    return $counter;
}
--------------------------------------------------------------------------------
/Calc_coverage_from_spades_assembly.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Written by Tom de Man
# Calculate the average k-mer coverage over all contigs of a SPAdes assembly.
# SPAdes embeds the coverage in each fasta header
# (>NODE_1_length_500_cov_12.3), so field index 5 after splitting on "_".

use strict;

my $fasta = shift;
my $contig_count = 0;
my $sum_coverage_contigs = 0;

# NOTE: the original "open FA, \"$fasta\" || die ..." never died on failure
# because || binds to the filename; use low-precedence "or" and 3-arg open.
open FA, "<", $fasta or die "cannot open $fasta for reading\n";

while (<FA>) {
    chomp;
    if (/^>/) {
        $contig_count += 1;
        my @split_header = split("_", $_);
        $sum_coverage_contigs += $split_header[5];
    }
    # Sequence lines are ignored; the original printed them to a filehandle
    # (OUT) that was never opened, which silently failed.
}
close FA;

# Guard the division: an empty/headerless file would otherwise divide by zero.
die "no SPAdes contig headers found in $fasta\n" unless $contig_count;
my $coverage = $sum_coverage_contigs / $contig_count;

print "Amount of contigs: $contig_count\n";
print "--------------------------------\n";
print "coverage on average: $coverage\n";
--------------------------------------------------------------------------------
/IonTorrent_SE_run.bash:
--------------------------------------------------------------------------------
#!/bin/sh

mkdir assemblies
mkdir assemblies_500
mkdir BUSCO_stats

perl run_IonT_SPAdes.pl raw_reads/*

#move assemblies to different folder
for file in *_assembly/scaffolds.fasta; do new="$(echo "$file" | cut -d '_' -f 2)".scaffolds.fasta; cp "$file" "assemblies/$new"; done

#filter contigs on length
for i in assemblies/*.fasta; do perl contig_size_select.pl -low 500 $i > $i.500.fna; done
mv assemblies/*.fna assemblies_500

#Check if genomes are complete gene content wise. 
Make sure to add the right lineage db to -l 17 | for i in assemblies_500/*.fna 18 | 19 | do 20 | isolate="$(echo "$i" | cut -d '/' -f 2)" 21 | BUSCO.py -i $i -o BUSCO_$isolate -l /path/to/BUSCO/lineage_data/species_odb9 -m geno 22 | done 23 | 24 | #take all summaries 25 | for file in run_BUSCO_*/*.txt; do cp $file BUSCO_stats; done 26 | BUSCO_plot.py -wd BUSCO_stats 27 | -------------------------------------------------------------------------------- /Kraken_krona_fastq.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kraken_db="/path/to/custom/Kraken-RefSeq/database/" 4 | 5 | for file in *_R1_001.fastq 6 | 7 | do 8 | b=$(basename $file _R1_001.fastq) 9 | rev=${b}_R2_001.fastq 10 | 11 | echo "[:] Analyzing $file and $rev" 12 | 13 | kraken --version 14 | 15 | echo "[:] Running kraken. Output: $file.kraken / $file.classified" 16 | 17 | kraken --fastq-input --paired --db $kraken_db --preload --threads 12 --output $file.kraken --classified-out $file.classified $file $rev 18 | 19 | echo "[:] Generating metaphlan compatible report." 20 | 21 | kraken-mpa-report --db $kraken_db $file.kraken >> $file.mpa 22 | 23 | echo "[:] Generating krona output for $file." 24 | 25 | python metaphlan2krona.py -p $file.mpa -k $file.krona 26 | 27 | done 28 | ktImportText -o all_output.html *.krona -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Sequence-scripts 2 | ================ 3 | 4 | ## Extract metagenomic reads from a particular species or strain using [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [samtools](https://github.com/samtools/), and [Picard](https://broadinstitute.github.io/picard/). 
5 | 6 | ### Usage 7 | perl run_bowtie2_subtract_mapped_reads_with_picard directory/containing/metagenomic/samples/only/* 8 | ------------------------------------------------------------------------------------------------------ 9 | 10 | ## Remove human DNA from metagenomic samples to comply with HIPAA regulations (useful when uploading raw data to SRA/ENA) 11 | 12 | ### Usage 13 | perl run_kneaddata_only_human_removal.pl directory/containing/metagenomic/samples/only/* 14 | -------------------------------------------------------------------------------------------- 15 | 16 | ## Cut DNA sequence at a user defined position (e.g. position 63789) and move the sequence that ranges from start to 63789 to the end of the sequence. Useful when generating plasmid sequence comparison plots with Easyfig or Geneious. As a convention, folks usually cut right before the plasmid replicon gene. 17 | 18 | ### Usage 19 | perl cut_and_paste_seq.pl -cut 63789 -strand -seq Plasmid_DNA.fasta > Plasmid_DNA_new_sequence_order.fasta 20 | ----------------------------------------------------------------------------------------------------------------------------------- 21 | 22 | ## Determine the core genome size of a given dataset. The script is assessing sorted BAM files and the mapping reference in order to estimate the core genome size for a certain depth of coverage. The script needs [samtools](https://github.com/samtools/), [bedtools](http://bedtools.readthedocs.io/en/latest/), and awk. 
23 | 24 | ### Usage 25 | perl estimate_core_genome_from_bam.pl -bam /path/to/bam/files -genome mapping/reference/fasta/file -depth 10 26 | ---------------------------------------------------------------------------------------------------------------- 27 | 28 | ## Calculate simple genome assembly stats including N50, number of contigs, total bases, and G+C content 29 | 30 | ### Usage 31 | perl calc_N50_GC_genomesize.pl -i genomeAssembly.fasta -o output.stats 32 | -------------------------------------------------------------------------- 33 | 34 | ## Screen raw reads for contamination and get an impression of the bacterial composition of your sample(s). Script is using [Kraken](https://ccb.jhu.edu/software/kraken/) for determining species composition, [KronaTools](https://github.com/marbl/Krona/wiki/KronaTools) for generating multi-layered pie charts, and conversion script [metaphlan2krona.py](https://bitbucket.org/nsegata/metaphlan/src/2f1b17a1f4e9775fe1ce42c8481279a5e69f291f/conversion_scripts/metaphlan2krona.py?at=default) 35 | 36 | ### Usage 37 | bash Kraken_krona_fastq.bash 38 | -------------------------------- 39 | 40 | ## Trimming raw reads and remove sequencing adapters using [fastq-mcf](http://ea-utils.googlecode.com/svn/wiki/FastqMcf.wiki) 41 | 42 | ### Usage 43 | perl run_fastqMcf.pl directory/containing/raw/reads/only/* 44 | -------------------------------------------------------------- 45 | 46 | ## Map trimmed reads to contaminant (e.g. 
PhiX) database and subtract unmapped reads for downstream analysis using [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [SAMtools](http://samtools.sourceforge.net) and [bam2fastq](https://gsl.hudsonalpha.org/information/software/bam2fastq) 47 | 48 | ### Usage 49 | perl run_bowtie2_subtract_unmapped_reads.pl directory/containing/trimmed/reads/only/* 50 | ----------------------------------------------------------------------------------------- 51 | 52 | ## Assembling the trimmed and contaminant free reads using [SPAdes](http://spades.bioinf.spbau.ru) 53 | 54 | ### Usage 55 | perl run_SPAdes.pl directory/containing/trimmed/and/virus/free/reads/only/* 56 | ------------------------------------------------------------------------------- 57 | 58 | ## Perform the previous three steps using one Shell script. It runs fastq-MCF, Bowtie2, SAMtools, bam2fastq and SPAdes assembler in batch 59 | 60 | ### Usage 61 | bash fastqMcf-bowtie2-SPAdes.bash 62 | -------------------------------------- 63 | 64 | ## Calculating average K-mer coverage of SPAdes assembly, from your highest K value (usually k=127) 65 | 66 | ### Usage 67 | perl Calc_coverage_from_spades_assembly.pl 68 | ---------------------------------------------------------------- 69 | 70 | ## Correcting PacBio data with Illumina reads by means of [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) and [Pilon](https://github.com/broadinstitute/pilon/wiki) 71 | 72 | ### Usage 73 | perl run_bowtie2_and_pilon.pl path/to/trimmed/Illumina/reads/* 74 | ----------------------------------------------------------------------------------------- 75 | 76 | ## Running [kSNP](https://sourceforge.net/projects/ksnp/files/) version 2 using assembled microbial genomes (in fasta format) 77 | 78 | ### Usage 79 | perl run_kSNP.pl full/path/containing/the/input/files projectname 80 | --------------------------------------------------------------------- 81 | 82 | IonTorrent scripts 83 | ================== 84 | 85 | 
## Assemble Single-End (SE) IonTorrent reads with [SPAdes](http://spades.bioinf.spbau.ru) 86 | 87 | ### Usage 88 | perl run_IonT_SPAdes.pl directory/containing/trimmed/SE-reads/only/* 89 | -------------------------------------------------------------------------- 90 | 91 | ## Bash workflow script for trimming SE IonTorrent reads, assembling trimmed reads, and quality check contigs using [BUSCO 2.0](http://busco.ezlab.org) 92 | 93 | ### Usage 94 | bash IonTorrent_SE_run.bash 95 | ------------------------------- 96 | 97 | 454 scripts 98 | =========== 99 | 100 | ## Quickly assess binary 454 Standard Flowgram Format (SFF) files from a 454 sequencing run. This simple script counts amount of reads and bases. Script needs SFFinfo 101 | 102 | ### Usage 103 | perl BaseCount_sequenceCount_from_sff_file.pl /directory/to/sff/files 104 | -------------------------------------- 105 | 106 | Eukaryotic part 107 | ================ 108 | 109 | ## Generate [EVM](https://evidencemodeler.github.io) suitable GFF3 files from [MAKER](http://www.yandell-lab.org/software/maker.html) de novo gene prediction GFF 110 | 111 | ### Usage 112 | perl gff3_2_gff3EVM.pl 113 | -------------------------------------- 114 | 115 | ## Make EVM data compatible with [Gbrowse](http://gbrowse.org/index.html) 116 | 117 | ### Usage 118 | perl fix_evm_for_gbrowse.pl < inputfile.gff3 119 | -------------------------------------- 120 | -------------------------------------------------------------------------------- /adapters.fasta: -------------------------------------------------------------------------------- 1 | >gnl|uv|NGB00360.1|Illumina_Paired_End_PCR_Primer_1.0 2 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >gnl|uv|NGB00362.1|Illumina_Paired_End_PCR_Primer_2.0 4 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 5 | >gnl|uv|NGB00727.1|Nextera_PCR_primer_i5_N501 6 | AATGATACGGCGACCACCGAGATCTACACTAGATCGCTCGTCGGCAGCGTC 7 | >gnl|uv|NGB00728.1|Nextera_PCR_primer_i5_N502 8 | 
AATGATACGGCGACCACCGAGATCTACACCTCTCTATTCGTCGGCAGCGTC 9 | >gnl|uv|NGB00729.1|Nextera_PCR_primer_i5_N503 10 | AATGATACGGCGACCACCGAGATCTACACTATCCTCTTCGTCGGCAGCGTC 11 | >gnl|uv|NGB00730.1|Nextera_PCR_primer_i5_N504 12 | AATGATACGGCGACCACCGAGATCTACACAGAGTAGATCGTCGGCAGCGTC 13 | >gnl|uv|NGB00731.1|Nextera_PCR_primer_i5_N505 14 | AATGATACGGCGACCACCGAGATCTACACGTAAGGAGTCGTCGGCAGCGTC 15 | >gnl|uv|NGB00732.1|Nextera_PCR_primer_i5_N506 16 | AATGATACGGCGACCACCGAGATCTACACACTGCATATCGTCGGCAGCGTC 17 | >gnl|uv|NGB00733.1|Nextera_PCR_primer_i5_N507 18 | AATGATACGGCGACCACCGAGATCTACACAAGGAGTATCGTCGGCAGCGTC 19 | >gnl|uv|NGB00734.1|Nextera_PCR_primer_i5_N508 20 | AATGATACGGCGACCACCGAGATCTACACCTAAGCCTTCGTCGGCAGCGTC 21 | >gnl|uv|NGB00735.1|Nextera_PCR_primer_i7_N701 22 | CAAGCAGAAGACGGCATACGAGATTCGCCTTAGTCTCGTGGGCTCGG 23 | >gnl|uv|NGB00736.1|Nextera_PCR_primer_i7_N702 24 | CAAGCAGAAGACGGCATACGAGATCTAGTACGGTCTCGTGGGCTCGG 25 | >gnl|uv|NGB00737.1|Nextera_PCR_primer_i7_N703 26 | CAAGCAGAAGACGGCATACGAGATTTCTGCCTGTCTCGTGGGCTCGG 27 | >gnl|uv|NGB00738.1|Nextera_PCR_primer_i7_N704 28 | CAAGCAGAAGACGGCATACGAGATGCTCAGGAGTCTCGTGGGCTCGG 29 | >gnl|uv|NGB00739.1|Nextera_PCR_primer_i7_N705 30 | CAAGCAGAAGACGGCATACGAGATAGGAGTCCGTCTCGTGGGCTCGG 31 | >gnl|uv|NGB00740.1|Nextera_PCR_primer_i7_N706 32 | CAAGCAGAAGACGGCATACGAGATCATGCCTAGTCTCGTGGGCTCGG 33 | >gnl|uv|NGB00741.1|Nextera_PCR_primer_i7_N707 34 | CAAGCAGAAGACGGCATACGAGATGTAGAGAGGTCTCGTGGGCTCGG 35 | >gnl|uv|NGB00742.1|Nextera_PCR_primer_i7_N708 36 | CAAGCAGAAGACGGCATACGAGATCCTCTCTGGTCTCGTGGGCTCGG 37 | >gnl|uv|NGB00743.1|Nextera_PCR_primer_i7_N709 38 | CAAGCAGAAGACGGCATACGAGATAGCGTAGCGTCTCGTGGGCTCGG 39 | >gnl|uv|NGB00744.1|Nextera_PCR_primer_i7_N710 40 | CAAGCAGAAGACGGCATACGAGATCAGCCTCGGTCTCGTGGGCTCGG 41 | >gnl|uv|NGB00745.1|Nextera_PCR_primer_i7_N711 42 | CAAGCAGAAGACGGCATACGAGATTGCCTCTTGTCTCGTGGGCTCGG 43 | >gnl|uv|NGB00746.1|Nextera_PCR_primer_i7_N712 44 | CAAGCAGAAGACGGCATACGAGATTCCTCTACGTCTCGTGGGCTCGG 45 | 
>gnl|uv|NGB00842.1|Nextera_DNA_Sample_Prep_Kit_Transposon 46 | GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAG 47 | >gnl|uv|NGB00843.1|Nextera_DNA_Sample_Prep_Kit_Transposon 48 | GCCTTGCCAGCCCGCTCAGAGATGTGTATAAGAGACAG 49 | >gnl|uv|NGB00844.1|Nextera_DNA_Sample_Prep_Kit_Adaptor 50 | AATGATACGGCGACCACCGAGATCTACACGCCTCCCTCGCGCCATCAG 51 | >gnl|uv|NGB00726.1|Illumina_Nextera_transposase 52 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 53 | >gnl|uv|NGB00725.1|Illumina_Nextera_transposase 54 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 55 | >gnl|uv|NGB00755.1|TruSeq_DNA_HT_and_RNA_HT_i7_D701 56 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTACTCGATCTCGTATGCCGTCTTCTGCTTG 57 | >gnl|uv|NGB00756.1|TruSeq_DNA_HT_and_RNA_HT_i7_D702 58 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCCGGAGAATCTCGTATGCCGTCTTCTGCTTG 59 | >gnl|uv|NGB00757.1|TruSeq_DNA_HT_and_RNA_HT_i7_D703 60 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGCTCATTATCTCGTATGCCGTCTTCTGCTTG 61 | >gnl|uv|NGB00758.1|TruSeq_DNA_HT_and_RNA_HT_i7_D704 62 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTG 63 | >gnl|uv|NGB00759.1|TruSeq_DNA_HT_and_RNA_HT_i7_D705 64 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCAGAAATCTCGTATGCCGTCTTCTGCTTG 65 | >gnl|uv|NGB00760.1|TruSeq_DNA_HT_and_RNA_HT_i7_D706 66 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAATTCGTATCTCGTATGCCGTCTTCTGCTTG 67 | >gnl|uv|NGB00761.1|TruSeq_DNA_HT_and_RNA_HT_i7_D707 68 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTGAAGCTATCTCGTATGCCGTCTTCTGCTTG 69 | >gnl|uv|NGB00762.1|TruSeq_DNA_HT_and_RNA_HT_i7_D708 70 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAATGCGCATCTCGTATGCCGTCTTCTGCTTG 71 | >gnl|uv|NGB00763.1|TruSeq_DNA_HT_and_RNA_HT_i7_D709 72 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGGCTATGATCTCGTATGCCGTCTTCTGCTTG 73 | >gnl|uv|NGB00764.1|TruSeq_DNA_HT_and_RNA_HT_i7_D710 74 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCCGCGAAATCTCGTATGCCGTCTTCTGCTTG 75 | >gnl|uv|NGB00765.1|TruSeq_DNA_HT_and_RNA_HT_i7_D711 76 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCTCGCGCATCTCGTATGCCGTCTTCTGCTTG 77 | >gnl|uv|NGB00766.1|TruSeq_DNA_HT_and_RNA_HT_i7_D712 78 | 
GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGCGATAGATCTCGTATGCCGTCTTCTGCTTG 79 | >gnl|uv|NGB00360.1|TruSeq_Illumina_PCR_Primer 80 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 81 | >gnl|uv|NGB00362.1|Illumina_Paired_End_PCR_Primer_2.0 82 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 83 | >gnl|uv|NGB00846.1|NEBNext_Adaptor 84 | GATCGGAAGAGCACACGTCTGAACTCCAGTCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 85 | >gnl|uv|NGB00847.1|NEBNext_1_Primer 86 | CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 87 | >gnl|uv|NGB00848.1|NEBNext_2_Primer 88 | CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 89 | >gnl|uv|NGB00849.1|NEBNext_3_Primer 90 | CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 91 | >gnl|uv|NGB00850.1|NEBNext_4_Primer 92 | CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 93 | >gnl|uv|NGB00851.1|NEBNext_5_Primer 94 | CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 95 | >gnl|uv|NGB00852.1|NEBNext_6_Primer 96 | CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 97 | >gnl|uv|NGB00853.1|NEBNext_7_Primer 98 | CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 99 | >gnl|uv|NGB00854.1|NEBNext_8_Primer 100 | CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 101 | >gnl|uv|NGB00855.1|NEBNext_9_Primer 102 | CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 103 | >gnl|uv|NGB00856.1|NEBNext_10_Primer 104 | CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 105 | >gnl|uv|NGB00857.1|NEBNext_11_Primer 106 | CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 107 | >gnl|uv|NGB00858.1|NEBNext_12_Primer 108 | CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 109 | >gnl|uv|NGB00859.1|NEBNext_13_Primer 110 | CAAGCAGAAGACGGCATACGAGATTGTTGACTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 111 | >gnl|uv|NGB00860.1|NEBNext_14_Primer 112 | CAAGCAGAAGACGGCATACGAGATACGGAACTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 113 | 
>gnl|uv|NGB00861.1|NEBNext_15_Primer 114 | CAAGCAGAAGACGGCATACGAGATTCTGACATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 115 | >gnl|uv|NGB00862.1|NEBNext_16_Primer 116 | CAAGCAGAAGACGGCATACGAGATCGGGACGGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 117 | >gnl|uv|NGB00863.1|NEBNext_18_Primer 118 | CAAGCAGAAGACGGCATACGAGATGTGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 119 | >gnl|uv|NGB00864.1|NEBNext_19_Primer 120 | CAAGCAGAAGACGGCATACGAGATCGTTTCACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 121 | >gnl|uv|NGB00865.1|NEBNext_20_Primer 122 | CAAGCAGAAGACGGCATACGAGATAAGGCCACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 123 | >gnl|uv|NGB00866.1|NEBNext_21_Primer 124 | CAAGCAGAAGACGGCATACGAGATTCCGAAACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 125 | >gnl|uv|NGB00867.1|NEBNext_22_Primer 126 | CAAGCAGAAGACGGCATACGAGATTACGTACGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 127 | >gnl|uv|NGB00868.1|NEBNext_23_Primer 128 | CAAGCAGAAGACGGCATACGAGATATCCACTCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 129 | >gnl|uv|NGB00869.1|NEBNext_25_Primer 130 | CAAGCAGAAGACGGCATACGAGATATATCAGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 131 | >gnl|uv|NGB00870.1|NEBNext_27_Primer 132 | CAAGCAGAAGACGGCATACGAGATAAAGGAATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT -------------------------------------------------------------------------------- /calc_N50_GC_genomesize.pl: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/perl

#Written by Tom de Man
#Calculates basic contig stats: G+C content, N50, total length, and number of contigs

use strict;
use warnings;
use List::Util qw(sum min max);
use Getopt::Long;
use File::Basename;

# Running totals of base composition across all sequences in the file.
my $As = 0;
my $Ts = 0;
my $Gs = 0;
my $Cs = 0;
my $Ns = 0;

my $file;
my $helpAsked;
my $outFile = "";

GetOptions(
    "i=s"            => \$file,
    "h|help"         => \$helpAsked,
    "o|outputFile=s" => \$outFile,
);

if (defined($helpAsked)) {
    Usage();
    exit;
}
if (!defined($file)) {
    Error("No input files are provided");
}

my ($fileName, $filePath) = fileparse($file);
# Default output: next to the input file, with a descriptive suffix.
$outFile = $file . "_n50_GC_genomesize_stats" if ($outFile eq "");

open(IN,  "<$file")    or die "Cannot open file: $file\n";
open(OUT, ">$outFile") or die "Cannot open file: $outFile\n";

my @len = ();
my $prevFastaSeqId = "";
my $fastaSeqId = "";
my $fastaSeq = "";

# Accumulate each sequence, flushing it into @len / base counters on the
# next header.  The stripped <IN> in the read loop is restored here.
while (<IN>) {
    chomp;
    if ($_ =~ /^>/) {
        $prevFastaSeqId = $fastaSeqId;
        $fastaSeqId = $_;
        if ($fastaSeq ne "") {
            push(@len, length $fastaSeq);
            baseCount($fastaSeq);
        }
        $fastaSeq = "";
    }
    else {
        $fastaSeq .= $_;
    }
}
close IN;

# Flush the final sequence of the file.
if ($fastaSeq ne "") {
    $prevFastaSeqId = $fastaSeqId;
    push(@len, length $fastaSeq);
    baseCount($fastaSeq);
}

# Without this guard an empty input dies on sum()/min() of an empty list
# and divides by zero below.
Error("No fasta sequences found in $file") unless @len;

my $totalContigs = scalar @len;
my $bases        = sum(@len);
my $minContigLen = min(@len);
my $maxContigLen = max(@len);
my $n50          = calcN50(\@len, 50);
my $GCcont       = ($Gs+$Cs)/$bases*100;

#MMB sheet order
print "$totalContigs\t$bases\n";

printf OUT "%-25s %d\n", "Number of reads/contigs", $totalContigs;
printf OUT "%-25s %d\n", "Total assembly length", $bases;
#GC (reuse the value computed above instead of recomputing it)
printf OUT "%-25s %0.2f %s\n", "(G + C)s", $GCcont, "%";
#N50
printf OUT "%-25s %d\n", "N50 length", $n50;


print "Contig Statistics file: $outFile\n";
close OUT;
exit;

# calcN50(\@lengths, $n): length-weighted percentile of contig lengths;
# the classic N50 when $n == 50.
sub calcN50 {
    my @x = @{$_[0]};
    my $n = $_[1];
    @x = sort { $b <=> $a } @x;
    my $total = sum(@x);
    my ($count, $n50) = (0, 0);
    for (my $j = 0; $j < @x; $j++) {
        $count += $x[$j];
        if ($count >= $total*$n/100) {
            $n50 = $x[$j];
            last;
        }
    }
    return $n50;
}

# Add one sequence's base composition to the global counters.
sub baseCount {
    my $seq = $_[0];
    # tr/// counts characters without modifying $seq; the original
    # "my $tAs += s/A/A/gi" accumulated onto undef and warned.
    my $tAs = ($seq =~ tr/Aa//);
    my $tTs = ($seq =~ tr/Tt//);
    my $tGs = ($seq =~ tr/Gg//);
    my $tCs = ($seq =~ tr/Cc//);
    $Ns += (length $seq) - $tAs - $tTs - $tGs - $tCs;
    $As += $tAs;
    $Ts += $tTs;
    $Gs += $tGs;
    $Cs += $tCs;
}

sub Usage {
    print "\n Usage: perl $0 \n\n";
    print "### Input reads/contigs (FASTA) (Required)\n";
    print " -i \n";
    print " Read/Sequence in fasta format\n";
    print " -o | -outputFile \n";
    print " default: By default, N50 statistics file will be stored where the input file is\n";
    print " -h | -help\n";
    print " Prints this help\n";
    print "\n";
}

sub Error {
    my $msg = $_[0];
    printf STDERR "|%-70s|\n", " Error!!";
    printf STDERR "|%-70s|\n", " $msg";

    Usage();
    exit;
}
--------------------------------------------------------------------------------
/contig_size_select.pl:
--------------------------------------------------------------------------------
#! /usr/bin/perl

#Copied from Umer Zeeshan Ijaz at University of Glasgow

use strict;
use Getopt::Long;

my ($low,$high)=(0,99999999);

GetOptions( "low=i"  => \$low,
            "high=i" => \$high,
            "help|?" 
=> sub {Usage()} 13 | ); 14 | sub Usage 15 | { 16 | print STDERR "perl $0 -low -high \n\n"; 17 | exit; 18 | } 19 | if (scalar(@ARGV)!=1) {print STDERR "Please give one input fasta file\n";&Usage;} 20 | 21 | my $seq; 22 | my $id; 23 | my $len; 24 | my @seq; 25 | open (IN,"$ARGV[0]") or die ":$!"; 26 | while() 27 | { 28 | chomp; 29 | if(/^>(.*)/) 30 | { 31 | if ($seq){ 32 | if ($seq=~/\d+/) 33 | { 34 | chop $seq; 35 | @seq = split /\s+/,$seq; 36 | $len=scalar(@seq); 37 | } 38 | else 39 | { 40 | $len=length ($seq); 41 | } 42 | if ($len>=$low and $len<=$high){ 43 | print ">$id\n$seq\n"; 44 | } 45 | } 46 | $id =$1; 47 | $seq =""; 48 | } 49 | else 50 | { 51 | if ($_ =~/\d+/) # for qual file 52 | { 53 | $seq .= $_." "; 54 | } 55 | else 56 | { 57 | $seq .= $_; 58 | } 59 | } 60 | } 61 | if ($seq){ 62 | if ($seq=~/\d+/) 63 | { 64 | chop $seq; 65 | @seq = split /\s+/,$seq; 66 | $len=scalar(@seq); 67 | } 68 | else 69 | { 70 | $len=length ($seq); 71 | } 72 | if ($len>=$low and $len<=$high){ 73 | print ">$id\n$seq\n"; 74 | } 75 | } 76 | 77 | close IN; -------------------------------------------------------------------------------- /cut_and_paste_seq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Written by Tom de Man 4 | # Cut DNA sequence at a user defined position, and paste it at the end of that same sequence 5 | # Useful when generating plasmid DNA sequence comparison plots with Easyfig or Geneious 6 | # Needs BioPerl 7 | 8 | use strict; 9 | use warnings; 10 | use Bio::SeqIO; 11 | use Getopt::Long; 12 | 13 | my $end_pos; 14 | my $fasta; 15 | my $strand; 16 | 17 | GetOptions( "cut=s" => \$end_pos, 18 | "seq=s" => \$fasta, 19 | "strand=s" => \$strand, 20 | "help|?" 
=> sub {Usage()} 21 | ); 22 | 23 | if (($end_pos) && ($fasta) && ($strand)) { 24 | &subseq($end_pos, $fasta, $strand); 25 | } else { 26 | &Usage; 27 | } 28 | 29 | sub subseq { 30 | my ($end, $input_seq, $direction) = @_; 31 | my $seqin = Bio::SeqIO->new(-file => "$input_seq", -format => "fasta"); 32 | while (my $seq = $seqin->next_seq) { 33 | my $acc = $seq->display_id; 34 | my $sequence = $seq->seq; 35 | 36 | print ">$acc\n"; 37 | if ($strand eq "forward") { 38 | my $first = substr($sequence, 0, $end); 39 | my $last = substr($sequence, $end); 40 | print "$last"."$first"."\n"; 41 | } elsif ($strand eq "reverse") { 42 | my $revcomp = reverse($sequence); 43 | $revcomp =~ tr/ABCDGHMNRSTUVWXYabcdghmnrstuvwxy/TVGHCDKNYSAABWXRtvghcdknysaabwxr/; 44 | my $len = length $revcomp; 45 | my $rev_pos = $len - $end; 46 | 47 | my $first = substr($revcomp, 0, $rev_pos); 48 | my $last = substr($revcomp, $rev_pos); 49 | my $out = "$last"."$first"; 50 | print "$out \n"; 51 | } 52 | } 53 | } 54 | 55 | sub Usage { 56 | print STDERR "\n Please provide input sequence file, cutting position, and strand direction\n\n"; 57 | print STDERR "\n Usage: perl $0 -cut -strand -seq \n\n"; 58 | exit; 59 | } -------------------------------------------------------------------------------- /estimate_core_genome_from_bam.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | # Written by Tom de Man 4 | #script needs sorted bam files 5 | #script also needs samtools, bedtools, and awk in order to operate 6 | 7 | use strict; 8 | use warnings; 9 | use Getopt::Long; 10 | use String::ShellQuote qw(shell_quote); 11 | use Array::Utils qw(:all); 12 | use Data::Dumper qw(Dumper); 13 | 14 | my $sortbam_path; 15 | my $genome; 16 | my $depth; 17 | 18 | my $genome_size; 19 | my $starttime = localtime; 20 | my $version = "1.0"; 21 | 22 | GetOptions( "bam=s" => \$sortbam_path, 23 | "genome=s" => \$genome, 24 | "depth=s" => \$depth, 25 | "help|?" 
=> sub {Usage()} 26 | ); 27 | 28 | if (($sortbam_path) && ($genome) && ($depth)) { 29 | my @bams = &get_files("bam"); 30 | print STDERR "Hi $ENV{USER}, you are now running $0 version: $version on $starttime \n\n"; 31 | print STDERR "BAM files included in core genome estimate: \n"; 32 | foreach (@bams) { 33 | print "$_\n"; 34 | } 35 | $genome_size = &genome_size_calc; 36 | &estimate_core(@bams); 37 | } else { 38 | &Usage; 39 | } 40 | 41 | sub genome_size_calc { 42 | print STDERR "\n"; 43 | print STDERR "your mapping reference is $genome \n"; 44 | open FASTA, "$genome" or die "cannot open $genome for reading \n"; 45 | my $total_bases = 0; 46 | while () { 47 | if (!(/^>/)) { 48 | chomp; 49 | s/\r//g; 50 | $total_bases += length; 51 | } 52 | } 53 | return $total_bases; 54 | close FASTA; 55 | } 56 | 57 | sub estimate_core { 58 | #create fasta index 59 | print STDERR "running samtools..... generating a contig list file\n"; 60 | system("samtools faidx $genome"); 61 | open INDEX, "$genome.fai" or die "cannot open $genome.fai for reading \n"; 62 | open (my $fh, '>', "$genome.contig"); 63 | while () { 64 | chomp; 65 | my @split = split ("\t", $_); 66 | print $fh "$split[0]\t$split[1]\n"; 67 | } 68 | close $fh; 69 | close INDEX; 70 | 71 | my $cnt = 0; 72 | #calculate genome wide coverage for each nucleotide position, for each sorted BAM file 73 | foreach my $file (@_) { 74 | $cnt += 1; 75 | my $bam = "$sortbam_path/$file"; 76 | my $con_len = "$genome.contig"; 77 | my $out = "$sortbam_path/$file.$depth.cov"; 78 | print STDERR "running bedtools for sample $cnt..... generating genome coverage data \n"; 79 | system("bedtools genomecov -ibam ".shell_quote($bam)." -g ".shell_quote($con_len)." 
-d | awk '{if(\$3>=$depth){ print \$0}}' > ".shell_quote($out).""); 80 | } 81 | 82 | #get the .cov files 83 | my @covs = &get_files("cov"); 84 | my @cov2d; 85 | foreach (@covs) { 86 | my @n; 87 | my $c = "$sortbam_path/$_"; 88 | open COV, $c or die "cannot open $c for reading \n"; 89 | while () { 90 | chomp; 91 | my @split = split ("\t", $_); 92 | my $contig_position = $split[0]."_".$split[1]; 93 | push @n, $contig_position; 94 | } 95 | close COV; 96 | push @cov2d, \@n; 97 | 98 | } 99 | my $rows = scalar @cov2d; 100 | print STDERR "You are going to estimate a core genome for $rows isolates ..... how exciting!!! \n"; 101 | 102 | my $start_ref = shift @cov2d; 103 | my @overlap = @$start_ref; 104 | my $remainder = $rows - 1; 105 | #pairwise comparison via intersect 106 | for (my $i=0; $i < $remainder; $i++) { 107 | my $comparison = shift @cov2d; 108 | my $size = @$comparison; 109 | @overlap = intersect(@$comparison, @overlap); 110 | } 111 | my $core = scalar @overlap; 112 | my $percentage = ($core/$genome_size)* 100; 113 | my $rounded = sprintf "%.2f", $percentage; 114 | print STDERR "Core genome size for $rows genomes is: $core base pairs, which equals $rounded% of the mapping reference genome\n"; 115 | 116 | } 117 | 118 | sub get_files { 119 | my $ext = qr/$_[0]/; 120 | my @bamfiles; 121 | opendir(DIR, $sortbam_path) or die "cannot open $sortbam_path \n"; 122 | my @files = readdir(DIR); 123 | close DIR; 124 | 125 | foreach my $file (@files){ 126 | next if (!($file =~ /\.$ext$/)); 127 | push @bamfiles, $file; 128 | } 129 | return @bamfiles; 130 | } 131 | 132 | sub Usage { 133 | print STDERR "\n Please provide input files!!!\n\n"; 134 | print STDERR "\n Usage: perl $0 -bam -genome -depth \n\n"; 135 | exit; 136 | } 137 | -------------------------------------------------------------------------------- /fastqMcf-bowtie2-SPAdes.bash: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir trimmed_reads 4 | mkdir 
trimmed_viral_free_reads 5 | mkdir assemblies 6 | 7 | perl run_fastqMcf.pl raw_reads/* 8 | 9 | mv raw_reads/*trimmed* trimmed_reads 10 | 11 | perl run_bowtie2_subtract_unmapped_reads.pl trimmed_reads/* 12 | 13 | mv trimmed_reads/*bacterial* trimmed_viral_free_reads 14 | 15 | perl run_SPAdes.pl trimmed_viral_free_reads/* 16 | 17 | for file in *_assembly/scaffolds.fasta; do new="$(echo "$file" | cut -d '_' -f 1)".scaffolds.fasta; cp "$file" "assemblies/$new"; done 18 | -------------------------------------------------------------------------------- /fix_evm_for_gbrowse.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #Written by Tom de Man 4 | 5 | use strict; 6 | my %count; 7 | 8 | while(<>) { 9 | next if /^\s*$/; 10 | chomp; 11 | my @row = split(/\t/,$_); 12 | 13 | if($row[2] eq 'CDS') { 14 | my %grp = map { split(/=/,$_) } split(/;/,pop @row); 15 | $grp{ID} .= ".cds".++$count{$grp{ID}}; 16 | push @row, join(";", map { sprintf("%s=%s",$_,$grp{$_})} qw(ID Parent)); 17 | } 18 | print join("\t", @row), "\n"; 19 | } -------------------------------------------------------------------------------- /gff3_2_gff3EVM.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | #create EVM suitable GFF3 files from MAKER de novo gene prediction GFF 5 | #Written by Tom de Man 6 | 7 | my @pre_name; 8 | my @last_col_split_name; 9 | my @pre_name_extra; 10 | my @pre_name_plus; 11 | 12 | my $exon_count = 0; 13 | 14 | open(my $gene => ">gene_predictions.EVM.gff3")|| die $!; 15 | 16 | while(<>) { 17 | chomp; 18 | my @row = split(/\t/,$_); 19 | my $last_col = pop(@row); 20 | my @last_col_split = split(";",$last_col); 21 | if ($row[1] eq "genemark" && $row[2] eq "gene") { 22 | @pre_name = split("-", $last_col_split[1]); 23 | @pre_name_plus = split("=", $last_col_split[1]); 24 | print $gene join ("\t", @row), "\t$last_col_split[0];$pre_name[0] model 
$pre_name_plus[1]\n"; 25 | }elsif ($row[1] eq "genemark" && $row[2] eq "mRNA") { 26 | @last_col_split_name = split("=",$last_col_split[0]); 27 | print $gene join ("\t", @row), "\tID=$pre_name_plus[1];Parent=$last_col_split_name[1]\n"; 28 | } 29 | elsif ($row[1] eq "genemark" && $row[2] eq "CDS") { 30 | $exon_count+=1; 31 | print $gene "$row[0]\t$row[1]\texon\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\tID=e$exon_count;Parent=$pre_name_plus[1]\n"; 32 | print $gene join ("\t", @row), "\tID=cds_of_$pre_name_plus[1];Parent=$pre_name_plus[1]\n"; 33 | }elsif ($row[1] eq "augustus" && $row[2] eq "gene") { 34 | @pre_name = split("_", $last_col_split[1]); 35 | print $gene join ("\t", @row), "\t$last_col_split[0];$pre_name[0] model $pre_name[1]\n"; 36 | }elsif ($row[1] eq "augustus" && $row[2] eq "mRNA") { 37 | @last_col_split_name = split("=",$last_col_split[0]); 38 | print $gene join ("\t", @row), "\tID=$pre_name[1];Parent=$last_col_split_name[1]\n"; 39 | }elsif ($row[1] eq "augustus" && $row[2] eq "CDS") { 40 | $exon_count+=1; 41 | print $gene "$row[0]\t$row[1]\texon\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\tID=e$exon_count;Parent=$pre_name[1]\n"; 42 | print $gene join ("\t", @row), "\tID=cds_of_$pre_name[1];Parent=$pre_name[1]\n"; 43 | }elsif ($row[1] eq "snap" && $row[2] eq "gene") { 44 | @pre_name = split ("_", $last_col_split[1]); 45 | print $gene join ("\t", @row), "\t$last_col_split[0];$pre_name[0] model $pre_name[1]\n"; 46 | }elsif ($row[1] eq "snap" && $row[2] eq "mRNA") { 47 | @last_col_split_name = split("=",$last_col_split[0]); 48 | print $gene join ("\t", @row), "\tID=$pre_name[1];Parent=$last_col_split_name[1]\n"; 49 | }elsif ($row[1] eq "snap" && $row[2] eq "CDS") { 50 | $exon_count+=1; 51 | print $gene "$row[0]\t$row[1]\texon\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\tID=e$exon_count;Parent=$pre_name[1]\n"; 52 | print $gene join ("\t", @row), "\tID=cds_of_$pre_name[1];Parent=$pre_name[1]\n"; 53 | } 54 | } 
-------------------------------------------------------------------------------- /run_IonT_SPAdes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #runs SPAdes assembler for single end IonTorrent reads in batch 4 | #Written by Tom de Man 5 | 6 | use warnings; 7 | use strict; 8 | use File::Basename; 9 | 10 | my @files=@ARGV; 11 | 12 | foreach my $file (@files){ 13 | my ($file_name,$dir)=fileparse($file); 14 | my @base = split (/\./, $file_name); 15 | my $name = $base[0]; 16 | 17 | my $cmd="spades.py -t 12 --iontorrent -k 21,33,55,77,99,127 --careful -s $file -o $name"."_assembly"; 18 | print $cmd,"\n"; 19 | die if system($cmd); 20 | } 21 | -------------------------------------------------------------------------------- /run_SPAdes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #runs SPAdes assembler in batch 4 | #Written by Tom de Man 5 | 6 | use warnings; 7 | use strict; 8 | use File::Basename; 9 | 10 | my @files=@ARGV; 11 | 12 | my %paired_files; 13 | foreach my $file (@files){ 14 | my ($file_name,$dir)=fileparse($file); 15 | if($file_name =~ /(.+)_R([1|2])_/){ 16 | $paired_files{$1}[$2-1]=$file; 17 | #attempt different naming scheme 18 | }elsif($file_name =~ /(.+)_([1|2])/){ 19 | $paired_files{$1}[$2-1]=$file; 20 | }else{ 21 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 22 | } 23 | } 24 | 25 | foreach my $name (sort keys %paired_files){ 26 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 27 | warn "Couldn't find matching paired end files for file starting with: $name"; 28 | next; 29 | } 30 | print "assembling your data....\n"; 31 | print "----------------------\n"; 32 | print "$paired_files{$name}[0]"." 
<--> "."$paired_files{$name}[1]"."\n"; 33 | 34 | my $cmd="spades.py -t 12 --careful --only-assembler -1 $paired_files{$name}[0] -2 $paired_files{$name}[1] -o $name"."_assembly"; 35 | print $cmd,"\n"; 36 | die if system($cmd); 37 | } 38 | -------------------------------------------------------------------------------- /run_bowtie2_and_pilon.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #needs bowtie2, samtools and pilon in path 4 | #runs bowtie2 in batch 5 | #converts SAM to sorted BAM 6 | #indexing the sorted BAM 7 | #running pilon 8 | #Written by Tom de Man 9 | 10 | use warnings; 11 | use strict; 12 | use File::Basename; 13 | 14 | my $DB = shift; 15 | my @files=@ARGV; 16 | my $path; 17 | my $DB_index; 18 | 19 | my %paired_files; 20 | foreach my $file (@files){ 21 | my ($file_name,$dir)=fileparse($file); 22 | $path = $dir; 23 | if($file_name =~ /(.+)_R([1|2])_/){ 24 | $paired_files{$1}[$2-1]=$file; 25 | #attempt different naming scheme 26 | }elsif($file_name =~ /(.+)_([1|2])/){ 27 | $paired_files{$1}[$2-1]=$file; 28 | }else{ 29 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 30 | } 31 | } 32 | 33 | foreach my $name (sort keys %paired_files){ 34 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 35 | warn "Couldn't find matching paired end files for file starting with: $name"; 36 | next; 37 | } 38 | print "mapping to $DB - converting - running pilon\n"; 39 | print "----------------------\n"; 40 | print "$paired_files{$name}[0]"." 
<--> "."$paired_files{$name}[1]"."\n"; 41 | 42 | system("bowtie2-build $DB $DB.index"); 43 | system("bowtie2 -x $DB.index -1 $paired_files{$name}[0] -2 $paired_files{$name}[1] -S $path$name.sam --end-to-end -p 12"); 44 | system("samtools view -bS $path$name.sam | samtools sort - $path$name.sort"); 45 | system("samtools index $path$name.sort.bam $path$name.sort.bai"); 46 | system("pilon --genome $DB --frags $path$name.sort.bam --mindepth 0.6 --minqual 30 --minmq 30 --fix all --changes --vcf --output $path$name.$DB.pilon"); 47 | } 48 | -------------------------------------------------------------------------------- /run_bowtie2_subtract_mapped_reads_with_picard.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #needs bowtie2, samtools, and bedtools in path 4 | #runs bowtie2 in batch 5 | #converts SAM to sorted BAM 6 | #extracts mapped reads from BAM file and generates two FASTQ files per sample 7 | #Written by Tom de Man 8 | 9 | use warnings; 10 | use strict; 11 | use File::Basename; 12 | 13 | my @files=@ARGV; 14 | 15 | #bowtie2 index 16 | my $DB = "/path/to/mapping-reference-file"; 17 | my $SamToFastq = "/path/to/picard-tools-1.96/SamToFastq.jar"; 18 | my $path; 19 | 20 | my %paired_files; 21 | foreach my $file (@files){ 22 | my ($file_name,$dir)=fileparse($file); 23 | $path = $dir; 24 | if($file_name =~ /(.+)_R([1|2])_/){ 25 | $paired_files{$1}[$2-1]=$file; 26 | #attempt different naming scheme 27 | }elsif($file_name =~ /(.+)_([1|2])/){ 28 | $paired_files{$1}[$2-1]=$file; 29 | }else{ 30 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 31 | } 32 | } 33 | 34 | foreach my $name (sort keys %paired_files){ 35 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 36 | warn "Couldn't find matching paired end files for file starting with: $name"; 37 | next; 38 | } 39 | print "mapping - converting - subtracting your mapped read data....\n"; 40 | print 
"----------------------\n"; 41 | print "$paired_files{$name}[0]"." <--> "."$paired_files{$name}[1]"."\n"; 42 | 43 | my $re1 = $paired_files{$name}[0]; 44 | my $re2 = $paired_files{$name}[1]; 45 | my $fq1 = "$path$name"."_R1_001.fastq"; 46 | my $fq2 = "$path$name"."_R2_001.fastq"; 47 | 48 | system("bowtie2 -x $DB -1 $re1 -2 $re2 -p 12 --end-to-end -D 10 -R 2 -N 0 -L 30 -i S,0,2.50 | samtools view -bS - > $path$name.bam"); 49 | system("samtools view -F 4 $path$name.bam -o $path$name.mappedReads.bam"); 50 | system("samtools sort -n $path$name.mappedReads.bam -o $path$name.mappedReadsSorted.bam"); 51 | system("java -Xmx4g -jar $SamToFastq I=$path$name.mappedReadsSorted.bam F=$fq1 F2=$fq2"); 52 | } 53 | -------------------------------------------------------------------------------- /run_bowtie2_subtract_unmapped_reads.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #needs bowtie2, samtools and bam2fastq in path 4 | #runs bowtie2 in batch 5 | #converts SAM to BAM 6 | #extracts unmapped reads from BAM file and generates FASTQ files 7 | #Written by Tom de Man 8 | 9 | use warnings; 10 | use strict; 11 | use File::Basename; 12 | 13 | my @files=@ARGV; 14 | #for now hard coded, will change later 15 | #refSeq-Viral is the bowtie2 index 16 | my $DB = "/path/to/refSeq-viral"; 17 | my $path; 18 | 19 | my %paired_files; 20 | foreach my $file (@files){ 21 | my ($file_name,$dir)=fileparse($file); 22 | $path = $dir; 23 | if($file_name =~ /(.+)_R([1|2])_/){ 24 | $paired_files{$1}[$2-1]=$file; 25 | #attempt different naming scheme 26 | }elsif($file_name =~ /(.+)_([1|2])/){ 27 | $paired_files{$1}[$2-1]=$file; 28 | }else{ 29 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 30 | } 31 | } 32 | 33 | foreach my $name (sort keys %paired_files){ 34 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 35 | warn "Couldn't find matching paired end files for file starting with: $name"; 
36 | next; 37 | } 38 | print "mapping - converting - subtracting your data....\n"; 39 | print "----------------------\n"; 40 | print "$paired_files{$name}[0]"." <--> "."$paired_files{$name}[1]"."\n"; 41 | 42 | system("bowtie2 -x $DB -1 $paired_files{$name}[0] -2 $paired_files{$name}[1] -S $path$name.sam -p 16 --end-to-end"); 43 | system("samtools view -bS $path$name.sam > $path$name.bam"); 44 | system("bam2fastq --no-aligned -o $path$name"."_R\#_bacterial.fastq $path$name.bam"); 45 | } 46 | 47 | system("ls $path*_R_1_bacterial.fastq | sed -e 'p;s/_R_1_/_R1_/' | xargs -n2 mv"); 48 | system("ls $path*_R_2_bacterial.fastq | sed -e 'p;s/_R_2_/_R2_/' | xargs -n2 mv"); 49 | -------------------------------------------------------------------------------- /run_fastqMcf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #runs fastq-mcf in batch 4 | #Written by Tom de Man 5 | 6 | use warnings; 7 | use strict; 8 | use File::Basename; 9 | 10 | my $path; 11 | 12 | #hard coded 13 | my $adaptors = "/path/to/adapters.fasta"; 14 | my @files=@ARGV; 15 | 16 | my %paired_files; 17 | foreach my $file (@files){ 18 | my ($file_name,$dir)=fileparse($file); 19 | $path = $dir; 20 | if($file_name =~ /(.+)_R([1|2])_/){ 21 | $paired_files{$1}[$2-1]=$file; 22 | #attempt different naming scheme 23 | }elsif($file_name =~ /(.+)_([1|2])/){ 24 | $paired_files{$1}[$2-1]=$file; 25 | }else{ 26 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 27 | } 28 | } 29 | 30 | foreach my $name (sort keys %paired_files){ 31 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 32 | warn "Couldn't find matching paired end files for file starting with: $name"; 33 | next; 34 | } 35 | print "trimming your data....\n"; 36 | print "----------------------\n"; 37 | print "$paired_files{$name}[0]"." 
<--> "."$paired_files{$name}[1]"."\n"; 38 | 39 | my $cmd="fastq-mcf $adaptors $paired_files{$name}[0] $paired_files{$name}[1]" . " -o $path$name" . "_R1_trimmed.fastq " . " -o $path$name" . "_R2_trimmed.fastq -C 1000000 -q 30 -p 10 -u -x 0.01"; 40 | print $cmd,"\n"; 41 | die if system($cmd); 42 | } 43 | -------------------------------------------------------------------------------- /run_kSNP.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | #convert underscores in FASTA headers to spaces 3 | #merge all the contigs per sample into one long sequence 4 | #run kSNP 5 | 6 | #Written by Tom de Man 7 | 8 | #kSNP, Jellyfish, FastTree, MUMmer and Parsimonator all need to be in the PATH variable before launching this script 9 | use strict; 10 | 11 | #path containing the input files 12 | my $file_path = shift; 13 | #project name 14 | my $project = shift; 15 | my @fastas = &getFile; 16 | 17 | foreach my $fasta (@fastas) { 18 | open FILE, "$fasta"; 19 | my @lines = <FILE>; 20 | close FILE; 21 | 22 | open STDOUT, ">$fasta"; 23 | for (@lines) { 24 | $_ =~ s/_/ /g; 25 | print; 26 | } 27 | close STDOUT; 28 | } 29 | 30 | my @new_fastas = &getFile; 31 | foreach my $file (@new_fastas) { 32 | system("merge_fasta_contigs.pl $file > $file.merged.fasta"); 33 | } 34 | 35 | #combine all the fasta files that come from merge_fasta_contigs 36 | system("cat *.fasta > $project.fasta"); 37 | #run kSNP using the merged fasta file 38 | `kSNP -f $project.fasta -k 23 -d $project -p $project.finished`; 39 | 40 | sub getFile { 41 | my @file_docs; 42 | opendir(DIR, $file_path) or die "Cannot open $file_path \n"; 43 | my @file_files = readdir(DIR); 44 | close DIR; 45 | 46 | foreach my $file (@file_files) { 47 | next if (!($file =~ /\.fa$/)); 48 | push @file_docs, $file; 49 | } 50 | return @file_docs; 51 | } 52 | -------------------------------------------------------------------------------- /run_kneaddata_only_human_removal.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #runs KneadData in batch. Only for human DNA removal, no quality trimming 4 | #Written by Tom de Man 5 | 6 | use warnings; 7 | use strict; 8 | use File::Basename; 9 | 10 | my @files=@ARGV; 11 | my $database = "/path/to/database/Homo_sapiens_Bowtie2_v0.1/"; 12 | 13 | my %paired_files; 14 | foreach my $file (@files){ 15 | my ($file_name,$dir)=fileparse($file); 16 | if($file_name =~ /(.+)_R([1|2])_/){ 17 | $paired_files{$1}[$2-1]=$file; 18 | #attempt different naming scheme 19 | }elsif($file_name =~ /(.+)_([1|2])/){ 20 | $paired_files{$1}[$2-1]=$file; 21 | }else{ 22 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 23 | } 24 | } 25 | 26 | foreach my $name (sort keys %paired_files){ 27 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 28 | warn "Couldn't find matching paired end files for file starting with: $name"; 29 | next; 30 | } 31 | print "processing your data....\n"; 32 | print "----------------------\n"; 33 | print "$paired_files{$name}[0]"." <--> "."$paired_files{$name}[1]"."\n"; 34 | 35 | my $cmd="kneaddata -i $paired_files{$name}[0] -i $paired_files{$name}[1] -o kneaddata_nohuman -db $database --bypass-trim --bowtie2-options \"--very-sensitive --dovetail\" --remove-intermediate-output"; 36 | print $cmd,"\n"; 37 | die if system($cmd); 38 | } 39 | --------------------------------------------------------------------------------