├── BaseCount_sequenceCount_from_sff_file.pl ├── Calc_coverage_from_spades_assembly.pl ├── IonTorrent_SE_run.bash ├── Kraken_krona_fastq.bash ├── README.md ├── adapters.fasta ├── calc_N50_GC_genomesize.pl ├── contig_size_select.pl ├── cut_and_paste_seq.pl ├── estimate_core_genome_from_bam.pl ├── fastqMcf-bowtie2-SPAdes.bash ├── fix_evm_for_gbrowse.pl ├── gff3_2_gff3EVM.pl ├── run_IonT_SPAdes.pl ├── run_SPAdes.pl ├── run_bowtie2_and_pilon.pl ├── run_bowtie2_subtract_mapped_reads_with_picard.pl ├── run_bowtie2_subtract_unmapped_reads.pl ├── run_fastqMcf.pl ├── run_kSNP.pl └── run_kneaddata_only_human_removal.pl /BaseCount_sequenceCount_from_sff_file.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Written by Tom de Man 4 | 5 | use strict; 6 | 7 | my $usage = qq( 8 | USAGE: perl BaseCount_sequenceCount_from_sff_file.pl /path/to/sff/files 9 | ************************************************* 10 | Arguments is the path to all sff files of a particular library 11 | \n); 12 | 13 | die($usage) if (@ARGV == 0); 14 | 15 | my $sff_path = $ARGV[0]; 16 | 17 | my @sffs = &getSff; 18 | my ($base_amount2, $contig_amount2) = &main(@sffs); #put two references into variables 19 | 20 | 21 | my $count = scalar @$base_amount2; 22 | print "bases: @$base_amount2 \n"; #print dereferenced array 23 | print "reads: @$contig_amount2 \n";#print dereferenced array 24 | 25 | my $total_amount_bp = 0; 26 | my $total_amount_contig = 0; 27 | for (my $i = 0; $i < $count; $i++) { 28 | my $element = pop(@$base_amount2); 29 | my $element2 = pop(@$contig_amount2); 30 | $total_amount_bp = $element + $total_amount_bp; 31 | $total_amount_contig = $element2 + $total_amount_contig; 32 | } 33 | print "The total amount of bases is: $total_amount_bp \n"; 34 | print "The total amount of reads is: $total_amount_contig \n"; 35 | 36 | sub main { 37 | my @base_amount; 38 | my @contig_amount; 39 | foreach my $line (@_) { 40 | system("sffinfo -s 
$sff_path$line > $line.fasta");
        my $bases   = &count_base("$line");
        my $contigs = &count_read("$line");
        chomp($contigs);
        system("rm $line.fasta");
        unshift(@base_amount, $bases);
        unshift(@contig_amount, $contigs);
    }
    # Return references to the two parallel result arrays.
    return (\@base_amount, \@contig_amount);
}

# Collect the names of all *.sff files found in $sff_path.
sub getSff {
    my @sff_docs;
    opendir(DIR, $sff_path) or die "Cannot open $sff_path \n";
    my @sff_files = readdir(DIR);
    closedir DIR;

    foreach my $sff_file (@sff_files) {
        next if ($sff_file !~ /\.sff$/);
        push @sff_docs, $sff_file;
    }
    return @sff_docs;
}

# Count the total number of bases in <name>.fasta (length of every non-header line).
sub count_base {
    my ($name) = @_;    # was @_[0]: a one-element slice, not a scalar
    # 3-arg open + error check; the stripped <FASTA> read loop is restored.
    open FASTA, "<", "$name.fasta" or die "Cannot open $name.fasta \n";
    my $total_bases = 0;
    while (<FASTA>) {
        if (!(/^>/)) {
            chomp;
            s/\r//g;
            $total_bases += length;
        }
    }
    # close() must run before return; the original returned first, leaking the handle.
    close(FASTA);
    return $total_bases;
}

# Count the number of reads (fasta header lines) in <name>.fasta.
sub count_read {
    my ($name) = @_;
    my $counter = 0;
    open FASTA, "<", "$name.fasta" or die "Cannot open $name.fasta \n";
    while (<FASTA>) {
        # tr/>// counts '>' on the line without a regex; headers have exactly one.
        $counter++ if tr/>//;
    }
    close(FASTA);
    return $counter;
}
--------------------------------------------------------------------------------
/Calc_coverage_from_spades_assembly.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Written by Tom de Man
# Calculate the average k-mer coverage over all contigs of a SPAdes assembly.
# SPAdes embeds the coverage in each fasta header
# (>NODE_1_length_500_cov_12.3), so field index 5 after splitting on "_".

use strict;

my $fasta = shift;
my $contig_count = 0;
my $sum_coverage_contigs = 0;

# NOTE: the original "open FA, \"$fasta\" || die ..." never died on failure
# because || binds to the filename; use low-precedence "or" and 3-arg open.
open FA, "<", $fasta or die "cannot open $fasta for reading\n";

while (<FA>) {
    chomp;
    if (/^>/) {
        $contig_count += 1;
        my @split_header = split("_", $_);
        $sum_coverage_contigs += $split_header[5];
    }
    # Sequence lines are ignored; the original printed them to a filehandle
    # (OUT) that was never opened, which silently failed.
}
close FA;

# Guard the division: an empty/headerless file would otherwise divide by zero.
die "no SPAdes contig headers found in $fasta\n" unless $contig_count;
my $coverage = $sum_coverage_contigs / $contig_count;

print "Amount of contigs: $contig_count\n";
print "--------------------------------\n";
print "coverage on average: $coverage\n";
--------------------------------------------------------------------------------
/IonTorrent_SE_run.bash:
--------------------------------------------------------------------------------
#!/bin/sh

mkdir assemblies
mkdir assemblies_500
mkdir BUSCO_stats

perl run_IonT_SPAdes.pl raw_reads/*

#move assemblies to different folder
for file in *_assembly/scaffolds.fasta; do new="$(echo "$file" | cut -d '_' -f 2)".scaffolds.fasta; cp "$file" "assemblies/$new"; done

#filter contigs on length
for i in assemblies/*.fasta; do perl contig_size_select.pl -low 500 $i > $i.500.fna; done
mv assemblies/*.fna assemblies_500

#Check if genomes are complete gene content wise. 
Make sure to add the right lineage db to -l 17 | for i in assemblies_500/*.fna 18 | 19 | do 20 | isolate="$(echo "$i" | cut -d '/' -f 2)" 21 | BUSCO.py -i $i -o BUSCO_$isolate -l /path/to/BUSCO/lineage_data/species_odb9 -m geno 22 | done 23 | 24 | #take all summaries 25 | for file in run_BUSCO_*/*.txt; do cp $file BUSCO_stats; done 26 | BUSCO_plot.py -wd BUSCO_stats 27 | -------------------------------------------------------------------------------- /Kraken_krona_fastq.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kraken_db="/path/to/custom/Kraken-RefSeq/database/" 4 | 5 | for file in *_R1_001.fastq 6 | 7 | do 8 | b=$(basename $file _R1_001.fastq) 9 | rev=${b}_R2_001.fastq 10 | 11 | echo "[:] Analyzing $file and $rev" 12 | 13 | kraken --version 14 | 15 | echo "[:] Running kraken. Output: $file.kraken / $file.classified" 16 | 17 | kraken --fastq-input --paired --db $kraken_db --preload --threads 12 --output $file.kraken --classified-out $file.classified $file $rev 18 | 19 | echo "[:] Generating metaphlan compatible report." 20 | 21 | kraken-mpa-report --db $kraken_db $file.kraken >> $file.mpa 22 | 23 | echo "[:] Generating krona output for $file." 24 | 25 | python metaphlan2krona.py -p $file.mpa -k $file.krona 26 | 27 | done 28 | ktImportText -o all_output.html *.krona -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Sequence-scripts 2 | ================ 3 | 4 | ## Extract metagenomic reads from a particular species or strain using [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [samtools](https://github.com/samtools/), and [Picard](https://broadinstitute.github.io/picard/). 
5 | 6 | ### Usage 7 | perl run_bowtie2_subtract_mapped_reads_with_picard directory/containing/metagenomic/samples/only/* 8 | ------------------------------------------------------------------------------------------------------ 9 | 10 | ## Remove human DNA from metagenomic samples to comply with HIPAA regulations (useful when uploading raw data to SRA/ENA) 11 | 12 | ### Usage 13 | perl run_kneaddata_only_human_removal.pl directory/containing/metagenomic/samples/only/* 14 | -------------------------------------------------------------------------------------------- 15 | 16 | ## Cut DNA sequence at a user defined position (e.g. position 63789) and move the sequence that ranges from start to 63789 to the end of the sequence. Useful when generating plasmid sequence comparison plots with Easyfig or Geneious. As a convention, folks usually cut right before the plasmid replicon gene. 17 | 18 | ### Usage 19 | perl cut_and_paste_seq.pl -cut 63789 -strand -seq Plasmid_DNA.fasta > Plasmid_DNA_new_sequence_order.fasta 20 | ----------------------------------------------------------------------------------------------------------------------------------- 21 | 22 | ## Determine the core genome size of a given dataset. The script is assessing sorted BAM files and the mapping reference in order to estimate the core genome size for a certain depth of coverage. The script needs [samtools](https://github.com/samtools/), [bedtools](http://bedtools.readthedocs.io/en/latest/), and awk. 
23 | 24 | ### Usage 25 | perl estimate_core_genome_from_bam.pl -bam /path/to/bam/files -genome mapping/reference/fasta/file -depth 10 26 | ---------------------------------------------------------------------------------------------------------------- 27 | 28 | ## Calculate simple genome assembly stats including N50, number of contigs, total bases, and G+C content 29 | 30 | ### Usage 31 | perl calc_N50_GC_genomesize.pl -i genomeAssembly.fasta -o output.stats 32 | -------------------------------------------------------------------------- 33 | 34 | ## Screen raw reads for contamination and get an impression of the bacterial composition of your sample(s). Script is using [Kraken](https://ccb.jhu.edu/software/kraken/) for determining species composition, [KronaTools](https://github.com/marbl/Krona/wiki/KronaTools) for generating multi-layered pie charts, and conversion script [metaphlan2krona.py](https://bitbucket.org/nsegata/metaphlan/src/2f1b17a1f4e9775fe1ce42c8481279a5e69f291f/conversion_scripts/metaphlan2krona.py?at=default) 35 | 36 | ### Usage 37 | bash Kraken_krona_fastq.bash 38 | -------------------------------- 39 | 40 | ## Trimming raw reads and remove sequencing adapters using [fastq-mcf](http://ea-utils.googlecode.com/svn/wiki/FastqMcf.wiki) 41 | 42 | ### Usage 43 | perl run_fastqMcf.pl directory/containing/raw/reads/only/* 44 | -------------------------------------------------------------- 45 | 46 | ## Map trimmed reads to contaminant (e.g. 
PhiX) database and subtract unmapped reads for downstream analysis using [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [SAMtools](http://samtools.sourceforge.net) and [bam2fastq](https://gsl.hudsonalpha.org/information/software/bam2fastq) 47 | 48 | ### Usage 49 | perl run_bowtie2_subtract_unmapped_reads.pl directory/containing/trimmed/reads/only/* 50 | ----------------------------------------------------------------------------------------- 51 | 52 | ## Assembling the trimmed and contaminant free reads using [SPAdes](http://spades.bioinf.spbau.ru) 53 | 54 | ### Usage 55 | perl run_SPAdes.pl directory/containing/trimmed/and/virus/free/reads/only/* 56 | ------------------------------------------------------------------------------- 57 | 58 | ## Perform the previous three steps using one Shell script. It runs fastq-MCF, Bowtie2, SAMtools, bam2fastq and SPAdes assembler in batch 59 | 60 | ### Usage 61 | bash fastqMcf-bowtie2-SPAdes.bash 62 | -------------------------------------- 63 | 64 | ## Calculating average K-mer coverage of SPAdes assembly, from your highest K value (usually k=127) 65 | 66 | ### Usage 67 | perl Calc_coverage_from_spades_assembly.pl 68 | ---------------------------------------------------------------- 69 | 70 | ## Correcting PacBio data with Illumina reads by means of [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) and [Pilon](https://github.com/broadinstitute/pilon/wiki) 71 | 72 | ### Usage 73 | perl run_bowtie2_and_pilon.pl path/to/trimmed/Illumina/reads/* 74 | ----------------------------------------------------------------------------------------- 75 | 76 | ## Running [kSNP](https://sourceforge.net/projects/ksnp/files/) version 2 using assembled microbial genomes (in fasta format) 77 | 78 | ### Usage 79 | perl run_kSNP.pl full/path/containing/the/input/files projectname 80 | --------------------------------------------------------------------- 81 | 82 | IonTorrent scripts 83 | ================== 84 | 85 | 
## Assemble Single-End (SE) IonTorrent reads with [SPAdes](http://spades.bioinf.spbau.ru) 86 | 87 | ### Usage 88 | perl run_IonT_SPAdes.pl directory/containing/trimmed/SE-reads/only/* 89 | -------------------------------------------------------------------------- 90 | 91 | ## Bash workflow script for trimming SE IonTorrent reads, assembling trimmed reads, and quality check contigs using [BUSCO 2.0](http://busco.ezlab.org) 92 | 93 | ### Usage 94 | bash IonTorrent_SE_run.bash 95 | ------------------------------- 96 | 97 | 454 scripts 98 | =========== 99 | 100 | ## Quickly assess binary 454 Standard Flowgram Format (SFF) files from a 454 sequencing run. This simple script counts amount of reads and bases. Script needs SFFinfo 101 | 102 | ### Usage 103 | perl BaseCount_sequenceCount_from_sff_file.pl /directory/to/sff/files 104 | -------------------------------------- 105 | 106 | Eukaryotic part 107 | ================ 108 | 109 | ## Generate [EVM](https://evidencemodeler.github.io) suitable GFF3 files from [MAKER](http://www.yandell-lab.org/software/maker.html) de novo gene prediction GFF 110 | 111 | ### Usage 112 | perl gff3_2_gff3EVM.pl 113 | -------------------------------------- 114 | 115 | ## Make EVM data compatible with [Gbrowse](http://gbrowse.org/index.html) 116 | 117 | ### Usage 118 | perl fix_evm_for_gbrowse.pl < inputfile.gff3 119 | -------------------------------------- 120 | -------------------------------------------------------------------------------- /adapters.fasta: -------------------------------------------------------------------------------- 1 | >gnl|uv|NGB00360.1|Illumina_Paired_End_PCR_Primer_1.0 2 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >gnl|uv|NGB00362.1|Illumina_Paired_End_PCR_Primer_2.0 4 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 5 | >gnl|uv|NGB00727.1|Nextera_PCR_primer_i5_N501 6 | AATGATACGGCGACCACCGAGATCTACACTAGATCGCTCGTCGGCAGCGTC 7 | >gnl|uv|NGB00728.1|Nextera_PCR_primer_i5_N502 8 | 
AATGATACGGCGACCACCGAGATCTACACCTCTCTATTCGTCGGCAGCGTC 9 | >gnl|uv|NGB00729.1|Nextera_PCR_primer_i5_N503 10 | AATGATACGGCGACCACCGAGATCTACACTATCCTCTTCGTCGGCAGCGTC 11 | >gnl|uv|NGB00730.1|Nextera_PCR_primer_i5_N504 12 | AATGATACGGCGACCACCGAGATCTACACAGAGTAGATCGTCGGCAGCGTC 13 | >gnl|uv|NGB00731.1|Nextera_PCR_primer_i5_N505 14 | AATGATACGGCGACCACCGAGATCTACACGTAAGGAGTCGTCGGCAGCGTC 15 | >gnl|uv|NGB00732.1|Nextera_PCR_primer_i5_N506 16 | AATGATACGGCGACCACCGAGATCTACACACTGCATATCGTCGGCAGCGTC 17 | >gnl|uv|NGB00733.1|Nextera_PCR_primer_i5_N507 18 | AATGATACGGCGACCACCGAGATCTACACAAGGAGTATCGTCGGCAGCGTC 19 | >gnl|uv|NGB00734.1|Nextera_PCR_primer_i5_N508 20 | AATGATACGGCGACCACCGAGATCTACACCTAAGCCTTCGTCGGCAGCGTC 21 | >gnl|uv|NGB00735.1|Nextera_PCR_primer_i7_N701 22 | CAAGCAGAAGACGGCATACGAGATTCGCCTTAGTCTCGTGGGCTCGG 23 | >gnl|uv|NGB00736.1|Nextera_PCR_primer_i7_N702 24 | CAAGCAGAAGACGGCATACGAGATCTAGTACGGTCTCGTGGGCTCGG 25 | >gnl|uv|NGB00737.1|Nextera_PCR_primer_i7_N703 26 | CAAGCAGAAGACGGCATACGAGATTTCTGCCTGTCTCGTGGGCTCGG 27 | >gnl|uv|NGB00738.1|Nextera_PCR_primer_i7_N704 28 | CAAGCAGAAGACGGCATACGAGATGCTCAGGAGTCTCGTGGGCTCGG 29 | >gnl|uv|NGB00739.1|Nextera_PCR_primer_i7_N705 30 | CAAGCAGAAGACGGCATACGAGATAGGAGTCCGTCTCGTGGGCTCGG 31 | >gnl|uv|NGB00740.1|Nextera_PCR_primer_i7_N706 32 | CAAGCAGAAGACGGCATACGAGATCATGCCTAGTCTCGTGGGCTCGG 33 | >gnl|uv|NGB00741.1|Nextera_PCR_primer_i7_N707 34 | CAAGCAGAAGACGGCATACGAGATGTAGAGAGGTCTCGTGGGCTCGG 35 | >gnl|uv|NGB00742.1|Nextera_PCR_primer_i7_N708 36 | CAAGCAGAAGACGGCATACGAGATCCTCTCTGGTCTCGTGGGCTCGG 37 | >gnl|uv|NGB00743.1|Nextera_PCR_primer_i7_N709 38 | CAAGCAGAAGACGGCATACGAGATAGCGTAGCGTCTCGTGGGCTCGG 39 | >gnl|uv|NGB00744.1|Nextera_PCR_primer_i7_N710 40 | CAAGCAGAAGACGGCATACGAGATCAGCCTCGGTCTCGTGGGCTCGG 41 | >gnl|uv|NGB00745.1|Nextera_PCR_primer_i7_N711 42 | CAAGCAGAAGACGGCATACGAGATTGCCTCTTGTCTCGTGGGCTCGG 43 | >gnl|uv|NGB00746.1|Nextera_PCR_primer_i7_N712 44 | CAAGCAGAAGACGGCATACGAGATTCCTCTACGTCTCGTGGGCTCGG 45 | 
>gnl|uv|NGB00842.1|Nextera_DNA_Sample_Prep_Kit_Transposon 46 | GCCTCCCTCGCGCCATCAGAGATGTGTATAAGAGACAG 47 | >gnl|uv|NGB00843.1|Nextera_DNA_Sample_Prep_Kit_Transposon 48 | GCCTTGCCAGCCCGCTCAGAGATGTGTATAAGAGACAG 49 | >gnl|uv|NGB00844.1|Nextera_DNA_Sample_Prep_Kit_Adaptor 50 | AATGATACGGCGACCACCGAGATCTACACGCCTCCCTCGCGCCATCAG 51 | >gnl|uv|NGB00726.1|Illumina_Nextera_transposase 52 | GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 53 | >gnl|uv|NGB00725.1|Illumina_Nextera_transposase 54 | TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG 55 | >gnl|uv|NGB00755.1|TruSeq_DNA_HT_and_RNA_HT_i7_D701 56 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTACTCGATCTCGTATGCCGTCTTCTGCTTG 57 | >gnl|uv|NGB00756.1|TruSeq_DNA_HT_and_RNA_HT_i7_D702 58 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCCGGAGAATCTCGTATGCCGTCTTCTGCTTG 59 | >gnl|uv|NGB00757.1|TruSeq_DNA_HT_and_RNA_HT_i7_D703 60 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGCTCATTATCTCGTATGCCGTCTTCTGCTTG 61 | >gnl|uv|NGB00758.1|TruSeq_DNA_HT_and_RNA_HT_i7_D704 62 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGATTCCATCTCGTATGCCGTCTTCTGCTTG 63 | >gnl|uv|NGB00759.1|TruSeq_DNA_HT_and_RNA_HT_i7_D705 64 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCAGAAATCTCGTATGCCGTCTTCTGCTTG 65 | >gnl|uv|NGB00760.1|TruSeq_DNA_HT_and_RNA_HT_i7_D706 66 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAATTCGTATCTCGTATGCCGTCTTCTGCTTG 67 | >gnl|uv|NGB00761.1|TruSeq_DNA_HT_and_RNA_HT_i7_D707 68 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTGAAGCTATCTCGTATGCCGTCTTCTGCTTG 69 | >gnl|uv|NGB00762.1|TruSeq_DNA_HT_and_RNA_HT_i7_D708 70 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAATGCGCATCTCGTATGCCGTCTTCTGCTTG 71 | >gnl|uv|NGB00763.1|TruSeq_DNA_HT_and_RNA_HT_i7_D709 72 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGGCTATGATCTCGTATGCCGTCTTCTGCTTG 73 | >gnl|uv|NGB00764.1|TruSeq_DNA_HT_and_RNA_HT_i7_D710 74 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCCGCGAAATCTCGTATGCCGTCTTCTGCTTG 75 | >gnl|uv|NGB00765.1|TruSeq_DNA_HT_and_RNA_HT_i7_D711 76 | GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCTCGCGCATCTCGTATGCCGTCTTCTGCTTG 77 | >gnl|uv|NGB00766.1|TruSeq_DNA_HT_and_RNA_HT_i7_D712 78 | 
GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGCGATAGATCTCGTATGCCGTCTTCTGCTTG 79 | >gnl|uv|NGB00360.1|TruSeq_Illumina_PCR_Primer 80 | AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 81 | >gnl|uv|NGB00362.1|Illumina_Paired_End_PCR_Primer_2.0 82 | CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 83 | >gnl|uv|NGB00846.1|NEBNext_Adaptor 84 | GATCGGAAGAGCACACGTCTGAACTCCAGTCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 85 | >gnl|uv|NGB00847.1|NEBNext_1_Primer 86 | CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 87 | >gnl|uv|NGB00848.1|NEBNext_2_Primer 88 | CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 89 | >gnl|uv|NGB00849.1|NEBNext_3_Primer 90 | CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 91 | >gnl|uv|NGB00850.1|NEBNext_4_Primer 92 | CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 93 | >gnl|uv|NGB00851.1|NEBNext_5_Primer 94 | CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 95 | >gnl|uv|NGB00852.1|NEBNext_6_Primer 96 | CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 97 | >gnl|uv|NGB00853.1|NEBNext_7_Primer 98 | CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 99 | >gnl|uv|NGB00854.1|NEBNext_8_Primer 100 | CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 101 | >gnl|uv|NGB00855.1|NEBNext_9_Primer 102 | CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 103 | >gnl|uv|NGB00856.1|NEBNext_10_Primer 104 | CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 105 | >gnl|uv|NGB00857.1|NEBNext_11_Primer 106 | CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 107 | >gnl|uv|NGB00858.1|NEBNext_12_Primer 108 | CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 109 | >gnl|uv|NGB00859.1|NEBNext_13_Primer 110 | CAAGCAGAAGACGGCATACGAGATTGTTGACTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 111 | >gnl|uv|NGB00860.1|NEBNext_14_Primer 112 | CAAGCAGAAGACGGCATACGAGATACGGAACTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 113 | 
>gnl|uv|NGB00861.1|NEBNext_15_Primer 114 | CAAGCAGAAGACGGCATACGAGATTCTGACATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 115 | >gnl|uv|NGB00862.1|NEBNext_16_Primer 116 | CAAGCAGAAGACGGCATACGAGATCGGGACGGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 117 | >gnl|uv|NGB00863.1|NEBNext_18_Primer 118 | CAAGCAGAAGACGGCATACGAGATGTGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 119 | >gnl|uv|NGB00864.1|NEBNext_19_Primer 120 | CAAGCAGAAGACGGCATACGAGATCGTTTCACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 121 | >gnl|uv|NGB00865.1|NEBNext_20_Primer 122 | CAAGCAGAAGACGGCATACGAGATAAGGCCACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 123 | >gnl|uv|NGB00866.1|NEBNext_21_Primer 124 | CAAGCAGAAGACGGCATACGAGATTCCGAAACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 125 | >gnl|uv|NGB00867.1|NEBNext_22_Primer 126 | CAAGCAGAAGACGGCATACGAGATTACGTACGGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 127 | >gnl|uv|NGB00868.1|NEBNext_23_Primer 128 | CAAGCAGAAGACGGCATACGAGATATCCACTCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 129 | >gnl|uv|NGB00869.1|NEBNext_25_Primer 130 | CAAGCAGAAGACGGCATACGAGATATATCAGTGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 131 | >gnl|uv|NGB00870.1|NEBNext_27_Primer 132 | CAAGCAGAAGACGGCATACGAGATAAAGGAATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT -------------------------------------------------------------------------------- /calc_N50_GC_genomesize.pl: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/perl

#Written by Tom de Man
#Calculates basic contig stats: G+C content, N50, total length, and number of contigs

use strict;
use warnings;
use List::Util qw(sum min max);
use Getopt::Long;
use File::Basename;

# Running totals of base composition across all sequences in the file.
my $As = 0;
my $Ts = 0;
my $Gs = 0;
my $Cs = 0;
my $Ns = 0;

my $file;
my $helpAsked;
my $outFile = "";

GetOptions(
    "i=s"            => \$file,
    "h|help"         => \$helpAsked,
    "o|outputFile=s" => \$outFile,
);

if (defined($helpAsked)) {
    Usage();
    exit;
}
if (!defined($file)) {
    Error("No input files are provided");
}

my ($fileName, $filePath) = fileparse($file);
# Default output: next to the input file, with a descriptive suffix.
$outFile = $file . "_n50_GC_genomesize_stats" if ($outFile eq "");

open(IN,  "<$file")    or die "Cannot open file: $file\n";
open(OUT, ">$outFile") or die "Cannot open file: $outFile\n";

my @len = ();
my $prevFastaSeqId = "";
my $fastaSeqId = "";
my $fastaSeq = "";

# Accumulate each sequence, flushing it into @len / base counters on the
# next header.  The stripped <IN> in the read loop is restored here.
while (<IN>) {
    chomp;
    if ($_ =~ /^>/) {
        $prevFastaSeqId = $fastaSeqId;
        $fastaSeqId = $_;
        if ($fastaSeq ne "") {
            push(@len, length $fastaSeq);
            baseCount($fastaSeq);
        }
        $fastaSeq = "";
    }
    else {
        $fastaSeq .= $_;
    }
}
close IN;

# Flush the final sequence of the file.
if ($fastaSeq ne "") {
    $prevFastaSeqId = $fastaSeqId;
    push(@len, length $fastaSeq);
    baseCount($fastaSeq);
}

# Without this guard an empty input dies on sum()/min() of an empty list
# and divides by zero below.
Error("No fasta sequences found in $file") unless @len;

my $totalContigs = scalar @len;
my $bases        = sum(@len);
my $minContigLen = min(@len);
my $maxContigLen = max(@len);
my $n50          = calcN50(\@len, 50);
my $GCcont       = ($Gs+$Cs)/$bases*100;

#MMB sheet order
print "$totalContigs\t$bases\n";

printf OUT "%-25s %d\n", "Number of reads/contigs", $totalContigs;
printf OUT "%-25s %d\n", "Total assembly length", $bases;
#GC (reuse the value computed above instead of recomputing it)
printf OUT "%-25s %0.2f %s\n", "(G + C)s", $GCcont, "%";
#N50
printf OUT "%-25s %d\n", "N50 length", $n50;


print "Contig Statistics file: $outFile\n";
close OUT;
exit;

# calcN50(\@lengths, $n): length-weighted percentile of contig lengths;
# the classic N50 when $n == 50.
sub calcN50 {
    my @x = @{$_[0]};
    my $n = $_[1];
    @x = sort { $b <=> $a } @x;
    my $total = sum(@x);
    my ($count, $n50) = (0, 0);
    for (my $j = 0; $j < @x; $j++) {
        $count += $x[$j];
        if ($count >= $total*$n/100) {
            $n50 = $x[$j];
            last;
        }
    }
    return $n50;
}

# Add one sequence's base composition to the global counters.
sub baseCount {
    my $seq = $_[0];
    # tr/// counts characters without modifying $seq; the original
    # "my $tAs += s/A/A/gi" accumulated onto undef and warned.
    my $tAs = ($seq =~ tr/Aa//);
    my $tTs = ($seq =~ tr/Tt//);
    my $tGs = ($seq =~ tr/Gg//);
    my $tCs = ($seq =~ tr/Cc//);
    $Ns += (length $seq) - $tAs - $tTs - $tGs - $tCs;
    $As += $tAs;
    $Ts += $tTs;
    $Gs += $tGs;
    $Cs += $tCs;
}

sub Usage {
    print "\n Usage: perl $0 \n\n";
    print "### Input reads/contigs (FASTA) (Required)\n";
    print " -i \n";
    print " Read/Sequence in fasta format\n";
    print " -o | -outputFile \n";
    print " default: By default, N50 statistics file will be stored where the input file is\n";
    print " -h | -help\n";
    print " Prints this help\n";
    print "\n";
}

sub Error {
    my $msg = $_[0];
    printf STDERR "|%-70s|\n", " Error!!";
    printf STDERR "|%-70s|\n", " $msg";

    Usage();
    exit;
}
--------------------------------------------------------------------------------
/contig_size_select.pl:
--------------------------------------------------------------------------------
#! /usr/bin/perl

#Copied from Umer Zeeshan Ijaz at University of Glasgow

use strict;
use Getopt::Long;

my ($low,$high)=(0,99999999);

GetOptions( "low=i"  => \$low,
            "high=i" => \$high,
            "help|?" 
=> sub {Usage()} 13 | ); 14 | sub Usage 15 | { 16 | print STDERR "perl $0 -low -high \n\n"; 17 | exit; 18 | } 19 | if (scalar(@ARGV)!=1) {print STDERR "Please give one input fasta file\n";&Usage;} 20 | 21 | my $seq; 22 | my $id; 23 | my $len; 24 | my @seq; 25 | open (IN,"$ARGV[0]") or die ":$!"; 26 | while() 27 | { 28 | chomp; 29 | if(/^>(.*)/) 30 | { 31 | if ($seq){ 32 | if ($seq=~/\d+/) 33 | { 34 | chop $seq; 35 | @seq = split /\s+/,$seq; 36 | $len=scalar(@seq); 37 | } 38 | else 39 | { 40 | $len=length ($seq); 41 | } 42 | if ($len>=$low and $len<=$high){ 43 | print ">$id\n$seq\n"; 44 | } 45 | } 46 | $id =$1; 47 | $seq =""; 48 | } 49 | else 50 | { 51 | if ($_ =~/\d+/) # for qual file 52 | { 53 | $seq .= $_." "; 54 | } 55 | else 56 | { 57 | $seq .= $_; 58 | } 59 | } 60 | } 61 | if ($seq){ 62 | if ($seq=~/\d+/) 63 | { 64 | chop $seq; 65 | @seq = split /\s+/,$seq; 66 | $len=scalar(@seq); 67 | } 68 | else 69 | { 70 | $len=length ($seq); 71 | } 72 | if ($len>=$low and $len<=$high){ 73 | print ">$id\n$seq\n"; 74 | } 75 | } 76 | 77 | close IN; -------------------------------------------------------------------------------- /cut_and_paste_seq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Written by Tom de Man 4 | # Cut DNA sequence at a user defined position, and paste it at the end of that same sequence 5 | # Useful when generating plasmid DNA sequence comparison plots with Easyfig or Geneious 6 | # Needs BioPerl 7 | 8 | use strict; 9 | use warnings; 10 | use Bio::SeqIO; 11 | use Getopt::Long; 12 | 13 | my $end_pos; 14 | my $fasta; 15 | my $strand; 16 | 17 | GetOptions( "cut=s" => \$end_pos, 18 | "seq=s" => \$fasta, 19 | "strand=s" => \$strand, 20 | "help|?" 
=> sub {Usage()} 21 | ); 22 | 23 | if (($end_pos) && ($fasta) && ($strand)) { 24 | &subseq($end_pos, $fasta, $strand); 25 | } else { 26 | &Usage; 27 | } 28 | 29 | sub subseq { 30 | my ($end, $input_seq, $direction) = @_; 31 | my $seqin = Bio::SeqIO->new(-file => "$input_seq", -format => "fasta"); 32 | while (my $seq = $seqin->next_seq) { 33 | my $acc = $seq->display_id; 34 | my $sequence = $seq->seq; 35 | 36 | print ">$acc\n"; 37 | if ($strand eq "forward") { 38 | my $first = substr($sequence, 0, $end); 39 | my $last = substr($sequence, $end); 40 | print "$last"."$first"."\n"; 41 | } elsif ($strand eq "reverse") { 42 | my $revcomp = reverse($sequence); 43 | $revcomp =~ tr/ABCDGHMNRSTUVWXYabcdghmnrstuvwxy/TVGHCDKNYSAABWXRtvghcdknysaabwxr/; 44 | my $len = length $revcomp; 45 | my $rev_pos = $len - $end; 46 | 47 | my $first = substr($revcomp, 0, $rev_pos); 48 | my $last = substr($revcomp, $rev_pos); 49 | my $out = "$last"."$first"; 50 | print "$out \n"; 51 | } 52 | } 53 | } 54 | 55 | sub Usage { 56 | print STDERR "\n Please provide input sequence file, cutting position, and strand direction\n\n"; 57 | print STDERR "\n Usage: perl $0 -cut -strand -seq \n\n"; 58 | exit; 59 | } -------------------------------------------------------------------------------- /estimate_core_genome_from_bam.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | # Written by Tom de Man 4 | #script needs sorted bam files 5 | #script also needs samtools, bedtools, and awk in order to operate 6 | 7 | use strict; 8 | use warnings; 9 | use Getopt::Long; 10 | use String::ShellQuote qw(shell_quote); 11 | use Array::Utils qw(:all); 12 | use Data::Dumper qw(Dumper); 13 | 14 | my $sortbam_path; 15 | my $genome; 16 | my $depth; 17 | 18 | my $genome_size; 19 | my $starttime = localtime; 20 | my $version = "1.0"; 21 | 22 | GetOptions( "bam=s" => \$sortbam_path, 23 | "genome=s" => \$genome, 24 | "depth=s" => \$depth, 25 | "help|?" 
=> sub {Usage()} 26 | ); 27 | 28 | if (($sortbam_path) && ($genome) && ($depth)) { 29 | my @bams = &get_files("bam"); 30 | print STDERR "Hi $ENV{USER}, you are now running $0 version: $version on $starttime \n\n"; 31 | print STDERR "BAM files included in core genome estimate: \n"; 32 | foreach (@bams) { 33 | print "$_\n"; 34 | } 35 | $genome_size = &genome_size_calc; 36 | &estimate_core(@bams); 37 | } else { 38 | &Usage; 39 | } 40 | 41 | sub genome_size_calc { 42 | print STDERR "\n"; 43 | print STDERR "your mapping reference is $genome \n"; 44 | open FASTA, "$genome" or die "cannot open $genome for reading \n"; 45 | my $total_bases = 0; 46 | while () { 47 | if (!(/^>/)) { 48 | chomp; 49 | s/\r//g; 50 | $total_bases += length; 51 | } 52 | } 53 | return $total_bases; 54 | close FASTA; 55 | } 56 | 57 | sub estimate_core { 58 | #create fasta index 59 | print STDERR "running samtools..... generating a contig list file\n"; 60 | system("samtools faidx $genome"); 61 | open INDEX, "$genome.fai" or die "cannot open $genome.fai for reading \n"; 62 | open (my $fh, '>', "$genome.contig"); 63 | while () { 64 | chomp; 65 | my @split = split ("\t", $_); 66 | print $fh "$split[0]\t$split[1]\n"; 67 | } 68 | close $fh; 69 | close INDEX; 70 | 71 | my $cnt = 0; 72 | #calculate genome wide coverage for each nucleotide position, for each sorted BAM file 73 | foreach my $file (@_) { 74 | $cnt += 1; 75 | my $bam = "$sortbam_path/$file"; 76 | my $con_len = "$genome.contig"; 77 | my $out = "$sortbam_path/$file.$depth.cov"; 78 | print STDERR "running bedtools for sample $cnt..... generating genome coverage data \n"; 79 | system("bedtools genomecov -ibam ".shell_quote($bam)." -g ".shell_quote($con_len)." 
-d | awk '{if(\$3>=$depth){ print \$0}}' > ".shell_quote($out).""); 80 | } 81 | 82 | #get the .cov files 83 | my @covs = &get_files("cov"); 84 | my @cov2d; 85 | foreach (@covs) { 86 | my @n; 87 | my $c = "$sortbam_path/$_"; 88 | open COV, $c or die "cannot open $c for reading \n"; 89 | while () { 90 | chomp; 91 | my @split = split ("\t", $_); 92 | my $contig_position = $split[0]."_".$split[1]; 93 | push @n, $contig_position; 94 | } 95 | close COV; 96 | push @cov2d, \@n; 97 | 98 | } 99 | my $rows = scalar @cov2d; 100 | print STDERR "You are going to estimate a core genome for $rows isolates ..... how exciting!!! \n"; 101 | 102 | my $start_ref = shift @cov2d; 103 | my @overlap = @$start_ref; 104 | my $remainder = $rows - 1; 105 | #pairwise comparison via intersect 106 | for (my $i=0; $i < $remainder; $i++) { 107 | my $comparison = shift @cov2d; 108 | my $size = @$comparison; 109 | @overlap = intersect(@$comparison, @overlap); 110 | } 111 | my $core = scalar @overlap; 112 | my $percentage = ($core/$genome_size)* 100; 113 | my $rounded = sprintf "%.2f", $percentage; 114 | print STDERR "Core genome size for $rows genomes is: $core base pairs, which equals $rounded% of the mapping reference genome\n"; 115 | 116 | } 117 | 118 | sub get_files { 119 | my $ext = qr/$_[0]/; 120 | my @bamfiles; 121 | opendir(DIR, $sortbam_path) or die "cannot open $sortbam_path \n"; 122 | my @files = readdir(DIR); 123 | close DIR; 124 | 125 | foreach my $file (@files){ 126 | next if (!($file =~ /\.$ext$/)); 127 | push @bamfiles, $file; 128 | } 129 | return @bamfiles; 130 | } 131 | 132 | sub Usage { 133 | print STDERR "\n Please provide input files!!!\n\n"; 134 | print STDERR "\n Usage: perl $0 -bam -genome -depth \n\n"; 135 | exit; 136 | } 137 | -------------------------------------------------------------------------------- /fastqMcf-bowtie2-SPAdes.bash: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir trimmed_reads 4 | mkdir 
trimmed_viral_free_reads 5 | mkdir assemblies 6 | 7 | perl run_fastqMcf.pl raw_reads/* 8 | 9 | mv raw_reads/*trimmed* trimmed_reads 10 | 11 | perl run_bowtie2_subtract_unmapped_reads.pl trimmed_reads/* 12 | 13 | mv trimmed_reads/*bacterial* trimmed_viral_free_reads 14 | 15 | perl run_SPAdes.pl trimmed_viral_free_reads/* 16 | 17 | for file in *_assembly/scaffolds.fasta; do new="$(echo "$file" | cut -d '_' -f 1)".scaffolds.fasta; cp "$file" "assemblies/$new"; done 18 | -------------------------------------------------------------------------------- /fix_evm_for_gbrowse.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #Written by Tom de Man 4 | 5 | use strict; 6 | my %count; 7 | 8 | while(<>) { 9 | next if /^\s*$/; 10 | chomp; 11 | my @row = split(/\t/,$_); 12 | 13 | if($row[2] eq 'CDS') { 14 | my %grp = map { split(/=/,$_) } split(/;/,pop @row); 15 | $grp{ID} .= ".cds".++$count{$grp{ID}}; 16 | push @row, join(";", map { sprintf("%s=%s",$_,$grp{$_})} qw(ID Parent)); 17 | } 18 | print join("\t", @row), "\n"; 19 | } -------------------------------------------------------------------------------- /gff3_2_gff3EVM.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | #create EVM suitable GFF3 files from MAKER de novo gene prediction GFF 5 | #Written by Tom de Man 6 | 7 | my @pre_name; 8 | my @last_col_split_name; 9 | my @pre_name_extra; 10 | my @pre_name_plus; 11 | 12 | my $exon_count = 0; 13 | 14 | open(my $gene => ">gene_predictions.EVM.gff3")|| die $!; 15 | 16 | while(<>) { 17 | chomp; 18 | my @row = split(/\t/,$_); 19 | my $last_col = pop(@row); 20 | my @last_col_split = split(";",$last_col); 21 | if ($row[1] eq "genemark" && $row[2] eq "gene") { 22 | @pre_name = split("-", $last_col_split[1]); 23 | @pre_name_plus = split("=", $last_col_split[1]); 24 | print $gene join ("\t", @row), "\t$last_col_split[0];$pre_name[0] model 
$pre_name_plus[1]\n"; 25 | }elsif ($row[1] eq "genemark" && $row[2] eq "mRNA") { 26 | @last_col_split_name = split("=",$last_col_split[0]); 27 | print $gene join ("\t", @row), "\tID=$pre_name_plus[1];Parent=$last_col_split_name[1]\n"; 28 | } 29 | elsif ($row[1] eq "genemark" && $row[2] eq "CDS") { 30 | $exon_count+=1; 31 | print $gene "$row[0]\t$row[1]\texon\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\tID=e$exon_count;Parent=$pre_name_plus[1]\n"; 32 | print $gene join ("\t", @row), "\tID=cds_of_$pre_name_plus[1];Parent=$pre_name_plus[1]\n"; 33 | }elsif ($row[1] eq "augustus" && $row[2] eq "gene") { 34 | @pre_name = split("_", $last_col_split[1]); 35 | print $gene join ("\t", @row), "\t$last_col_split[0];$pre_name[0] model $pre_name[1]\n"; 36 | }elsif ($row[1] eq "augustus" && $row[2] eq "mRNA") { 37 | @last_col_split_name = split("=",$last_col_split[0]); 38 | print $gene join ("\t", @row), "\tID=$pre_name[1];Parent=$last_col_split_name[1]\n"; 39 | }elsif ($row[1] eq "augustus" && $row[2] eq "CDS") { 40 | $exon_count+=1; 41 | print $gene "$row[0]\t$row[1]\texon\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\tID=e$exon_count;Parent=$pre_name[1]\n"; 42 | print $gene join ("\t", @row), "\tID=cds_of_$pre_name[1];Parent=$pre_name[1]\n"; 43 | }elsif ($row[1] eq "snap" && $row[2] eq "gene") { 44 | @pre_name = split ("_", $last_col_split[1]); 45 | print $gene join ("\t", @row), "\t$last_col_split[0];$pre_name[0] model $pre_name[1]\n"; 46 | }elsif ($row[1] eq "snap" && $row[2] eq "mRNA") { 47 | @last_col_split_name = split("=",$last_col_split[0]); 48 | print $gene join ("\t", @row), "\tID=$pre_name[1];Parent=$last_col_split_name[1]\n"; 49 | }elsif ($row[1] eq "snap" && $row[2] eq "CDS") { 50 | $exon_count+=1; 51 | print $gene "$row[0]\t$row[1]\texon\t$row[3]\t$row[4]\t$row[5]\t$row[6]\t$row[7]\tID=e$exon_count;Parent=$pre_name[1]\n"; 52 | print $gene join ("\t", @row), "\tID=cds_of_$pre_name[1];Parent=$pre_name[1]\n"; 53 | } 54 | } 
-------------------------------------------------------------------------------- /run_IonT_SPAdes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #runs SPAdes assembler for single end IonTorrent reads in batch 4 | #Written by Tom de Man 5 | 6 | use warnings; 7 | use strict; 8 | use File::Basename; 9 | 10 | my @files=@ARGV; 11 | 12 | foreach my $file (@files){ 13 | my ($file_name,$dir)=fileparse($file); 14 | my @base = split (/\./, $file_name); 15 | my $name = $base[0]; 16 | 17 | my $cmd="spades.py -t 12 --iontorrent -k 21,33,55,77,99,127 --careful -s $file -o $name"."_assembly"; 18 | print $cmd,"\n"; 19 | die if system($cmd); 20 | } 21 | -------------------------------------------------------------------------------- /run_SPAdes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #runs SPAdes assembler in batch 4 | #Written by Tom de Man 5 | 6 | use warnings; 7 | use strict; 8 | use File::Basename; 9 | 10 | my @files=@ARGV; 11 | 12 | my %paired_files; 13 | foreach my $file (@files){ 14 | my ($file_name,$dir)=fileparse($file); 15 | if($file_name =~ /(.+)_R([1|2])_/){ 16 | $paired_files{$1}[$2-1]=$file; 17 | #attempt different naming scheme 18 | }elsif($file_name =~ /(.+)_([1|2])/){ 19 | $paired_files{$1}[$2-1]=$file; 20 | }else{ 21 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 22 | } 23 | } 24 | 25 | foreach my $name (sort keys %paired_files){ 26 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 27 | warn "Couldn't find matching paired end files for file starting with: $name"; 28 | next; 29 | } 30 | print "assembling your data....\n"; 31 | print "----------------------\n"; 32 | print "$paired_files{$name}[0]"." 
<--> "."$paired_files{$name}[1]"."\n"; 33 | 34 | my $cmd="spades.py -t 12 --careful --only-assembler -1 $paired_files{$name}[0] -2 $paired_files{$name}[1] -o $name"."_assembly"; 35 | print $cmd,"\n"; 36 | die if system($cmd); 37 | } 38 | -------------------------------------------------------------------------------- /run_bowtie2_and_pilon.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #needs bowtie2, samtools and pilon in path 4 | #runs bowtie2 in batch 5 | #converts SAM to sorted BAM 6 | #indexing the sorted BAM 7 | #running pilon 8 | #Written by Tom de Man 9 | 10 | use warnings; 11 | use strict; 12 | use File::Basename; 13 | 14 | my $DB = shift; 15 | my @files=@ARGV; 16 | my $path; 17 | my $DB_index; 18 | 19 | my %paired_files; 20 | foreach my $file (@files){ 21 | my ($file_name,$dir)=fileparse($file); 22 | $path = $dir; 23 | if($file_name =~ /(.+)_R([1|2])_/){ 24 | $paired_files{$1}[$2-1]=$file; 25 | #attempt different naming scheme 26 | }elsif($file_name =~ /(.+)_([1|2])/){ 27 | $paired_files{$1}[$2-1]=$file; 28 | }else{ 29 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 30 | } 31 | } 32 | 33 | foreach my $name (sort keys %paired_files){ 34 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 35 | warn "Couldn't find matching paired end files for file starting with: $name"; 36 | next; 37 | } 38 | print "mapping to $DB - converting - running pilon\n"; 39 | print "----------------------\n"; 40 | print "$paired_files{$name}[0]"." 
<--> "."$paired_files{$name}[1]"."\n"; 41 | 42 | system("bowtie2-build $DB $DB.index"); 43 | system("bowtie2 -x $DB.index -1 $paired_files{$name}[0] -2 $paired_files{$name}[1] -S $path$name.sam --end-to-end -p 12"); 44 | system("samtools view -bS $path$name.sam | samtools sort - $path$name.sort"); 45 | system("samtools index $path$name.sort.bam $path$name.sort.bai"); 46 | system("pilon --genome $DB --frags $path$name.sort.bam --mindepth 0.6 --minqual 30 --minmq 30 --fix all --changes --vcf --output $path$name.$DB.pilon"); 47 | } 48 | -------------------------------------------------------------------------------- /run_bowtie2_subtract_mapped_reads_with_picard.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #needs bowtie2, samtools, and bedtools in path 4 | #runs bowtie2 in batch 5 | #converts SAM to sorted BAM 6 | #extracts mapped reads from BAM file and generates two FASTQ files per sample 7 | #Written by Tom de Man 8 | 9 | use warnings; 10 | use strict; 11 | use File::Basename; 12 | 13 | my @files=@ARGV; 14 | 15 | #bowtie2 index 16 | my $DB = "/path/to/mapping-reference-file"; 17 | my $SamToFastq = "/path/to/picard-tools-1.96/SamToFastq.jar"; 18 | my $path; 19 | 20 | my %paired_files; 21 | foreach my $file (@files){ 22 | my ($file_name,$dir)=fileparse($file); 23 | $path = $dir; 24 | if($file_name =~ /(.+)_R([1|2])_/){ 25 | $paired_files{$1}[$2-1]=$file; 26 | #attempt different naming scheme 27 | }elsif($file_name =~ /(.+)_([1|2])/){ 28 | $paired_files{$1}[$2-1]=$file; 29 | }else{ 30 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 31 | } 32 | } 33 | 34 | foreach my $name (sort keys %paired_files){ 35 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 36 | warn "Couldn't find matching paired end files for file starting with: $name"; 37 | next; 38 | } 39 | print "mapping - converting - subtracting your mapped read data....\n"; 40 | print 
"----------------------\n"; 41 | print "$paired_files{$name}[0]"." <--> "."$paired_files{$name}[1]"."\n"; 42 | 43 | my $re1 = $paired_files{$name}[0]; 44 | my $re2 = $paired_files{$name}[1]; 45 | my $fq1 = "$path$name"."_R1_001.fastq"; 46 | my $fq2 = "$path$name"."_R2_001.fastq"; 47 | 48 | system("bowtie2 -x $DB -1 $re1 -2 $re2 -p 12 --end-to-end -D 10 -R 2 -N 0 -L 30 -i S,0,2.50 | samtools view -bS - > $path$name.bam"); 49 | system("samtools view -F 4 $path$name.bam -o $path$name.mappedReads.bam"); 50 | system("samtools sort -n $path$name.mappedReads.bam -o $path$name.mappedReadsSorted.bam"); 51 | system("java -Xmx4g -jar $SamToFastq I=$path$name.mappedReadsSorted.bam F=$fq1 F2=$fq2"); 52 | } 53 | -------------------------------------------------------------------------------- /run_bowtie2_subtract_unmapped_reads.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #needs bowtie2, samtools and bam2fastq in path 4 | #runs bowtie2 in batch 5 | #converts SAM to BAM 6 | #extracts unmapped reads from BAM file and generates FASTQ files 7 | #Written by Tom de Man 8 | 9 | use warnings; 10 | use strict; 11 | use File::Basename; 12 | 13 | my @files=@ARGV; 14 | #for now hard coded, will change later 15 | #refSeq-Viral is the bowtie2 index 16 | my $DB = "/path/to/refSeq-viral"; 17 | my $path; 18 | 19 | my %paired_files; 20 | foreach my $file (@files){ 21 | my ($file_name,$dir)=fileparse($file); 22 | $path = $dir; 23 | if($file_name =~ /(.+)_R([1|2])_/){ 24 | $paired_files{$1}[$2-1]=$file; 25 | #attempt different naming scheme 26 | }elsif($file_name =~ /(.+)_([1|2])/){ 27 | $paired_files{$1}[$2-1]=$file; 28 | }else{ 29 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 30 | } 31 | } 32 | 33 | foreach my $name (sort keys %paired_files){ 34 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 35 | warn "Couldn't find matching paired end files for file starting with: $name"; 
36 | next; 37 | } 38 | print "mapping - converting - subtracting your data....\n"; 39 | print "----------------------\n"; 40 | print "$paired_files{$name}[0]"." <--> "."$paired_files{$name}[1]"."\n"; 41 | 42 | system("bowtie2 -x $DB -1 $paired_files{$name}[0] -2 $paired_files{$name}[1] -S $path$name.sam -p 16 --end-to-end"); 43 | system("samtools view -bS $path$name.sam > $path$name.bam"); 44 | system("bam2fastq --no-aligned -o $path$name"."_R\#_bacterial.fastq $path$name.bam"); 45 | } 46 | 47 | system("ls $path*_R_1_bacterial.fastq | sed -e 'p;s/_R_1_/_R1_/' | xargs -n2 mv"); 48 | system("ls $path*_R_2_bacterial.fastq | sed -e 'p;s/_R_2_/_R2_/' | xargs -n2 mv"); 49 | -------------------------------------------------------------------------------- /run_fastqMcf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #runs fastq-mcf in batch 4 | #Written by Tom de Man 5 | 6 | use warnings; 7 | use strict; 8 | use File::Basename; 9 | 10 | my $path; 11 | 12 | #hard coded 13 | my $adaptors = "/path/to/adapters.fasta"; 14 | my @files=@ARGV; 15 | 16 | my %paired_files; 17 | foreach my $file (@files){ 18 | my ($file_name,$dir)=fileparse($file); 19 | $path = $dir; 20 | if($file_name =~ /(.+)_R([1|2])_/){ 21 | $paired_files{$1}[$2-1]=$file; 22 | #attempt different naming scheme 23 | }elsif($file_name =~ /(.+)_([1|2])/){ 24 | $paired_files{$1}[$2-1]=$file; 25 | }else{ 26 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 27 | } 28 | } 29 | 30 | foreach my $name (sort keys %paired_files){ 31 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 32 | warn "Couldn't find matching paired end files for file starting with: $name"; 33 | next; 34 | } 35 | print "trimming your data....\n"; 36 | print "----------------------\n"; 37 | print "$paired_files{$name}[0]"." 
<--> "."$paired_files{$name}[1]"."\n"; 38 | 39 | my $cmd="fastq-mcf $adaptors $paired_files{$name}[0] $paired_files{$name}[1]" . " -o $path$name" . "_R1_trimmed.fastq " . " -o $path$name" . "_R2_trimmed.fastq -C 1000000 -q 30 -p 10 -u -x 0.01"; 40 | print $cmd,"\n"; 41 | die if system($cmd); 42 | } 43 | -------------------------------------------------------------------------------- /run_kSNP.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | #convert underscores in FASTA headers to spaces 3 | #merge all the contigs per sample into one long sequence 4 | #run kSNP 5 | 6 | #Written by Tom de Man 7 | 8 | #kSNP, Jellyfish, FastTree, MUMmer and Parsimonator all need to be in the PATH variable before launching this script 9 | use strict; 10 | 11 | #path containing the input files 12 | my $file_path = shift; 13 | #project name 14 | my $project = shift; 15 | my @fastas = &getFile; 16 | 17 | foreach my $fasta (@fastas) { 18 | open FILE, "$fasta"; 19 | my @lines = <FILE>; 20 | close FILE; 21 | 22 | open STDOUT, ">$fasta"; 23 | for (@lines) { 24 | $_ =~ s/_/ /g; 25 | print; 26 | } 27 | close STDOUT; 28 | } 29 | 30 | my @new_fastas = &getFile; 31 | foreach my $file (@new_fastas) { 32 | system("merge_fasta_contigs.pl $file > $file.merged.fasta"); 33 | } 34 | 35 | #combine all the fasta files that come from merge_fasta_contigs 36 | system("cat *.fasta > $project.fasta"); 37 | #run kSNP using the merged fasta file 38 | `kSNP -f $project.fasta -k 23 -d $project -p $project.finished`; 39 | 40 | sub getFile { 41 | my @file_docs; 42 | opendir(DIR, $file_path) or die "Cannot open $file_path \n"; 43 | my @file_files = readdir(DIR); 44 | close DIR; 45 | 46 | foreach my $file (@file_files) { 47 | next if (!($file =~ /\.fa$/)); 48 | push @file_docs, $file; 49 | } 50 | return @file_docs; 51 | } 52 | -------------------------------------------------------------------------------- /run_kneaddata_only_human_removal.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #runs KneadData in batch. Only for human DNA removal, no quality trimming 4 | #Written by Tom de Man 5 | 6 | use warnings; 7 | use strict; 8 | use File::Basename; 9 | 10 | my @files=@ARGV; 11 | my $database = "/path/to/database/Homo_sapiens_Bowtie2_v0.1/"; 12 | 13 | my %paired_files; 14 | foreach my $file (@files){ 15 | my ($file_name,$dir)=fileparse($file); 16 | if($file_name =~ /(.+)_R([1|2])_/){ 17 | $paired_files{$1}[$2-1]=$file; 18 | #attempt different naming scheme 19 | }elsif($file_name =~ /(.+)_([1|2])/){ 20 | $paired_files{$1}[$2-1]=$file; 21 | }else{ 22 | warn "Input file does not contain '_R1_' or '_R2_' in name: $file"; 23 | } 24 | } 25 | 26 | foreach my $name (sort keys %paired_files){ 27 | unless(defined($paired_files{$name}[0]) && defined($paired_files{$name}[1])){ 28 | warn "Couldn't find matching paired end files for file starting with: $name"; 29 | next; 30 | } 31 | print "processing your data....\n"; 32 | print "----------------------\n"; 33 | print "$paired_files{$name}[0]"." <--> "."$paired_files{$name}[1]"."\n"; 34 | 35 | my $cmd="kneaddata -i $paired_files{$name}[0] -i $paired_files{$name}[1] -o kneaddata_nohuman -db $database --bypass-trim --bowtie2-options \"--very-sensitive --dovetail\" --remove-intermediate-output"; 36 | print $cmd,"\n"; 37 | die if system($cmd); 38 | } 39 | --------------------------------------------------------------------------------