├── annotation
    ├── strains
    ├── maker
    │   ├── round3
    │   │   ├── locus_tags.txt
    │   │   ├── README.md
    │   │   ├── gag.sh
    │   │   ├── rename.sh
    │   │   ├── maker3.sh
    │   │   ├── training_snap2
    │   │   │   └── snap2.sh
    │   │   ├── fusotu1_abyss_round3_maker_opts.ctl
    │   │   ├── fusotu3_abyss_round3_maker_opts.ctl
    │   │   ├── fusotu5_abyss_round3_maker_opts.ctl
    │   │   ├── fusotu6_abyss_round3_maker_opts.ctl
    │   │   └── fusotu7_abyss_round3_maker_opts.ctl
    │   ├── round1
    │   │   ├── README.md
    │   │   ├── maker.sh
    │   │   ├── fusotu1_abyss_round1_maker_opts.ctl
    │   │   ├── fusotu3_abyss_round1_maker_opts.ctl
    │   │   ├── fusotu5_abyss_round1_maker_opts.ctl
    │   │   ├── fusotu6_abyss_round1_maker_opts.ctl
    │   │   └── fusotu7_abyss_round1_maker_opts.ctl
    │   ├── round2
    │   │   ├── README.md
    │   │   ├── maker2.sh
    │   │   ├── training_snap
    │   │   │   └── snap.sh
    │   │   ├── fusotu1_abyss_round2_maker_opts.ctl
    │   │   ├── fusotu3_abyss_round2_maker_opts.ctl
    │   │   ├── fusotu5_abyss_round2_maker_opts.ctl
    │   │   ├── fusotu6_abyss_round2_maker_opts.ctl
    │   │   └── fusotu7_abyss_round2_maker_opts.ctl
    │   └── README.md
    ├── README.md
    └── repeat_masking
    │   ├── README.md
    │   ├── repeatmasker.sh
    │   └── repeatmodeler.sh
├── assembly
    ├── strains
    ├── polishing
    │   ├── mito_trim_7.bed
    │   ├── duplicates_7
    │   ├── duplicates_5
    │   ├── mito_trim_6.bed
    │   ├── mito_trim_5.bed
    │   ├── mito_trim_3.bed
    │   ├── mito_trim_1.bed
    │   ├── mito_remove_3
    │   ├── README.md
    │   ├── ncbi_filter.sh
    │   ├── polish.sh
    │   ├── mito_remove_1
    │   ├── mito_remove_7
    │   └── mito_remove_5
    ├── assessment
    │   ├── submit_assessment.sh
    │   ├── README.md
    │   ├── quast.sh
    │   ├── blast.sh
    │   ├── busco.sh
    │   └── blobtools.sh
    ├── denovo_assembly
    │   ├── submit_assembly.sh
    │   ├── abyss.sh
    │   ├── README.md
    │   ├── megahit.sh
    │   ├── abyss_comp.sh
    │   └── spades.sh
    ├── README.md
    └── reads
    │   ├── fastqc.sh
    │   ├── README.md
    │   └── trimmomatic.sh
├── pipeline.png
├── .gitattributes
├── CSEP_CAZyme_prediction
    ├── submit_CSEPfilter.sh
    ├── submit_CAZymeprediction.sh
    ├── submit_CSEPblast.sh
    ├── predgpi
    │   ├── predgpi.sh
    │   └── PredGPI.r
    ├── submit_orthogroupparsing.sh
    ├── run_dbcan
    │   └── run_dbcan.sh
    ├── signalp
    │   └── signalp.sh
    ├── tmhmm
    │   └── tmhmm.sh
    ├── prosite
    │   └── ps_scan.sh
    ├── targetp
    │   └── targetp.sh
    ├── effectorp
    │   └── effectorp.sh
    ├── submit_CSEPprediction.sh
    ├── nucpred
    │   └── nucpred.sh
    ├── phobius
    │   └── phobius.sh
    ├── blastp
    │   └── blastp.sh
    ├── README.md
    └── CSEPfilter
├── phylogenomics
    ├── submit_speciestrees_concatenation.sh
    ├── submit_modeltestng.sh
    ├── submit_speciestrees_coalescent.sh
    ├── submit_RAxML-NG_genetrees.sh
    ├── submit_alignment.sh
    ├── modeltest-ng
    │   └── modeltestng.sh
    ├── species_tree
    │   ├── iqtree.sh
    │   ├── astral.sh
    │   ├── astral-pro.sh
    │   └── raxmlng.sh
    ├── RAxMLNG_genetrees.sh
    ├── alignment.sh
    ├── README.md
    └── concat.sh
├── divergence_time_estimation
    ├── mcmctree
    │   ├── submit_mcmctree_dating_step2.sh
    │   ├── blank_topology.r
    │   ├── mcmctree_step1.ctl
    │   ├── mcmctree_correlated.sh
    │   ├── mcmctree_independent.sh
    │   ├── README.md
    │   ├── mcmctree_step2_correlated.ctl
    │   ├── mcmctree_step2_independent.ctl
    │   ├── estimate_rate.r
    │   └── mcmctree_dating_step1.sh
    ├── README.md
    ├── sortadate
    │   ├── README.md
    │   └── sortadate.sh
    └── reroot.r
├── lifestyle_comparison
    ├── submit_lifestyletest.sh
    ├── README.md
    ├── lifestyle-test.sh
    ├── lifestyle_v_phylogeny.r
    └── run_edited.py
├── orthology_inference
    ├── README.md
    ├── orthofinder.sh
    ├── protein_download.sh
    ├── submit_protein_download.sh
    └── ncbi_ftp_links.r
├── selection
    ├── gbff_files
    │   └── ncbi_gbff_download.sh
    ├── codon_optimisation
    │   ├── README.md
    │   ├── submit_codon_optimisation.sh
    │   ├── blast.sh
    │   ├── pull_ribosomes.sh
    │   └── codon_optimisation.r
    ├── reroot.r
    ├── hyphy
    │   ├── aBSREL.sh
    │   ├── BUSTED.sh
    │   └── Contrast-FEL.sh
    ├── README.md
    ├── submit_pal2nal.sh
    ├── label_trees.r
    ├── submit_hyphy.sh
    ├── pal2nal.sh
    └── pull_nucleotides.py
├── CITATION.cff
├── cazyme_substrates.csv
└── README.md


/annotation/strains:
--------------------------------------------------------------------------------
1 | 1
2 | 3
3 | 5
4 | 6
5 | 7
6 | 


--------------------------------------------------------------------------------
/assembly/strains:
--------------------------------------------------------------------------------
1 | 1
2 | 3
3 | 5
4 | 6
5 | 7
6 | 


--------------------------------------------------------------------------------
/assembly/polishing/mito_trim_7.bed:
--------------------------------------------------------------------------------
1 | 2764_pilon	0	1118
2 | 


--------------------------------------------------------------------------------
/assembly/polishing/duplicates_7:
--------------------------------------------------------------------------------
1 | 1635_pilon
2 | 1994_pilon
3 | 


--------------------------------------------------------------------------------
/assembly/polishing/duplicates_5:
--------------------------------------------------------------------------------
1 | 3590_pilon
2 | 5157_pilon
3 | 3659_pilon
4 | 


--------------------------------------------------------------------------------
/assembly/polishing/mito_trim_6.bed:
--------------------------------------------------------------------------------
1 | 5170_pilon	0	326
2 | 5191_pilon	2099	3740
3 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/locus_tags.txt:
--------------------------------------------------------------------------------
1 | LB503
2 | LB505
3 | LB506
4 | LB507
5 | LB504
6 | 


--------------------------------------------------------------------------------
/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rowena-h/FusariumLifestyles/HEAD/pipeline.png


--------------------------------------------------------------------------------
/assembly/polishing/mito_trim_5.bed:
--------------------------------------------------------------------------------
1 | 1266_pilon	0	1480
2 | 5209_pilon	0	511
3 | 5959_pilon	400	860
4 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto eol=lf
3 | *.png binary
4 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/submit_CSEPfilter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #Runs ./CSEPfilter once for every whitespace-separated entry read from the file todo
3 | for i in $(cat todo)
4 | do	
5 | 	./CSEPfilter ${i}
6 | done
7 | 


--------------------------------------------------------------------------------
/phylogenomics/submit_speciestrees_concatenation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #Submits raxmlng.sh and iqtree.sh from within the species_tree directory
3 | cd species_tree
4 | 
5 | qsub raxmlng.sh
6 | qsub iqtree.sh
7 | 


--------------------------------------------------------------------------------
/assembly/assessment/submit_assessment.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #Script to submit assembly assessment jobs
3 | 
4 | qsub quast.sh
5 | qsub busco.sh
6 | qsub blast.sh


--------------------------------------------------------------------------------
/assembly/polishing/mito_trim_3.bed:
--------------------------------------------------------------------------------
1 | 2989_pilon	2533	3802
2 | 5625_pilon	0	488
3 | 5698_pilon	1106	1676
4 | 5704_pilon	2324	3459
5 | 5728_pilon	1416	2013
6 | 


--------------------------------------------------------------------------------
/phylogenomics/submit_modeltestng.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #Submits modeltestng.sh as an array job with one task per partition.* entry
3 | cd modeltest-ng
4 | #Count the entries matching partition.* to size the array
5 | NUM=$(ls -d partition.* | wc -l)
6 | 
7 | qsub -t 1-${NUM} modeltestng.sh
8 | 


--------------------------------------------------------------------------------
/phylogenomics/submit_speciestrees_coalescent.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #Submits astral.sh and astral-pro.sh from within the species_tree directory
3 | cd species_tree
4 | #Create the astral directory inside species_tree before submitting
5 | mkdir astral
6 | 
7 | qsub astral.sh
8 | qsub astral-pro.sh
9 | 


--------------------------------------------------------------------------------
/assembly/polishing/mito_trim_1.bed:
--------------------------------------------------------------------------------
1 | 1374_pilon	0	691
2 | 1679_pilon	2524	3793
3 | 1933_pilon	333	930
4 | 2473_pilon	0	359
5 | 3511_pilon	697	832
6 | 703_pilon	0	311
7 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/submit_mcmctree_dating_step2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #Submits mcmctree_independent.sh and mcmctree_correlated.sh, each as a two-task array job (tasks 1-2)
3 | qsub -t 1-2 mcmctree_independent.sh
4 | qsub -t 1-2 mcmctree_correlated.sh
5 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/README.md:
--------------------------------------------------------------------------------
1 | # *Fusarium* Lifestyles
2 | 
3 | ## 5 Divergence time estimation
4 | 
5 | 1. :file_folder: `sortadate`
6 | 2. :file_folder: `mcmctree`
7 | 


--------------------------------------------------------------------------------
/lifestyle_comparison/submit_lifestyletest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | module load R/3.6.1
4 | 
5 | Rscript lifestyle_v_phylogeny.r ../CSEP_CAZyme_prediction/orthogroup-matrices-2022-02-10.RData
6 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/submit_CAZymeprediction.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #Script to submit CAZyme prediction as an array job, one task per protein file
3 | 
4 | #Number of samples
5 | NUM=$(ls ../proteins/*.faa | sed 's#\.\./proteins/##' | wc -l)
6 | 
7 | #run_dbcan.sh sits in run_dbcan/ and reads ../../proteins relative to that directory
8 | cd run_dbcan && qsub -t 1-${NUM} run_dbcan.sh
9 | 


--------------------------------------------------------------------------------
/phylogenomics/submit_RAxML-NG_genetrees.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #Submits RAxMLNG_genetrees.sh as an array job with one task per line of aln_list
3 | 
4 | #Output directories for both sets of gene trees
5 | mkdir gene_trees/RAxML-NG gene_trees_bmge/RAxML-NG
6 | 
7 | N_ALN=$(cat aln_list | wc -l)
8 | qsub -t 1-${N_ALN} RAxMLNG_genetrees.sh
9 | 


--------------------------------------------------------------------------------
/annotation/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 2 Annotation
 4 | 
 5 | Order of steps:
 6 | 
 7 | 1. :file_folder: `repeat_masking`
 8 | 2. :file_folder: `maker`
 9 | 
10 | `strains` contains a list of the strain numbers, which is referred to in various scripts.
11 | 


--------------------------------------------------------------------------------
/phylogenomics/submit_alignment.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #List the single-copy orthologue fasta files, strip the directory path and save the file names to aln_list
 3 | ls -1 ../orthology_inference/OrthoFinder/Results_Oct22/Single_Copy_Orthologue_Sequences/*.fa | sed s/^.*\\/\// > aln_list
 4 | #Number of entries in aln_list, used to size the array job
 5 | NUM=$(cat aln_list | wc -l)
 6 | 
 7 | mkdir gene_trees gene_trees_bmge
 8 | #Submit alignment.sh with one array task per entry in aln_list
 9 | qsub -t 1-${NUM} alignment.sh
10 | 


--------------------------------------------------------------------------------
/annotation/maker/round1/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 2 Annotation
 4 | 
 5 | ### 2.2 MAKER pipeline
 6 | 
 7 | #### Round 1
 8 | 
 9 | `qsub maker.sh` - submits first run of [MAKER](http://www.yandell-lab.org/software/maker.html) using ESTs and proteins, as indicated in `.ctl` files.
10 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/sortadate/README.md:
--------------------------------------------------------------------------------
1 | # *Fusarium* Lifestyles
2 | 
3 | ## 5 Divergence time estimation
4 | 
5 | ### 5.1 SortaDate
6 | 
7 | `qsub sortadate.sh` - reroots the gene trees and the RAxML-NG species tree and runs [SortaDate](https://github.com/FePhyFoFum/SortaDate) to filter for the top ten 'clock-like' genes.


--------------------------------------------------------------------------------
/assembly/denovo_assembly/submit_assembly.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #Script to submit de novo assembly jobs
 3 | 
 4 | STRAINS=$(cat ../strains)
 5 | #For each assembler: create its output directories, then submit its job script
 6 | for ASSEMBLER in abyss megahit spades
 7 | do
 8 | 	mkdir ${ASSEMBLER}
 9 | 	#One fusotu<strain> subdirectory per entry in ../strains
10 | 	for STRAIN in $STRAINS
11 | 	do
12 | 		mkdir ${ASSEMBLER}/fusotu${STRAIN}
13 | 	done
14 | 	#Submit <assembler>.sh from the current directory
15 | 	qsub ${ASSEMBLER}.sh
16 | done
17 | 


--------------------------------------------------------------------------------
/orthology_inference/README.md:
--------------------------------------------------------------------------------
1 | # *Fusarium* Lifestyles
2 | 
3 | ## 3 Orthology inference
4 | 
5 | 1. `./submit_protein_download.sh` - submits `ncbi_ftp_links.r` and `protein_download.sh` to download predicted protein sets of *Fusarium* strains from NCBI.
6 | 2. `qsub orthofinder.sh` - submits orthology inference using [OrthoFinder](https://github.com/davidemms/OrthoFinder).
7 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/submit_CSEPblast.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #Script to submit blast of CSEPs against the PHI-base database
 3 | 
 4 | module load blast+
 5 | 
 6 | makeblastdb -dbtype prot -in blastp/phi-base_current.fas
 7 | 
 8 | #Number of samples
 9 | NUM=$(ls ../proteins/*.faa | sed 's#\.\./proteins/##' | wc -l)
10 | 
11 | cd blastp
12 | 
13 | qsub -t 1-${NUM} blastp.sh
14 | 
15 | 


--------------------------------------------------------------------------------
/annotation/maker/round2/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 2 Annotation
 4 | 
 5 | ### 2.2 MAKER pipeline
 6 | 
 7 | #### Round 2
 8 | 
 9 | 1. `qsub training_snap/snap.sh` - trains [SNAP](https://github.com/KorfLab/SNAP) using gene models from the first MAKER round.
10 | 2. `qsub maker2.sh` - submits second run of MAKER using trained SNAP (as indicated in `.ctl` files).
11 | 


--------------------------------------------------------------------------------
/assembly/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 1 Assembly
 4 | 
 5 | Order of steps:
 6 | 
 7 | 1. :file_folder: `reads`
 8 | 2. :file_folder: `denovo_assembly`
 9 | 3. :file_folder: `polishing`
10 | 4. :file_folder: `assessment`
11 | 
12 | `strains` contains a list of the identifying numbers for the strains newly whole genome sequenced in this study, which is referred to in various scripts.
13 | 


--------------------------------------------------------------------------------
/annotation/repeat_masking/README.md:
--------------------------------------------------------------------------------
1 | # *Fusarium* Lifestyles
2 | 
3 | ## 2 Annotation
4 | 
5 | ### 2.1 Repeatmasking
6 |  
7 | 1. `qsub repeatmodeler.sh` - makes custom repeat library for each strain using [RepeatModeler](https://www.repeatmasker.org/RepeatModeler/).
8 | 2. `qsub repeatmasker.sh` - uses the custom repeat libraries to softmask assemblies using [RepeatMasker](https://www.repeatmasker.org/RepeatMasker/).
9 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/predgpi/predgpi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1    	# Request 1 core
 4 | #$ -l h_rt=24:00:0      # Request 24 hours runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | DIR=$1
10 | SAMPLES=$(cat $2)
11 | 
12 | module load R
13 | 
14 | Rscript PredGPI.r ${DIR} ${SAMPLES}
15 | 


--------------------------------------------------------------------------------
/orthology_inference/orthofinder.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 10    	    # Request 10 cores
 4 | #$ -l h_rt=48:00:0      # Request 48 hours runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | module load anaconda3
10 | conda activate orthofinder
11 | 
12 | ulimit -n 3944
13 | 
14 | orthofinder -f . -t 9
15 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/submit_orthogroupparsing.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1            # Request 1 core
 4 | #$ -l h_rt=1:00:00      # Request 1 hour runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | module load R
10 | 
11 | Rscript orthogroup_parser.r ../orthology_inference/OrthoFinder/Results_Oct22/
12 | 


--------------------------------------------------------------------------------
/assembly/reads/fastqc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           # Set the working directory for the job to the current directory
 3 | #$ -pe smp 4      # Request 4 cores
 4 | #$ -l h_rt=24:0:0 # Request 24 hour runtime
 5 | #$ -l h_vmem=1G   # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -t 1-5
 8 | #Array job: task N checks the trimmed reads of the strain on line N of ../strains
 9 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
10 | 
11 | module load fastqc
12 | #File names carry the FUS_OTU prefix written by trimmomatic.sh
13 | fastqc -t ${NSLOTS} FUS_OTU${STRAIN}_1_trimmedpaired.fastq.gz FUS_OTU${STRAIN}_2_trimmedpaired.fastq.gz
14 | 


--------------------------------------------------------------------------------
/lifestyle_comparison/README.md:
--------------------------------------------------------------------------------
1 | # *Fusarium* Lifestyles
2 | 
3 | ## 7 Lifestyle comparison
4 | 
5 | `./submit_lifestyletest.sh` - runs `lifestyle_v_phylogeny.r` to prepare the input file for `lifestyle-test.sh`, which runs a PERMANOVA-based lifestyle test on orthogroup and CSEP presence-absence matrices; `run_edited.py` is modified from the original script `run.py` by [Mesny & Vannier](https://github.com/fantin-mesny/Effect-Of-Biological-Categories-On-Genomes-Composition).


--------------------------------------------------------------------------------
/selection/gbff_files/ncbi_gbff_download.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #$ -cwd
 3 | #$ -j y
 4 | #$ -l h_rt=1:00:00
 5 | #$ -t 1-56		#number of taxa to download
 6 | 
 7 | #Read ftp link for gbff into variable
 8 | LINK=$(sed -n ${SGE_TASK_ID}p fus_ncbi_genomic)
 9 | 
10 | #Download from the ncbi ftp server
11 | wget $LINK
12 | 
13 | #Read taxon name into variable
14 | FILE=$(echo $LINK | sed 's/^.*\(GCA.*\.gbff\).*$/\1/')
15 | 
16 | #Extract file
17 | gunzip ${FILE}.gz
18 | 


--------------------------------------------------------------------------------
/orthology_inference/protein_download.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #$ -cwd
 3 | #$ -j y
 4 | #$ -l h_rt=1:00:00
 5 | 
 6 | #Read ftp link for proteins into variable
 7 | LINK=$(sed -n ${SGE_TASK_ID}p fus_ncbi_proteins)
 8 | 
 9 | #Download from the ncbi ftp server
10 | wget $LINK
11 | 
12 | #Read taxon name into variable
13 | FILE=$(echo $LINK | sed 's/^.*\(GCA.*\.faa\).*$/\1/')
14 | 
15 | #Extract file
16 | gunzip ${FILE}.gz
17 | 
18 | #Add filename to fasta headers
19 | sed -i "s/>/>${FILE}_/g" $FILE
20 | 


--------------------------------------------------------------------------------
/selection/codon_optimisation/README.md:
--------------------------------------------------------------------------------
1 | # *Fusarium* Lifestyles
2 | 
3 | ## 8 Selection
4 | 
5 | ### 8.2 Codon optimisation
6 | 
7 | 1. `./pull_ribosomes.sh` - extracts ribosomal protein encoding genes from [Fusgr1](https://mycocosm.jgi.doe.gov/Fusgr1/Fusgr1.home.html) and submits `blast.sh` to run BLAST search against all strains in this study.
8 | 2. `./submit_codon_optimisation.sh` - submits `codon_optimisation.r` script to estimate various codon usage bias statistics and codon optimisation values.
9 | 


--------------------------------------------------------------------------------
/assembly/denovo_assembly/abyss.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd
 3 | #$ -pe parallel 48
 4 | #$ -l infiniband=sdv-i
 5 | #$ -l h_rt=24:0:0
 6 | #$ -m bea
 7 | #$ -t 120-128	#kmer sizes to check
 8 | 
 9 | #Array job: each task assembles every strain in ../strains with k = SGE_TASK_ID
10 | STRAINS=$(cat ../strains)
11 | #abyss-pe -C changes directory before running, so pass the reads as absolute paths
12 | READS=$(cd ../reads && pwd)
13 | 
14 | module load abyss
15 | 
16 | for STRAIN in $STRAINS
17 | do
18 | 	mkdir -p abyss/fusotu${STRAIN}/k${SGE_TASK_ID}
19 | 	#Double quotes let the shell expand the variables inside in=; k is passed explicitly
20 | 	abyss-pe -C abyss/fusotu${STRAIN}/k${SGE_TASK_ID} name=fusotu${STRAIN} k=${SGE_TASK_ID} in="${READS}/FUS_OTU${STRAIN}_1_trimmedpaired.fastq.gz ${READS}/FUS_OTU${STRAIN}_2_trimmedpaired.fastq.gz" np=${NSLOTS}
21 | done
22 | 


--------------------------------------------------------------------------------
/assembly/denovo_assembly/README.md:
--------------------------------------------------------------------------------
1 | # *Fusarium* Lifestyles
2 | 
3 | ## 1 Assembly
4 | 
5 | ### 1.2 *De novo* genome assembly
6 |  
7 | 1. `./submit_assembly.sh` - makes new directory and submits job scripts for each assembly tool - `abyss.sh` ([ABySS](https://github.com/bcgsc/abyss)), `megahit.sh` ([MEGAHIT](https://github.com/voutcn/megahit)) and `spades.sh` ([SPAdes](https://github.com/ablab/spades)).
8 | 2. `./abyss_comp.sh` - compares the assembly stats to choose 'best' kmer size for ABySS (must be done after `abyss.sh` has finished for all kmer sizes and strains).
9 | 


--------------------------------------------------------------------------------
/orthology_inference/submit_protein_download.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #Script to download NCBI protein sets and stage this study's own predicted proteins
 3 | 
 4 | module load R 
 5 | 
 6 | #Get FTP links for Fusarium taxa off NCBI
 7 | Rscript ncbi_ftp_links.r
 8 | 
 9 | #Read number of taxa into variable
10 | NUM=$(cat fus_ncbi_proteins | wc -l)
11 | 
12 | #Submit download
13 | qsub -t 1-${NUM} protein_download.sh
14 | 
15 | #Copy across own proteins
16 | cp ../annotation/maker/round3/fusotu*_gag/fusotu*.proteins.fasta .
17 | #Rename only the copied files (a bare * would touch every file in the directory)
18 | rename fasta faa fusotu*.proteins.fasta
19 | 
20 | #Glob the renamed files directly rather than parsing ls output
21 | for FILE in fusotu*.faa
22 | do
23 | 	#Skip the unexpanded pattern if nothing was copied
24 | 	[ -e "${FILE}" ] || continue
25 | 	#Add filename to fasta headers
26 | 	sed -i "s/>/>${FILE}_/g" "${FILE}"
27 | done
28 | 


--------------------------------------------------------------------------------
/annotation/maker/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 2 Annotation
 4 | 
 5 | ### 2.2 MAKER pipeline
 6 | 
 7 | Informed by [this](https://gist.github.com/darencard/bb1001ac1532dd4225b030cf0cd61ce2) tutorial.
 8 |  
 9 | 1. :file_folder: `round1`
10 | 2. :file_folder: `round2`
11 | 3. :file_folder: `round3`
12 | 
13 | Requires ESTs and proteins from [Fusoxy1](https://mycocosm.jgi.doe.gov/Fusoxy1/Fusoxy1.home.html) and [Fuseq1](https://mycocosm.jgi.doe.gov/Fuseq1/Fuseq1.home.html) ([Mesny et al. 2021](https://doi.org/10.1038/s41467-021-27479-y)) downloaded from Mycocosm in this directory.
14 | 


--------------------------------------------------------------------------------
/assembly/denovo_assembly/megahit.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd
 3 | #$ -pe smp 10
 4 | #$ -l h_rt=48:0:0
 5 | #$ -j y
 6 | #$ -m bea
 7 | #$ -t 1-5
 8 | #Array job: task N assembles the strain on line N of ../strains with MEGAHIT
 9 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
10 | 
11 | module load anaconda3
12 | conda activate megahit
13 | #-1 takes the forward (_1_) and -2 the reverse (_2_) trimmed paired reads
14 | megahit -1 ../reads/FUS_OTU${STRAIN}_1_trimmedpaired.fastq.gz \
15 | 	-2 ../reads/FUS_OTU${STRAIN}_2_trimmedpaired.fastq.gz \
16 | 	-t ${NSLOTS} \
17 | 	-o megahit/fusotu${STRAIN} \
18 | 	--k-list 51,59,67,75,83,91,99,107,115,123,131
19 | #Rename the final contigs file so that it carries the strain name
20 | mv megahit/fusotu${STRAIN}/final.contigs.fa megahit/fusotu${STRAIN}/fusotu${STRAIN}-contigs.fa
21 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/run_dbcan/run_dbcan.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1		# Request 1 core
 4 | #$ -l h_rt=12:00:00 	# Request 12 hours runtime
 5 | #$ -l h_vmem=20G   	# Request 20GB RAM
 6 | #$ -j y
 7 | #Pick this task's protein file: the SGE_TASK_ID-th entry of ../../proteins/*.faa, with the directory stripped
 8 | TAXON=$(ls ../../proteins/*.faa | sed 's#\.\./\.\./proteins/##' | sed -n ${SGE_TASK_ID}p)
 9 | 
10 | module load anaconda3
11 | conda activate run_dbcan
12 | 
13 | run_dbcan ../../proteins/${TAXON} protein --out_dir ${TAXON}_dbcan_results
14 | #Keep rows of overview.txt whose 6th field equals 3 (presumably the number of supporting tools - confirm against the dbCAN output format)
15 | awk '$6 == 3' ${TAXON}_dbcan_results/overview.txt > ${TAXON}_cazymes
16 | 


--------------------------------------------------------------------------------
/selection/reroot.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | args=commandArgs(trailingOnly=TRUE)
 3 | 
 4 | #Test if there are at least three arguments: if not, return an error
 5 | if (length(args)<3) {
 6 |   stop("Three arguments must be supplied: a tree first, an outgroup string second and the directory to save the rooted tree third.", call.=FALSE)
 7 | } 
 8 | 
 9 | library(ape)
10 | 
11 | tree <- read.tree(args[1])
12 | outgroup <- as.character(args[2])
13 | #Strip any leading directory path from the input tree file name
14 | file <- sub(".*\\/", "", args[1])
15 | #Root on the outgroup and write to <output dir><file name>_rooted; the output directory is pasted as-is, so it needs a trailing slash
16 | tree <- root(tree, outgroup, resolve.root=TRUE, edgelabel=TRUE)
17 | write.tree(tree, file=paste0(args[3], file, "_rooted"))
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/assembly/reads/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 1 Assembly
 4 | 
 5 | ### 1.1 Read quality control
 6 |  
 7 | Requires raw `fastq.gz` paired-end reads in this directory as well as `TruSeq3-PE.fa` file with adapter sequences downloaded from [here](https://github.com/timflutre/trimmomatic/blob/master/adapters/TruSeq3-PE.fa) (for Illumina NovaSeq 6000 151bp paired-end reads).
 8 | 
 9 | 1. `qsub trimmomatic.sh` - trims raw reads using [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic).
10 | 2. `qsub fastqc.sh` - after trimming, checks read quality with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).


--------------------------------------------------------------------------------
/assembly/assessment/README.md:
--------------------------------------------------------------------------------
1 | # *Fusarium* Lifestyles
2 | 
3 | ## 1 Assembly
4 | 
5 | ### 1.4 Assessment
6 |  
7 | 1. `./submit_assessment.sh` - submits `quast.sh` ([QUAST](https://github.com/ablab/quast)), `busco.sh` ([BUSCO](https://busco.ezlab.org/)) and `blast.sh` ([BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi)) scripts for assembly quality statistics. `busco.sh` requires the Hypocreales BUSCO dataset downloaded from [here](https://busco-data.ezlab.org/v4/data/lineages/).
8 | 2. `qsub blobtools.sh` - submits `blobtools.sh` to run [BlobTools](https://github.com/DRL/blobtools) (must be done after `blast.sh` has finished for all strains).
9 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/reroot.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | args=commandArgs(trailingOnly=TRUE)
 3 | 
 4 | #Test if there are at least three arguments: if not, return an error
 5 | if (length(args)<3) {
 6 |   stop("Three arguments must be supplied: a tree first, an outgroup string second and the directory to save the rooted tree third.", call.=FALSE)
 7 | } 
 8 | 
 9 | library(ape)
10 | 
11 | tree <- read.tree(args[1])
12 | outgroup <- as.character(args[2])
13 | #Strip any leading directory path from the input tree file name
14 | file <- sub(".*\\/", "", args[1])
15 | #Root on the outgroup and write to <output dir><file name>_rooted; the output directory is pasted as-is, so it needs a trailing slash
16 | tree <- root(tree, outgroup, resolve.root=TRUE, edgelabel=TRUE)
17 | write.tree(tree, file=paste0(args[3], file, "_rooted"))
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/annotation/repeat_masking/repeatmasker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 5		# Request 5 cores
 4 | #$ -l h_rt=24:00:0 	# Request 24 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | #$ -t 1-5
 9 | #Array job: task N softmasks the assembly of the strain on line N of ../strains
10 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
11 | 
12 | module load anaconda3
13 | conda activate repeatmasker
14 | #Output directory name must match the -dir argument passed to RepeatMasker below
15 | mkdir -p fusotu${STRAIN}_abyss_masked
16 | RepeatMasker -e ncbi -lib fusotu${STRAIN}/RM*/consensi.fa -pa ${NSLOTS} -xsmall -dir fusotu${STRAIN}_abyss_masked ../../assembly/polishing/fusotu${STRAIN}_abyss_pilon_filtered.fa
17 | 


--------------------------------------------------------------------------------
/selection/hyphy/aBSREL.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1		# Request 1 core
 4 | #$ -l h_rt=5:00:00 	# Request 5 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | ORTHO=$(cat ../../phylogenomics/aln_list | sed 's/\.fa//' | sed -n ${SGE_TASK_ID}p)
 9 | 
10 | module load anaconda3
11 | conda activate hyphy-2.5.30
12 | 
13 | hyphy absrel 	--alignment ../alignments/codon/${ORTHO}_aln_nuc.fa \
14 |                 --tree ../trees/dated_tree_absrel.tre \
15 |                 --output absrel/${ORTHO}_aBSREL.json \
16 |                 --branches FOREGROUND
17 | 


--------------------------------------------------------------------------------
/selection/hyphy/BUSTED.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1		# Request 1 core
 4 | #$ -l h_rt=24:00:00 	# Request 24 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | #Orthogroup for this array task: line SGE_TASK_ID of the alignment list with the first '.fa' removed
 9 | ORTHO=$(sed -n "${SGE_TASK_ID}p" ../../phylogenomics/aln_list | sed 's/\.fa//')
10 | ALN=../alignments/codon/${ORTHO}_aln_nuc.fa
11 | TREE=../trees/${ORTHO}.raxml.bestTree_rooted_hyphy
12 | 
13 | module load anaconda3
14 | conda activate hyphy-2.5.30
15 | 
16 | #BUSTED on the codon alignment with the orthogroup's own gene tree, passing FOREGROUND as the branch set
17 | hyphy busted \
18 | 	--alignment ${ALN} \
19 | 	--tree ${TREE} \
20 | 	--output busted/${ORTHO}_BUSTED.json \
21 | 	--branches FOREGROUND
22 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 2 Annotation
 4 | 
 5 | ### 2.2 MAKER pipeline
 6 | 
 7 | #### Round 3
 8 | 
 9 | 1. `qsub training_snap2/snap2.sh` - trains SNAP again using gene models from the second MAKER round.
10 | 2. `qsub maker3.sh` - submits third run of MAKER using second trained SNAP (as indicated in `.ctl` files).
11 | 3. `qsub rename.sh` - after obtaining unique locus tags from e.g. NCBI (see `locus_tags.txt`), renames IDs in gff and fasta files.
12 | 4. `qsub gag.sh` - runs [GAG](https://github.com/genomeannotation/GAG/) to remove introns <10bp, remove terminal Ns and correct start and stop codons in gff file for NCBI compliance.
13 | 


--------------------------------------------------------------------------------
/assembly/reads/trimmomatic.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           
 3 | #$ -pe smp 4      
 4 | #$ -l h_rt=01:00:00 
 5 | #$ -l h_vmem=2G   
 6 | #$ -t 1-5
 7 | 
 8 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
 9 | 
10 | module load trimmomatic
11 | 
12 | java -jar /share/apps/centos7/trimmomatic/0.36/trimmomatic-0.36.jar PE -threads ${NSLOTS} -trimlog fusotu${STRAIN}-trimmomatic.log FUS_OTU${STRAIN}_1.fastq.gz FUS_OTU${STRAIN}_2.fastq.gz FUS_OTU${STRAIN}_1_trimmedpaired.fastq.gz FUS_OTU${STRAIN}_1_trimmedunpaired.fastq.gz FUS_OTU${STRAIN}_2_trimmedpaired.fastq.gz FUS_OTU${STRAIN}_2_trimmedunpaired.fastq.gz ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
13 | 


--------------------------------------------------------------------------------
/assembly/denovo_assembly/abyss_comp.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | STRAINS=$(cat ../strains)
 4 | 
 5 | module load abyss
 6 | 
 7 | for STRAIN in $STRAINS
 8 | do
 9 | 	abyss-fac abyss/fusotu${STRAIN}/k*/fusotu${STRAIN}-contigs.fa > abyss/fusotu${STRAIN}/fusotu${STRAIN}_abyss_k_comparison.tsv
10 | 
11 | 	KMER=$(tail -n +2 abyss/fusotu${STRAIN}/fusotu${STRAIN}_abyss_k_comparison.tsv | sort -n -k 6 | awk '{print $11}' | sed "s#abyss/fusotu${STRAIN}/##" | sed "s#/fusotu${STRAIN}-contigs.fa##" | tail -n1)
12 | 
13 | 	echo "${KMER} selected" >> abyss/fusotu${STRAIN}/fusotu${STRAIN}_abyss_k_comparison.tsv
14 | 
15 | 	cp abyss/fusotu${STRAIN}/${KMER}/fusotu${STRAIN}-contigs.fa abyss/fusotu${STRAIN}/
16 | done
17 | 


--------------------------------------------------------------------------------
/assembly/assessment/quast.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 4		# Request 4 cores
 4 | #$ -l h_rt=0:30:0 	# Request 30 min runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | 
 9 | STRAINS=$(cat ../strains)
10 | 
11 | module load anaconda3
12 | conda activate quast
13 | 
14 | for STRAIN in $STRAINS
15 | do
16 | 	quast.py ../polishing/fusotu${STRAIN}_abyss_pilon_filtered.fa ../polishing/fusotu${STRAIN}_megahit_pilon_filtered.fa ../polishing/fusotu${STRAIN}_spades_pilon_filtered.fa -o fusotu${STRAIN}_quast_results -t ${NSLOTS} --fungus -l "ABySS v2.0.2, MEGAHIT v1.2.9, SPAdes v3.11.1"
17 | done
18 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/blank_topology.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | #Writes a topology-only ("blank") copy of a tree: branch lengths and node labels are removed
 3 | #and tip labels are truncated to their first 13 characters
 4 | #Usage: blank_topology.r <tree file> <output directory>
 5 | #Output: <output directory>/<tree file name>_blank
 6 | args=commandArgs(trailingOnly=TRUE)
 7 | 
 8 | #Test if there are two arguments: if not, return an error
 9 | if (length(args)<2) {
10 |   stop("Two arguments must be supplied: a tree first and the directory to save the blank tree second.", call.=FALSE)
11 | } 
12 | 
13 | library(ape)
14 | 
15 | #Read in tree
16 | tree <- read.tree(args[1])
17 | 
18 | #Name of the input file without its directory, used to name the output
19 | file <- sub(".*\\/", "", args[1])
20 | 
21 | #Output directory; a trailing slash is added when missing so that the output is written
22 | #inside the directory rather than to a sibling path with the directory name as prefix
23 | outdir <- args[2]
24 | if (nzchar(outdir) && !grepl("/$", outdir)) {
25 |   outdir <- paste0(outdir, "/")
26 | }
27 | 
28 | #Remove branch lengths and node labels, and truncate tip labels to 13 characters
29 | tree$edge.length <- NULL
30 | tree$node.label <- NULL
31 | tree$tip.label <- substr(tree$tip.label, 1, 13) 
32 | 
33 | write.tree(tree, file=paste0(outdir, file, "_blank"))
34 | 


--------------------------------------------------------------------------------
/assembly/denovo_assembly/spades.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 4		# Request 4 cores
 4 | #$ -l h_rt=48:0:0 	# Request 48 hours runtime
 5 | #$ -l highmem
 6 | #$ -l h_vmem=60G   	# Request 60GB RAM
 7 | #$ -j y
 8 | #$ -m bea
 9 | #$ -t 1-5
10 | 
11 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
12 | 
13 | module load spades
14 | 
15 | spades.py 	-1 ../reads/FUS_OTU${STRAIN}_1_trimmedpaired.fastq.gz \
16 | 		-2 ../reads/FUS_OTU${STRAIN}_2_trimmedpaired.fastq.gz \
17 | 		--careful \
18 | 		-o spades/fusotu${STRAIN} \
19 | 		-t ${NSLOTS}
20 | 
21 | mv spades/fusotu${STRAIN}/contigs.fasta spades/fusotu${STRAIN}/fusotu${STRAIN}-contigs.fa
22 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/signalp/signalp.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd 		    # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1    	# Request 1 core
 4 | #$ -l h_rt=48:00:0  # Request 48 hours runtime
 5 | #$ -l h_vmem=5G     # Request 5GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | DIR=$1
10 | SAMPLES=$(cat $2)
11 | 
12 | cd /data/home/btx494/Programmes/signalp-5.0b/bin/
13 | 
14 | for i in $SAMPLES
15 | do
16 | 	./signalp -fasta ${DIR}${i} -org euk -prefix ${i}_signalp
17 | 	#List of signal peptide genes
18 |         cat ${i}_signalp_summary.signalp5 | awk '$2=="SP(Sec/SPI)" {print $1}' > ${i}_signalp_SPlist
19 | 	mv ${i}_signalp* /data/SBCS-BuggsLab/RowenaHill/fus_comparison/CSEP_prediction/signalp/
20 | done
21 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/tmhmm/tmhmm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1    	    # Request 1 core
 4 | #$ -l h_rt=12:00:00     # Request 12 hours runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | DIR=$1
10 | SAMPLES=$(cat $2)
11 | RUN_DIR=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/CSEP_prediction
12 | 
13 | for i in $SAMPLES
14 | do
15 | 	#Run TMHMM on the protein file and keep the full output
16 | 	/data/home/btx494/Programmes/tmhmm-2.0c/bin/tmhmm ${DIR}${i} > ${RUN_DIR}/tmhmm/${i}_tmhmm
17 | 	#List of proteins with more than one predicted TM helix: on the "Number of predicted TMHs" lines the ID is
18 | 	#the 2nd field and the count is the last field. Done in awk alone because $'\t' is not POSIX sh syntax
19 | 	awk '/Number of predicted TMHs/ && $NF>1 { print $2 }' ${RUN_DIR}/tmhmm/${i}_tmhmm > ${RUN_DIR}/tmhmm/${i}_tmhmm_TMlist
20 | done
21 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/mcmctree_step1.ctl:
--------------------------------------------------------------------------------
 1 |           seed = -1
 2 |        seqfile = fus_proteins_dating10_mcmctree_short.phy
 3 |       treefile = fus_proteins_62T.raxml.support_rooted_blank
 4 |        outfile = mcmctree_step1_output.txt
 5 |          ndata = 1
 6 |        seqtype = 2  * 0: nucleotides; 1:codons; 2:AAs
 7 |        usedata = 3    * 0: no data; 1:seq like; 2:use in.BV; 3: out.BV
 8 |          clock = 2    * 1: global clock; 2: independent rates; 3: correlated rates
 9 |          model = 0    * 0:JC69, 1:K80, 2:F81, 3:F84, 4:HKY85
10 |          alpha = 0    * alpha for gamma rates at sites
11 |          ncatG = 5    * No. categories in discrete gamma
12 |      cleandata = 0    * remove sites with ambiguity data (1:yes, 0:no)?
13 | 


--------------------------------------------------------------------------------
/phylogenomics/modeltest-ng/modeltestng.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 6            # Request 6 cores
 4 | #$ -l h_rt=240:00:00    # Request max hours runtime
 5 | #$ -l h_vmem=10G        # Request 10GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | #Work in the directory of the partition matching this array task; stop if it does not exist
10 | #(the previously computed PARTITION variable was never used and has been dropped)
11 | cd partition.${SGE_TASK_ID} || exit 1
12 | 
13 | module load anaconda3
14 | conda activate modeltest-ng
15 | 
16 | #Amino acid model selection for this partition, once for each of the two concatenated alignments
17 | modeltest-ng -d aa -i fus_proteins_62T_concat.phy -q fus_proteins_62T_partition.num${SGE_TASK_ID} -p ${NSLOTS} -T raxml
18 | modeltest-ng -d aa -i fus_proteins_bmge_62T_concat.phy -q fus_proteins_bmge_62T_partition.num${SGE_TASK_ID} -p ${NSLOTS} -T raxml
19 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/prosite/ps_scan.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1    	    # Request 1 core
 4 | #$ -l h_rt=01:00:00     # Request 1 hour runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | DIR=$1
10 | SAMPLES=$(cat $2)
11 | RUN_DIR=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/CSEP_prediction
12 | 
13 | for i in $SAMPLES
14 | do
15 | 	perl /data/home/btx494/Programmes/ps_scan/ps_scan.pl -p PS00014 -o scan -d /data/home/btx494/Programmes/ps_scan/prosite.dat ${DIR}${i} > ${RUN_DIR}/prosite/${i}_psscan
16 | 	grep ">" ${RUN_DIR}/prosite/${i}_psscan | awk '{print $1}' | tr -d '>' > ${RUN_DIR}/prosite/${i}_psscan_ERlist
17 | done
18 | 


--------------------------------------------------------------------------------
/assembly/assessment/blast.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 10 		# Request 10 cores
 4 | #$ -l h_rt=240:00:00 	# Request max hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | #$ -t 1-5
 9 | 
10 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
11 | 
12 | module load anaconda3
13 | conda activate blast
14 | 
15 | blastn 	-query ../denovo_assembly/abyss/fusotu${STRAIN}/fusotu${STRAIN}-contigs.fa \
16 |         -db /data/scratch/btx494/nt \
17 |         -outfmt '6 qseqid staxids bitscore std' \
18 |         -max_target_seqs 1 \
19 |         -max_hsps 1 \
20 |         -evalue 1e-25 \
21 |         -out fusotu${STRAIN}_abyss_blast.tsv \
22 |         -num_threads ${NSLOTS}
23 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/targetp/targetp.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1    	    # Request 1 core
 4 | #$ -l h_rt=48:00:0      # Request 48 hours runtime
 5 | #$ -l h_vmem=3G         # Request 3GB RAM
 6 | #$ -j y
 7 | 
 8 | #Arguments: directory of protein fasta files, then a file listing the fasta files to run
 9 | DIR=$1
10 | SAMPLE=$(sed -n "${SGE_TASK_ID}p" $2)
11 | RUN_DIR=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/CSEP_prediction
12 | #Prefix shared by the TargetP outputs for this sample
13 | OUT=${RUN_DIR}/targetp/${SAMPLE}_targetp
14 | 
15 | /data/home/btx494/Programmes/targetp-2.0/bin/targetp -fasta ${DIR}${SAMPLE} -org non-pl -prefix ${OUT} -tmp ${RUN_DIR}/targetp/tmp_${SAMPLE}
16 | 
17 | #List of signal peptide genes (rows of the summary whose 2nd column is SP)
18 | awk '$2=="SP" { print $1}' ${OUT}_summary.targetp2 > ${OUT}_SPlist
19 | 


--------------------------------------------------------------------------------
/assembly/assessment/busco.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 5		# Request 5 cores
 4 | #$ -l h_rt=48:00:00 	# Request 48 hours runtime
 5 | #$ -l h_vmem=2G   	# Request 2GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | #$ -t 1-5
 9 | 
10 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
11 | 
12 | module load busco
13 | 
14 | export AUGUSTUS_CONFIG_PATH=/data/SBCS-BuggsLab/RowenaHill/genome_assemblies/augustus_config/config
15 | 
16 | for ASSEMBLER in abyss megahit spades
17 | do
18 | 	BUSCO.py -i ../polishing/fusotu${STRAIN}_${ASSEMBLER}_pilon_filtered.fa -c ${NSLOTS} -o fusotu${STRAIN}_${ASSEMBLER} -m genome -l /data/SBCS-BuggsLab/RowenaHill/genome_assemblies/busco_datasets/hypocreales_odb10.2019-11-20 -sp fusarium
19 | done
20 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/mcmctree_correlated.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1            # Request 1 core
 4 | #$ -l h_rt=5:00:00      # Request 5 hours runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | 
 9 | #Each array task runs in its own directory
10 | RUN=run${SGE_TASK_ID}_correlated
11 | mkdir -p ${RUN}
12 | 
13 | #Copy the inputs; files are listed explicitly because brace expansion is not available in POSIX sh
14 | cp in.BV \
15 |    fus_proteins_62T.raxml.support_dating \
16 |    fus_proteins_dating10_mcmctree_short.phy \
17 |    mcmctree_step2_correlated.ctl \
18 |    ${RUN}/
19 | 
20 | #Stop if the run directory is unavailable so that the template control file is never edited in place
21 | cd ${RUN} || exit 1
22 | 
23 | #Give the MCMC sample file a run-specific name in this run's copy of the control file
24 | sed -i "s/mcmcfile = mcmc.txt/mcmcfile = mcmc_run${SGE_TASK_ID}_correlated.txt/" mcmctree_step2_correlated.ctl
25 | 
26 | module load anaconda3
27 | conda activate paml
28 | 
29 | mcmctree mcmctree_step2_correlated.ctl
30 | 


--------------------------------------------------------------------------------
/selection/codon_optimisation/submit_codon_optimisation.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | #Make fasta file of core single copy orthogroups for each taxon
 4 | 
 5 | for ORTHO in $(ls ../alignments/codon/*.fa | sed 's#\.\./alignments/codon/##')
 6 | do
 7 | 	awk '/>/{sub(">","&"FILENAME"_");sub(/\.fasta/,x)}1' ../alignments/codon/${ORTHO} | sed 's#\.\./alignments/codon/##' > ${ORTHO}.tmp
 8 | done
 9 | 
10 | for TAXON in $(ls ../../proteins/*.faa | sed 's#\.\./\.\./proteins/##' | sed 's/\.faa//')
11 | do
12 | 	awk -v p="$TAXON" 'BEGIN{ ORS=""; RS=">"; FS="\n" } $1 ~ p { print ">" $0 }' *.tmp | sed "s/_${TAXON}//" | sed 's/_aln_nuc\.fa//' > ${TAXON}_coreSC.fa
13 | done
14 | 
15 | rm *.tmp
16 | 
17 | module load R/4.0.2
18 | 
19 | Rscript codon_optimisation.r ../../orthology_inference/OrthoFinder/Results_Oct22/
20 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/effectorp/effectorp.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1    	    # Request 1 core
 4 | #$ -l h_rt=00:30:0      # Request 30 minutes runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | DIR=$1
 9 | SAMPLE=$(cat $2 | sed -n ${SGE_TASK_ID}p)
10 | RUN_DIR=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/CSEP_prediction
11 | 
12 | python /data/home/btx494/Programmes/EffectorP-3.0/EffectorP.py -i ${DIR}${SAMPLE} > ${RUN_DIR}/effectorp/${SAMPLE}_effectorp
13 | #List of effectors
14 | awk '/# Identifier/{flag=1;next}/-----------------/{flag=0}flag' ${RUN_DIR}/effectorp/${SAMPLE}_effectorp | awk '$NF=="effector" { print $1}' | sed '/^$/d' > ${RUN_DIR}/effectorp/${SAMPLE}_effectorlist
15 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/mcmctree_independent.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1            # Request 1 core
 4 | #$ -l h_rt=5:00:00      # Request 5 hours runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | 
 9 | #Each array task runs in its own directory
10 | RUN=run${SGE_TASK_ID}_independent
11 | mkdir -p ${RUN}
12 | 
13 | #Copy the inputs; files are listed explicitly because brace expansion is not available in POSIX sh
14 | cp in.BV \
15 |    fus_proteins_62T.raxml.support_dating \
16 |    fus_proteins_dating10_mcmctree_short.phy \
17 |    mcmctree_step2_independent.ctl \
18 |    ${RUN}/
19 | 
20 | #Stop if the run directory is unavailable so that the template control file is never edited in place
21 | cd ${RUN} || exit 1
22 | 
23 | #Give the MCMC sample file a run-specific name in this run's copy of the control file
24 | sed -i "s/mcmcfile = mcmc.txt/mcmcfile = mcmc_run${SGE_TASK_ID}_independent.txt/" mcmctree_step2_independent.ctl
25 | 
26 | module load anaconda3
27 | conda activate paml
28 | 
29 | mcmctree mcmctree_step2_independent.ctl
30 | 


--------------------------------------------------------------------------------
/selection/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 8 Selection
 4 | 
 5 | ### 8.1 dN/dS analysis
 6 | 
 7 | 1. `qsub gbff_files/ncbi_gbff_download.sh` - downloads GBFF files for the strains used in this study from NCBI; also need [Ilysp1 transcripts downloaded from Mycocosm](https://mycocosm.jgi.doe.gov/Ilysp1/Ilysp1.home.html) in `gbff_files` directory.
 8 | 2. `./submit_pal2nal.sh` - submits `pal2nal.sh` script to pull corresponding nucleotides for all proteins using `pull_nucleotides.py` and prepares codon alignments using [PAL2NAL](http://www.bork.embl.de/pal2nal/).
 9 | 3. `./submit_hyphy.sh` - prepares file inputs and submits scripts for [HyPhy](https://github.com/veg/hyphy) dN/dS methods - `hyphy/BUSTED.sh`, `hyphy/aBSREL.sh` and `hyphy/Contrast-FEL.sh`.
10 | 
11 | 4. :file_folder: `codon_optimisation`
12 | 


--------------------------------------------------------------------------------
/selection/codon_optimisation/blast.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1 		# Request 1 core
 4 | #$ -l h_rt=1:00:00 	# Request 1 hour runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y 
 7 | 
 8 | TAXON=$(ls ../../proteins/*.faa | sed 's#\.\./\.\./proteins/##' | sed -n ${SGE_TASK_ID}p)
 9 | 
10 | module load blast+
11 | 
12 | makeblastdb -dbtype prot -in ../../proteins/${TAXON}
13 | 
14 | blastp \
15 | -query ribosomal_proteins.fasta \
16 | -db ../../proteins/${TAXON} \
17 | -outfmt '6 qseqid sseqid evalue bitscore pident length' \
18 | -evalue 1e-25 \
19 | -out ${TAXON}_blast \
20 | -num_threads ${NSLOTS}
21 | 
22 | #Make file with list of protein names with blast hits
23 | awk '{print $2}' ${TAXON}_blast > ${TAXON}_ribosomes
24 | 


--------------------------------------------------------------------------------
/annotation/repeat_masking/repeatmodeler.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 5		# Request 5 cores
 4 | #$ -l h_rt=24:00:0 	# Request 24 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | #$ -t 1-5
 9 | 
10 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
11 | 
12 | #Per-strain working directory; stop if it cannot be entered so nothing is written to the submission directory
13 | mkdir -p fusotu${STRAIN}
14 | 
15 | cd fusotu${STRAIN} || exit 1
16 | 
17 | module load singularity
18 | 
19 | #Build the RepeatModeler database from the polished, filtered ABySS assembly
20 | #(the .fa extension matches the file name used by the other scripts that read this assembly)
21 | singularity exec /data/containers/repeatmodeler/repeatmodeler-2.0.1.simg BuildDatabase -name fusotu${STRAIN}_abyss ../../../assembly/polishing/fusotu${STRAIN}_abyss_pilon_filtered.fa
22 | 
23 | #Run RepeatModeler, capturing stdout and stderr in one log (POSIX form of the csh/bash-only '>&')
24 | singularity exec /data/containers/repeatmodeler/repeatmodeler-2.0.1.simg RepeatModeler -database fusotu${STRAIN}_abyss -engine ncbi -pa 1 -LTRStruct > fusotu${STRAIN}_abyss.out 2>&1
25 | 


--------------------------------------------------------------------------------
/selection/submit_pal2nal.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | #Number of samples
 4 | NUM_ORTHO=$(ls -1 ../phylogenomics/gene_trees/*_aln.fa | wc -l)
 5 | 
 6 | cat gbff_files/GCA* > gbff_files/concat.gbff
 7 | 
 8 | #Clear outputs of any previous run (-f so that a first run does not report missing files)
 9 | rm -f gbff_files/own_fus_transcripts.fa pal2nal_check
10 | 
11 | #Collect the transcripts of the strains annotated here: headers are cut at the first space
12 | #and every '>' is followed by the name of the strain's protein file
13 | for i in 1 3 5 6 7
14 | do
15 | 	sed 's/ .*//' ../annotation/maker/round3/fusotu${i}_gag2/fusotu${i}.mrna.fasta | sed "s/>/>fusotu${i}.proteins.faa_/g" >> gbff_files/own_fus_transcripts.fa
16 | done
17 | 
18 | #Append the Ilysp1 transcripts; on any line containing '|', everything up to the last '|' is replaced by the Ilysp1 protein file prefix
19 | cat gbff_files/own_fus_transcripts.fa gbff_files/Ilysp1_GeneCatalog_transcripts_20121116.nt.fasta | sed 's/.*|/\>Ilysp1_GeneCatalog_proteins_20121116.faa_/' > tmp && mv tmp gbff_files/own_fus_transcripts.fa
20 | 
21 | #-p so that rerunning does not fail on existing directories
22 | mkdir -p alignments alignments/codon trees
23 | 
24 | qsub -t 1-${NUM_ORTHO} pal2nal.sh
25 | 


--------------------------------------------------------------------------------
/selection/hyphy/Contrast-FEL.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1		# Request 1 core
 4 | #$ -l h_rt=12:00:00 	# Request 12 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | #Orthogroup for this array task: line SGE_TASK_ID of the alignment list with the first '.fa' removed
 9 | ORTHO=$(sed -n "${SGE_TASK_ID}p" ../../phylogenomics/aln_list | sed 's/\.fa//')
10 | ALN=../alignments/codon/${ORTHO}_aln_nuc.fa
11 | LIFESTYLES="endophyte insectmutualist mycoparasite plantassociate plantpathogen saprotroph"
12 | 
13 | module load anaconda3
14 | conda activate hyphy-2.5.30
15 | 
16 | #One Contrast-FEL run per lifestyle, each reading the gene tree file carrying that lifestyle's suffix
17 | for LIFESTYLE in $LIFESTYLES
18 | do
19 | 	hyphy contrast-fel \
20 | 		--alignment ${ALN} \
21 | 		--tree ../trees/${ORTHO}.raxml.bestTree_rooted.${LIFESTYLE} \
22 | 		--output contrast-fel/${ORTHO}_Contrast-FEL_${LIFESTYLE}.json \
23 | 		--branch-set lifestyle
24 | done
25 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/gag.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1            # Request 1 core
 4 | #$ -l h_rt=1:00:0       # Request 1 hour runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -t 1-5               # One task per strain, matching the other per-strain scripts (was a single task, 3)
 8 | 
 9 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../../strains)
10 | 
11 | python2.7 ~/Programmes/genomeannotation-GAG-997e384/gag.py \
12 | -f ../../../assembly/polishing/fusotu${STRAIN}_abyss_pilon_filtered.fa \
13 | -g fusotu${STRAIN}_abyss_rnd3.maker.output/fusotu${STRAIN}_abyss_rnd3.all.maker.gff \
14 | -ris 10 \
15 | --fix_terminal_ns \
16 | --fix_start_stop \
17 | -o fusotu${STRAIN}_gag
18 | 
19 | #Stop if the GAG output directory is missing, otherwise the rename below would act on the files of the submission directory
20 | cd fusotu${STRAIN}_gag || exit 1
21 | 
22 | #Swap 'genome' in the output file names for the strain name
23 | rename genome fusotu${STRAIN} *
24 | #Give the genome fasta the .fsa extension
25 | mv fusotu${STRAIN}.fasta fusotu${STRAIN}.fsa
26 | #Strip the first 'protein|' on each line of the protein fasta
27 | sed -i 's/protein|//' fusotu${STRAIN}.proteins.fasta
28 | 


--------------------------------------------------------------------------------
/phylogenomics/species_tree/iqtree.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 8            # Request 8 cores
 4 | #$ -l h_rt=240:00:00    # Request max hours runtime
 5 | #$ -l h_vmem=5G         # Request 5GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | mkdir iqtree
10 | 
11 | module load anaconda3
12 | conda activate IQ-Tree
13 | 
14 | iqtree 	-s fus_proteins_62T_concat.phy \
15 |         -spp fus_proteins_62T_iqtreepartition.txt \
16 |         -bb 1000 \
17 |         -nt ${NSLOTS} \
18 |         -pre iqtree/fus_proteins_62T_iqtree \
19 |         -m MFP
20 | 
21 | iqtree  -s fus_proteins_bmge_62T_concat.phy \
22 |         -spp fus_proteins_bmge_62T_iqtreepartition.txt \
23 |         -bb 1000 \
24 |         -nt ${NSLOTS} \
25 |         -pre iqtree/fus_proteins_bmge_62T_iqtree \
26 |         -m MFP
27 | 


--------------------------------------------------------------------------------
/assembly/polishing/mito_remove_3:
--------------------------------------------------------------------------------
 1 | 1145_pilon	888	mitochondrion
 2 | 1451_pilon	1453	mitochondrion
 3 | 1610_pilon	314	mitochondrion
 4 | 1809_pilon	793	mitochondrion
 5 | 1843_pilon	229	mitochondrion
 6 | 2445_pilon	2025	mitochondrion
 7 | 2478_pilon	611	mitochondrion
 8 | 292_pilon	2688	mitochondrion
 9 | 3053_pilon	303	mitochondrion
10 | 3281_pilon	1493	mitochondrion
11 | 3416_pilon	1159	mitochondrion
12 | 3700_pilon	1112	mitochondrion
13 | 3935_pilon	581	mitochondrion
14 | 3938_pilon	1007	mitochondrion
15 | 4483_pilon	644	mitochondrion
16 | 4691_pilon	323	mitochondrion
17 | 4726_pilon	302	mitochondrion
18 | 5035_pilon	552	mitochondrion
19 | 5522_pilon	1638	mitochondrion
20 | 5695_pilon	1510	mitochondrion
21 | 94_pilon	2907	mitochondrion
22 | 5525_pilon	3114	1..409,2980..3114	mitochondrion-not_cleaned
23 | 5731_pilon	6979	1..1301,2358..4154,5483..5721	mitochondrion-not_cleaned
24 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 5 Divergence time estimation
 4 | 
 5 | ### 5.2 MCMCTree
 6 | 
 7 | 1. `qsub mcmctree_dating_step1.sh` - adds secondary time calibrations to species tree nodes and submits first step of approximate likelihood divergence time estimation with protein data using MCMCTree from [PAML](http://abacus.gene.ucl.ac.uk/software/paml.html) (see [tutorial](http://abacus.gene.ucl.ac.uk/software/MCMCtree.Tutorials.pdf)).
 8 | 2. `Rscript estimate_rate.r` - estimates the scaling parameter for the substitution rate prior to be added to `mcmctree_step2_independent.ctl` and `mcmctree_step2_correlated.ctl`.
 9 | 3. `./submit_mcmctree_dating_step2.sh` - submits `mcmctree_independent.sh` and `mcmctree_correlated.sh` for second step of approximate likelihood estimation for both independent and correlated rates relaxed clock models.
10 | 


--------------------------------------------------------------------------------
/assembly/assessment/blobtools.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1 		# Request 1 core
 4 | #$ -l h_rt=1:00:00 	# Request 1 hour runtime
 5 | #$ -l h_vmem=3G   	# Request 3GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | #$ -t 1-5
 9 | 
10 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
11 | 
12 | module load samtools
13 | 
14 | samtools index ../polishing/fusotu${STRAIN}_abyss_mapped_sorted_dups.bam
15 | 
16 | module load anaconda3
17 | conda activate blobtools
18 | 
19 | ~/Programmes/blobtools/blobtools create 	-i ../denovo_assembly/abyss/fusotu${STRAIN}/fusotu${STRAIN}-contigs.fa \
20 | 						-b ../polishing/fusotu${STRAIN}_abyss_mapped_sorted_dups.bam \
21 | 						-t fusotu${STRAIN}_abyss_blast.tsv \
22 | 						-o fusotu${STRAIN}_abyss
23 | 
24 | ~/Programmes/blobtools/blobtools plot 	-r species \
25 | 					-i fusotu${STRAIN}_abyss.blobDB.json
26 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/mcmctree_step2_correlated.ctl:
--------------------------------------------------------------------------------
 1 |           seed = -1
 2 |        seqfile = fus_proteins_dating10_mcmctree_short.phy
 3 |       treefile = fus_proteins_62T.raxml.support_dating
 4 |       mcmcfile = mcmc.txt
 5 |        outfile = mcmctree_step2_out.txt
 6 |          ndata = 1
 7 |        seqtype = 2    	 * 0: nucleotides; 1:codons; 2:AAs
 8 |        usedata = 2    	 * 0: no data; 1:seq like; 2:use in.BV; 3: out.BV
 9 |          clock = 3    	 * 1: global clock; 2: independent rates; 3: correlated rates
10 |      cleandata = 0    	 * remove sites with ambiguity data (1:yes, 0:no)?
11 |        BDparas = 1 1 0   * birth, death, sampling
12 |    rgene_gamma = 1 4.5   * gamma prior for overall rates for genes
13 |   sigma2_gamma = 1 10    * gamma prior for sigma^2     (for clock=2 or 3)
14 |          print = 1
15 |         burnin = 2000
16 |       sampfreq = 10
17 |        nsample = 20000
18 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/mcmctree_step2_independent.ctl:
--------------------------------------------------------------------------------
 1 |           seed = -1
 2 |        seqfile = fus_proteins_dating10_mcmctree_short.phy
 3 |       treefile = fus_proteins_62T.raxml.support_dating
 4 |       mcmcfile = mcmc.txt
 5 |        outfile = mcmctree_step2_out.txt
 6 |          ndata = 1
 7 |        seqtype = 2    	 * 0: nucleotides; 1:codons; 2:AAs
 8 |        usedata = 2    	 * 0: no data; 1:seq like; 2:use in.BV; 3: out.BV
 9 |          clock = 2    	 * 1: global clock; 2: independent rates; 3: correlated rates
10 |      cleandata = 0    	 * remove sites with ambiguity data (1:yes, 0:no)?
11 |        BDparas = 1 1 0   * birth, death, sampling
12 |    rgene_gamma = 1 4.5   * gamma prior for overall rates for genes
13 |   sigma2_gamma = 1 10    * gamma prior for sigma^2     (for clock=2 or 3)
14 |          print = 1
15 |         burnin = 2000
16 |       sampfreq = 10
17 |        nsample = 20000
18 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/rename.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1        	# Request 1 core
 4 | #$ -l h_rt=1:00:0 	# Request 1 hour runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -t 1-5
 8 | 
 9 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../../strains)
10 | TAG=$(sed -n ${SGE_TASK_ID}p locus_tags.txt)
11 | 
12 | cd fusotu${STRAIN}_abyss_rnd3.maker.output
13 | 
14 | module load maker
15 | 
16 | maker_map_ids 	--prefix ${TAG}_ \
17 | 		--justify 6 fusotu${STRAIN}_abyss_rnd3.all.maker.gff > fusotu${STRAIN}_abyss_rnd3.all.maker.map
18 | 
19 | map_gff_ids fusotu${STRAIN}_abyss_rnd3.all.maker.map fusotu${STRAIN}_abyss_rnd3.all.maker.gff
20 | 
21 | map_fasta_ids fusotu${STRAIN}_abyss_rnd3.all.maker.map fusotu${STRAIN}_abyss_rnd3.all.maker.proteins.fasta
22 | 
23 | map_fasta_ids fusotu${STRAIN}_abyss_rnd3.all.maker.map fusotu${STRAIN}_abyss_rnd3.all.maker.transcripts.fasta
24 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/estimate_rate.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | #Script to estimate scale parameter (beta) for the MCMCTree rate prior
 3 | 
 4 | library(ape)
 5 | library(adephylo)
 6 | 
 7 | #Read in rooted species trees
 8 | raxmlng <- read.tree("../fus_proteins_62T.raxml.support_rooted")
 9 | iqtree <- read.tree("../../phylogenomics/species_tree/iqtree/fus_proteins_62T_iqtree_genepart.contree")
10 | iqtree <- root(iqtree, outgroup="Ilysp1_GeneCatalog_proteins_20121116", edgelabel=TRUE, resolve.root=TRUE)
11 | 
12 | #Calculate substitution rate prior
13 | #beta = (alpha x root-time) / mean tip to root distance
14 | print(mean(distRoot(iqtree, iqtree$tip.label, method="patristic")))
15 | print(mean(distRoot(raxmlng, raxmlng$tip.label, method="patristic")))
16 | print(paste0("IQ-TREE: ", 1 / mean(distRoot(iqtree, iqtree$tip.label, method="patristic"))))
17 | print(paste0("RAxML-NG: ", 1 / mean(distRoot(raxmlng, raxmlng$tip.label, method="patristic"))))
18 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/submit_CSEPprediction.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #Script to submit all programmes in the CSEP prediction pipeline
 3 | #Every job script is passed the protein directory and the sample list file as its two arguments
 4 | 
 5 | #Directory containing protein fasta files (MUST END IN FORWARD SLASH)
 6 | DIR=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/proteins/
 7 | #File listing the files to be run (names of ../proteins/*.faa with the directory prefix removed)
 8 | #NOTE(review): todo is written to the current directory but read via the absolute path below -
 9 | #this assumes the script is run from .../fus_comparison/CSEP_prediction; confirm
10 | ls ../proteins/*.faa | sed 's#\.\./proteins/##' > todo
11 | SAMPLES=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/CSEP_prediction/todo
12 | #Number of samples
13 | NUM=$(cat $SAMPLES | wc -l)
14 | 
15 | #Submit all jobs
16 | qsub phobius/phobius.sh $DIR $SAMPLES
17 | qsub prosite/ps_scan.sh $DIR $SAMPLES
18 | qsub predgpi/predgpi.sh $DIR $SAMPLES
19 | 
20 | #One temporary directory per sample under targetp/
21 | #-p so that re-running the script does not error on directories left by an earlier run
22 | for i in $(cat $SAMPLES)
23 | do
24 | 	mkdir -p targetp/tmp_${i}
25 | done
26 | 
27 | #targetp and effectorp are submitted as array jobs with one task per sample
28 | qsub -t 1-${NUM} targetp/targetp.sh $DIR $SAMPLES
29 | qsub tmhmm/tmhmm.sh $DIR $SAMPLES
30 | qsub -t 1-${NUM} effectorp/effectorp.sh $DIR $SAMPLES
31 | qsub nucpred/nucpred.sh $DIR $SAMPLES
32 | qsub signalp/signalp.sh $DIR $SAMPLES
33 | 


--------------------------------------------------------------------------------
/assembly/polishing/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 1 Assembly
 4 | 
 5 | ### 1.3 Polishing
 6 |  
 7 | 1. `qsub polish.sh` - for each strain and assembly tool, maps raw reads to assembly and calculates mapping statistics with [BWA-MEM](https://github.com/lh3/bwa) and [SAMtools](http://www.htslib.org/) and polishes the assembly with [Pilon](https://github.com/broadinstitute/pilon). Also removes sequences <200bp using [Seqtk](https://github.com/lh3/seqtk) for NCBI compliance.
 8 | 
 9 | After completing [4 Assessment](https://github.com/Rowena-h/FusariumLifestyles/tree/main/assembly/assessment) and uploading to NCBI:
10 | 
11 | 2. `./ncbi_filter.sh` - removes sequences identified as mitochondrial or duplicates by NCBI (listed in files saved as `duplicates_*` and `mito_remove_*` for each strain) and trims sequences identified as having mitochondrial contaminants (listed in files saved as `mito_trim_*.bed`) using [bedtools](https://bedtools.readthedocs.io/en/latest/).
12 | 


--------------------------------------------------------------------------------
/selection/label_trees.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | ##Script to write lifestyle-labelled copies of the gene trees for Contrast-FEL##
 3 | 
 4 | library(ape)
 5 | 
 6 | #Taxon metadata (the file2 and lifestyle columns are used below)
 7 | metadata <- read.csv("../metadata.csv")
 8 | 
 9 | #Rooted SC core gene trees to label
10 | for (tree.file in list.files("trees/", pattern=".bestTree_rooted$")) {
11 |   
12 |   tree <- read.tree(paste0("trees/", tree.file))
13 |   
14 |   #One labelled copy of the tree per non-missing lifestyle
15 |   for (j in na.omit(unique(metadata$lifestyle))) {
16 |     
17 |     #Positions in the tip labels of the taxa with this lifestyle
18 |     tips <- match(metadata$file2[which(metadata$lifestyle == j)], tree$tip.label)
19 |     
20 |     #Append the {lifestyle} tag to those tips only
21 |     tree.edit <- tree
22 |     tree.edit$tip.label[tips] <- paste0(tree.edit$tip.label[tips], "{lifestyle}")
23 |     
24 |     #Output file is the input name plus the lifestyle with its first space removed
25 |     write.tree(tree.edit, paste0("trees/", tree.file, ".", sub(" ", "", j)))
26 |     
27 |   }
28 |   
29 | }
30 | 


--------------------------------------------------------------------------------
/phylogenomics/species_tree/astral.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1		# Request 1 core
 4 | #$ -l h_rt=0:30:00 	# Request 30 minutes runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | #Run ASTRAL on the concatenated RAxML-NG gene trees of the gene_trees and gene_trees_bmge sets
 9 | 
10 | ASTRAL="java -jar /data/home/btx494/Programmes/Astral/astral.5.7.3.jar"
11 | 
12 | #Number of gene trees in ../gene_trees (the same count is used to name both input files)
13 | TREES=$(ls ../gene_trees/RAxML-NG/*bestTree | wc -l)
14 | INPUT=astral/${TREES}_fusortho_raxmlng_trees.tre
15 | INPUT_BMGE=astral/${TREES}_fusortho_bmge_raxmlng_trees.tre
16 | 
17 | #Gather the best trees of each set into a single file
18 | cat ../gene_trees/RAxML-NG/*bestTree > ${INPUT}
19 | cat ../gene_trees_bmge/RAxML-NG/*bestTree > ${INPUT_BMGE}
20 | 
21 | ${ASTRAL} -i ${INPUT} -o astral/fus_proteins_62T_astral.tre
22 | 
23 | ${ASTRAL} -i ${INPUT_BMGE} -o astral/fus_proteins_bmge_62T_astral.tre
24 | 


--------------------------------------------------------------------------------
/annotation/maker/round1/maker.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1	    # Request 1 core
 4 | #$ -l h_rt=120:00:0 # Request 120 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | #$ -t 1-5
 9 | 
10 | #Round 1 of MAKER for one strain per array task, then merging of the datastore outputs
11 | 
12 | #The strain is the line of ../../strains numbered by the array task ID
13 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../../strains)
14 | BASE=fusotu${STRAIN}_abyss_rnd1
15 | OUT=${BASE}.maker.output
16 | INDEX=${OUT}/${BASE}_master_datastore_index.log
17 | 
18 | module load maker
19 | 
20 | maker -base ${BASE} -RM_off fusotu${STRAIN}_abyss_round1_maker_opts.ctl
21 | 
22 | #Merged GFF, merged fasta files, and a second merged GFF written to the .noseq.gff file (-n)
23 | gff3_merge -s -d ${INDEX} > ${OUT}/${BASE}.all.maker.gff
24 | fasta_merge -d ${INDEX}
25 | gff3_merge -n -s -d ${INDEX} > ${OUT}/${BASE}.all.maker.noseq.gff
26 | 


--------------------------------------------------------------------------------
/phylogenomics/species_tree/astral-pro.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 4		# Request 4 cores
 4 | #$ -l h_rt=1:00:00 	# Request 1 hour runtime
 5 | #$ -l h_vmem=3G   	# Request 3GB RAM
 6 | #$ -j y
 7 | 
 8 | #Run ASTRAL-Pro on the OrthoFinder resolved gene trees
 9 | 
10 | #-p so that re-running the job does not error on an existing directory
11 | mkdir -p OF_gene_trees
12 | 
13 | cp ../../orthology_inference/OrthoFinder/Results_Oct22/Resolved_Gene_Trees/*_tree.txt OF_gene_trees
14 | 
15 | #Cut tip labels back to end at "proteins"/"protein" (everything up to the following colon is removed)
16 | find OF_gene_trees/ -type f -exec sed -i 's/proteins_[^:]*:/proteins:/g' {} \;
17 | find OF_gene_trees/ -type f -exec sed -i 's/protein_[^:]*:/protein:/g' {} \;
18 | #Make sure every tree file ends in a newline before the files are concatenated
19 | find OF_gene_trees/ -type f -exec sed -i -e '$a\' {} \;
20 | 
21 | #Number of gene trees, used in the name of the concatenated input file
22 | TREES=$(ls ../../orthology_inference/OrthoFinder/Results_Oct22/Resolved_Gene_Trees/*_tree.txt | wc -l)
23 | 
24 | cat OF_gene_trees/*_tree.txt > astral/${TREES}_fusortho_multicopy_OF_trees.tre
25 | 
26 | ~/Programmes/ASTER-master/bin/astral-pro 	-t ${NSLOTS} \
27 | 						-o astral/fus_proteins_62T_astralpro_multicopy.tre astral/${TREES}_fusortho_multicopy_OF_trees.tre
28 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/nucpred/nucpred.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd  		    # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1    	# Request 1 core
 4 | #$ -l h_rt=24:00:00 # Request 24 hours runtime
 5 | #$ -l h_vmem=5G     # Request 5GB RAM
 6 | #$ -j y
 7 | 
 8 | #Run NucPred on every sample and list the proteins scoring >= 0.8
 9 | #Arguments: $1 directory of protein fasta files (ending in /), $2 file listing the sample file names
10 | 
11 | DIR=$1
12 | SAMPLES=$(cat $2)
13 | OUT_DIR=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/CSEP_prediction/nucpred/
14 | 
15 | #Results are written in the NucPred directory and moved to OUT_DIR once checked
16 | cd /data/home/btx494/Programmes/nucpred-1.1/
17 | 
18 | for i in $SAMPLES
19 | do
20 | 	/data/home/btx494/Programmes/nucpred-1.1/nucpred-rh.pl ${DIR}${i} > ${i}_nucpred
21 | 	#Keep the first column of rows whose last column is at least 0.8
22 | 	awk '$NF>=0.8 {print $1}' ${i}_nucpred > ${i}_nucpred_list
23 | 
24 | 	#A run is complete when it has one result line per protein
25 | 	#NOTE(review): proteins are counted in orthology_inference/ rather than in ${DIR} - confirm both hold the same files
26 | 	PROTEINS=$(grep ">" /data/SBCS-BuggsLab/RowenaHill/fus_comparison/orthology_inference/${i} | wc -l)
27 | 	LENGTH=$(cat ${i}_nucpred | wc -l)
28 | 
29 | 	if [ "$PROTEINS" -ne "$LENGTH" ]
30 | 	then
31 | 		echo $i >> nucpred_failed
32 | 	else
33 | 		mv ${i}_nucpred* ${OUT_DIR}
34 | 	fi
35 | done
36 | 
37 | #The failure list only exists when at least one sample failed the check
38 | if [ -f nucpred_failed ]
39 | then
40 | 	mv nucpred_failed ${OUT_DIR}
41 | fi
42 | 


--------------------------------------------------------------------------------
/assembly/polishing/ncbi_filter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | module load bedtools
 4 | 
 5 | STRAINS=$(cat ../strains)
 6 | 
 7 | for STRAIN in $STRAINS
 8 | do
 9 | 
10 | 	awk '{print $1}' mito_remove_${STRAIN} > tmp
11 | 
12 | 	#Remove sequences flagged as mitochondrial contaminants by NCBI
13 | 	awk 'BEGIN{while((getline<"tmp")>0)l[">"$1]=1}/^>/{f=!l[$1]}f' fusotu${STRAIN}_abyss_pilon_filtered.fa > tmp1 && mv tmp1 fusotu${STRAIN}_abyss_pilon_filtered.fa
14 | 	rm tmp
15 | 
16 | 	#Trim regions flagged as mitochondrial contaminants by NCBI
17 | 	bedtools maskfasta -fi fusotu${STRAIN}_abyss_pilon_filtered.fa -bed mito_trim_${STRAIN}.bed -fo tmp.fa -mc X
18 | 	sed 's/X//g' tmp.fa > fusotu${STRAIN}_abyss_pilon_filtered.fa
19 | 	rm tmp.fa
20 | 
21 | 	if [ -f duplicates_${STRAIN} ]
22 | 	then
23 | 
24 | 		awk '{print $1}' duplicates_${STRAIN} > tmp		
25 | 
26 | 		#Remove sequences flagged as duplicates
27 | 		awk 'BEGIN{while((getline<"tmp")>0)l[">"$1]=1}/^>/{f=!l[$1]}f' fusotu${STRAIN}_abyss_pilon_filtered.fa > tmp1 && mv tmp1 fusotu${STRAIN}_abyss_pilon_filtered.fa
28 | 		rm tmp
29 | 	
30 | 	fi
31 | 
32 | done
33 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/maker3.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1	    	# Request 1 core
 4 | #$ -l h_rt=120:00:0 	# Request 120 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | #$ -t 1-5
 9 | 
10 | #Round 3 of MAKER for one strain per array task, then merging of the datastore outputs
11 | 
12 | #The strain is the line of ../../strains numbered by the array task ID
13 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../../strains)
14 | BASE=fusotu${STRAIN}_abyss_rnd3
15 | OUT=${BASE}.maker.output
16 | INDEX=${OUT}/${BASE}_master_datastore_index.log
17 | 
18 | module load maker
19 | export AUGUSTUS_CONFIG_PATH=/data/SBCS-BuggsLab/RowenaHill/genome_assemblies/augustus_config/config
20 | 
21 | maker -base ${BASE} -RM_off fusotu${STRAIN}_abyss_round3_maker_opts.ctl
22 | 
23 | #Merged GFF, merged fasta files, and a second merged GFF written to the .noseq.gff file (-n)
24 | gff3_merge -s -d ${INDEX} > ${OUT}/${BASE}.all.maker.gff
25 | fasta_merge -d ${INDEX}
26 | gff3_merge -n -s -d ${INDEX} > ${OUT}/${BASE}.all.maker.noseq.gff
27 | 


--------------------------------------------------------------------------------
/phylogenomics/RAxMLNG_genetrees.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 5            # Request 5 cores
 4 | #$ -l h_rt=1:00:00      # Request 1 hour runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | ORTHO=$(cat aln_list | sed 's/\.fa//' | sed -n ${SGE_TASK_ID}p)
 9 | 
10 | MODEL=$(grep ${ORTHO} species_tree/fus_proteins_62T_raxmlpartition.txt | sed 's/,.*$//')
11 | MODEL_BMGE=$(grep ${ORTHO} species_tree/fus_proteins_bmge_62T_raxmlpartition.txt | sed 's/,.*$//')
12 | 
13 | module load anaconda3
14 | conda activate raxml-ng
15 | 
16 | raxml-ng --search \
17 |          --msa gene_trees/${ORTHO}_aln_edit_trimmed.phy \
18 |          --model ${MODEL} \
19 |          --prefix gene_trees/RAxML-NG/${ORTHO} \
20 |          --seed 2 \
21 |          --threads ${NSLOTS}
22 | 
23 | raxml-ng --search \
24 |          --msa gene_trees_bmge/${ORTHO}_aln_edit_trimmed.phy \
25 |          --model ${MODEL_BMGE} \
26 |          --prefix gene_trees_bmge/RAxML-NG/${ORTHO} \
27 |          --seed 2 \
28 |          --threads ${NSLOTS}
29 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this, please cite it as below."
 3 | authors:
 4 | - family-names: "Hill"
 5 |   given-names: "Rowena"
 6 |   orcid: "https://orcid.org/0000-0002-1046-5528"
 7 | - family-names: "Buggs"
 8 |   given-names: "Richard J. A."
 9 | - family-names: "Vu"
10 |   given-names: "Dang Toan"
11 | - family-names: "Gaya"
12 |   given-names: "Ester"
13 | title: "Fusarium Lifestyles"
14 | url: "https://github.com/Rowena-h/FusariumLifestyles"
15 | preferred-citation:
16 |   type: article
17 |   authors:
18 |   - family-names: "Hill"
19 |     given-names: "Rowena"
20 |     orcid: "https://orcid.org/0000-0002-1046-5528"
21 |   - family-names: "Buggs"
22 |     given-names: "Richard J. A."
23 |   - family-names: "Vu"
24 |     given-names: "Dang Toan"
25 |   - family-names: "Gaya"
26 |     given-names: "Ester"
27 |   doi: "10.1093/molbev/msac085"
28 |   journal: "Molecular Biology and Evolution"
29 |   month: 4
30 |   start: "msac085" # First page number
31 |   title: "Lifestyle Transitions in Fusarioid Fungi are Frequent and Lack Clear Genomic Signatures"
32 |   issue: 4
33 |   volume: 39
34 |   year: 2022
35 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/phobius/phobius.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1    	    # Request 1 core
 4 | #$ -l h_rt=24:00:0      # Request 24 hours runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | DIR=$1
10 | SAMPLES=$(cat $2)
11 | RUN_DIR=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/CSEP_prediction
12 | 
13 | for i in $SAMPLES
14 | do
15 | 	/data/home/btx494/Programmes/phobius/phobius.pl ${DIR}${i} > ${RUN_DIR}/phobius/${i}_phobius
16 | 	#Filter for list of SPs: from each SIGNAL line and the line before it, keep lines containing ID and print their 2nd field
17 | 	grep -i -B 1 "SIGNAL" ${RUN_DIR}/phobius/${i}_phobius | grep "ID" | awk '{print $2}' > ${RUN_DIR}/phobius/${i}_phobius_SPlist
18 | 	#Filter for list of >1 TMs
19 | 	#Combine Phobius output into one line per gene (a new line is started at every line beginning with ID)
20 | 	awk 'NR==1{printf $0" ";next}{printf /^ID/ ? "\n"$0" " : $0}' ${RUN_DIR}/phobius/${i}_phobius > tmp
21 | 	#Only print rows with more than one TM: count TRANSMEM matches per line of tmp, turn the line numbers with a count above 1 into sed print commands, and print the 2nd field of those lines
22 | 	cat tmp | grep -o -n "TRANSMEM" | cut -d : -f 1 | uniq -c | awk '$1>1 {print $2}' | sed '/[^0-9]/d;s/.$/&p/' | sed -nf - tmp | awk '{print $2}' > ${RUN_DIR}/phobius/${i}_phobius_TMlist
23 | 	rm tmp
24 | done
25 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/predgpi/PredGPI.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | args=commandArgs(trailingOnly=TRUE)
 3 | 
 4 | #Test if there are two arguments: if not, return an error
 5 | if (length(args)<2) {
 6 |   stop("Two arguments must be supplied: the directory where the protein fasta files are located first and a file listing the sample names second", call.=FALSE)
 7 | } 
 8 | 
 9 | #Retrieve GPI anchor predictions from PredGPI (http://gpcr.biocomp.unibo.it/predgpi/) via ragp::get_pred_gpi
10 | 
11 | library(ragp)
12 | library(seqinr)
13 | #NOTE(review): the sample list is opened as "../" followed by args[2] - confirm the caller passes a path relative to the parent directory
14 | list <- read.csv(paste0("../", args[2]), header=FALSE)
15 | list <- list$V1
16 |   
17 | for (i in list) {
18 |   #Read in the protein set of each sample (args[1] is pasted directly in front of the file name)
19 |   tmp <- read.fasta(paste0(args[1], i), seqtype="AA", as.string=TRUE)
20 |   #Replace protein IDs with running numbers as IDs are limited to 30 characters; df keeps the original names
21 |   df <- data.frame(orig=names(tmp), replace=seq(length(names(tmp))))
22 |   names(tmp) <- df$replace
23 |   results <- get_pred_gpi(tmp, spec=0.99, progress=TRUE)
24 |   #Return original protein IDs (assumes results has one row per input protein, in input order - TODO confirm)
25 |   results$id <- df$orig
26 |   #List of GPI-anchored proteins, also stored in a variable named <sample>_gpilist
27 |   gpilist <- df$orig[results$is.gpi == TRUE]
28 |   assign(paste0(i,"_gpilist"), gpilist)
29 |   #Write file <sample>_predgpi_GPlist to the working directory
30 |   write(gpilist, file=paste0(i, "_predgpi_GPlist"))
31 | }
32 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/sortadate/sortadate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1            # Request 1 core
 4 | #$ -l h_rt=00:30:00     # Request 30 minutes runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | #Reroot the gene trees and species tree on the outgroup, then run the SortaDate scripts
 9 | 
10 | OUTGROUP=Ilysp1_GeneCatalog_proteins_20121116
11 | PHYLO_DIR=../../phylogenomics
12 | 
13 | mkdir ../trees
14 | 
15 | module load R/4.0.2
16 | 
17 | #Reroot gene trees for SortaDate (one per aln_list entry, .fa extension removed)
18 | for ORTHO in $(sed 's/\.fa//' ${PHYLO_DIR}/aln_list)
19 | do
20 | 	Rscript ../reroot.r ${PHYLO_DIR}/gene_trees/RAxML-NG/${ORTHO}.raxml.bestTree "${OUTGROUP}" ../trees/
21 | done
22 | 
23 | #Reroot species tree for SortaDate
24 | Rscript ../reroot.r ${PHYLO_DIR}/species_tree/raxml-ng/fus_proteins_62T.raxml.support "${OUTGROUP}" ../
25 | 
26 | module load anaconda3
27 | conda activate phyx
28 | 
29 | #Run SortaDate: each script writes the file named by --outf, which the next one reads
30 | python get_var_length.py ../trees/ --flend _rooted --outf var --outg ${OUTGROUP}
31 | 
32 | python get_bp_genetrees.py ../trees/ ../fus_proteins_62T.raxml.support_rooted --flend _rooted --outf bp
33 | 
34 | python combine_results.py var bp --outf comb
35 | 
36 | python get_good_genes.py comb --max 10 --outf dating_orthogroups
37 | 


--------------------------------------------------------------------------------
/selection/submit_hyphy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #Prepare labelled trees for the HyPhy analyses, then submit BUSTED, aBSREL and Contrast-FEL
 3 | #as array jobs with one task per alignment listed in ../phylogenomics/aln_list
 4 | 
 5 | ORTHO=$(cat ../phylogenomics/aln_list | sed 's/\.fa//')
 6 | 
 7 | #Number of samples
 8 | NUM_ORTHO=$(cat ../phylogenomics/aln_list | wc -l)
 9 | 
10 | #-p so that re-running the script does not error on existing result directories
11 | mkdir -p hyphy/busted hyphy/absrel hyphy/contrast-fel
12 | 
13 | for i in $ORTHO
14 | do
15 | 	#Label gene tree foreground branches for BUSTED and Contrast-FEL
16 | 	#{FOREGROUND} is inserted before every colon; the second expression then removes
17 | 	#the first label that follows the final comma on the line
18 | 	sed -r 's/:/\{FOREGROUND\}:/g; s/\{FOREGROUND\}([^,]*)$/\1/' trees/${i}.raxml.bestTree_rooted > trees/${i}.raxml.bestTree_rooted_hyphy
19 | done
20 | 
21 | #Label dated species tree foreground branches for aBSREL (the tree is line 610 of the MCMCTree output)
22 | sed -n 610p ../divergence_time_estimation/mcmctree/run1_independent/mcmctree_step2_out.txt | sed -r 's/:/\{FOREGROUND\}:/g; s/\{FOREGROUND\}([^,]*)$/\1/' > trees/dated_tree_absrel.tre
23 | #Replace every value of metadata column 11 found in the tree with the value of column 13
24 | awk -F',' '{print $13 "\t" $11}' ../metadata.csv > tmp
25 | #The tree file is given to sed directly: a <<< here-string is not available in POSIX sh
26 | sed $(awk '{print "-e s/"$2"/"$1"/"}' tmp) trees/dated_tree_absrel.tre > tmp2 && mv tmp2 trees/dated_tree_absrel.tre
27 | rm tmp
28 | 
29 | #Label trees for different lifestyles for Contrast-FEL
30 | module load R/4.0.2
31 | 
32 | Rscript label_trees.r
33 | 
34 | cd hyphy 
35 | 
36 | #Submit HyPhy programmes
37 | qsub -t 1-${NUM_ORTHO} BUSTED.sh
38 | qsub -t 1-${NUM_ORTHO} aBSREL.sh
39 | qsub -t 1-${NUM_ORTHO} Contrast-FEL.sh
40 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/blastp/blastp.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1		# Request 1 core
 4 | #$ -l h_rt=1:00:00 	# Request 1 hour runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | #BLAST the candidate effectors of one taxon (chosen by array task ID) against PHI-base
 9 | #and add the top hit of every protein to the CSEP table of that taxon
10 | 
11 | TAXON=$(ls ../../proteins/*.faa | sed 's#\.\./\.\./proteins/##' | sed 's/\.faa//' | sed -n ${SGE_TASK_ID}p)
12 | #Literal tab for join; $'\t' is not available in POSIX sh
13 | TAB=$(printf '\t')
14 | 
15 | module load seqtk
16 | 
17 | #Pull the sequences of the candidate effectors out of the proteome
18 | seqtk subseq ../../proteins/${TAXON}.faa ../${TAXON}.faa_candidate_effectors > ${TAXON}_cseps.faa
19 | 
20 | module load blast+
21 | 
22 | blastp \
23 | -query ${TAXON}_cseps.faa \
24 | -db phi-base_current.fas \
25 | -outfmt '6 qseqid sseqid evalue bitscore pident length' \
26 | -evalue 1e-25 \
27 | -out ${TAXON}_phi-b_blast \
28 | -num_threads ${NSLOTS}
29 | 
30 | #Filter for top bitscore result per gene
31 | #Sort on the bitscore column (4) in descending order and keep the first row seen per query;
32 | #awk must read the sorted stream, so it is not given the unsorted file as an argument
33 | sort -k4,4nr ${TAXON}_phi-b_blast | awk '!x[$1]++' | sort -k 1b,1  > ${TAXON}_phi-b_blast_tophits
34 | #Add data to CSEPs file
35 | join -a1 -a2 -t "${TAB}" -o 1.1 2.2 2.3 2.4 2.5 -1 1 -2 1 ../${TAXON}.faa_candidate_effectors ${TAXON}_phi-b_blast_tophits > ../${TAXON}.faa_cseps
36 | #Split the subject ID on # into separate tab-delimited columns
37 | awk -F "\t" '{gsub(/\#/,"\t",$2);print $0}' ../${TAXON}.faa_cseps | sed 's/ /\t/g' > ${TAXON}tmp && mv ${TAXON}tmp ../${TAXON}.faa_cseps
38 | 


--------------------------------------------------------------------------------
/selection/codon_optimisation/pull_ribosomes.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #Extract ribosomal protein sequences from the Fusgr1 annotation and submit blast.sh as an array job
 3 | #Search annotation for ribosomal proteins
 4 | grep "ribosomal protein" Fusgr1_GeneCatalog_proteins_20110524_IPR.tab > ribosomal_proteins.tsv
 5 | #Remove mitochondrial ones (lines containing "Mitochondrial")
 6 | sed -i '/Mitochondrial/d' ribosomal_proteins.tsv	
 7 | 
 8 | #Format ribosomal protein file for bedtools: columns 1, 8 and 9 with commas removed; the protein fasta headers are edited in place so that only the text between "jgi|Fusgr1|" and the next "|" remains
 9 | cut -f1,8,9 ribosomal_proteins.tsv > ribosomal_proteins.bed
10 | sed -i 's/,//g' ribosomal_proteins.bed
11 | sed -i -e 's/jgi|Fusgr1|//' Fusgr1_GeneCatalog_proteins_20110524.aa.fasta
12 | sed -i -e 's/|.*//' Fusgr1_GeneCatalog_proteins_20110524.aa.fasta
13 | 
14 | module load bedtools
15 | 
16 | #Pull sequences for ribosomal proteins
17 | bedtools getfasta -fi Fusgr1_GeneCatalog_proteins_20110524.aa.fasta -bed ribosomal_proteins.bed -fo ribosomal_proteins.fasta
18 | 
19 | #Format fasta for blasting: drop the ":..." suffix of each header, then swap the header for column 3 of the tsv (non-alphanumeric characters turned into underscores) wherever the ID is found in the headers lookup file
20 | sed -i 's/:.*//' ribosomal_proteins.fasta
21 | awk -F '\t' '{print $1 "\t" $3}' ribosomal_proteins.tsv | sed "s/[^a-zA-Z0-9]/_/g" | sed 's/_/\t/' > headers
22 | awk -F '\t' 'FNR==NR {f2[$1]=$2;next} $2 in f2 {$2=f2[$2]}1' headers FS='>' OFS='>' ribosomal_proteins.fasta > tmp && mv tmp ribosomal_proteins.fasta
23 | 
24 | rm headers
25 | #One array task per .faa file in ../../proteins
26 | NUM_TAXA=$(ls ../../proteins/*.faa | sed 's#\.\./\.\./proteins/##' | wc -l)
27 | 
28 | qsub -t 1-${NUM_TAXA} blast.sh
29 | 


--------------------------------------------------------------------------------
/phylogenomics/alignment.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1		# Request 1 core
 4 | #$ -l h_rt=00:30:00 	# Request 30 minutes runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y
 7 | #$ -o /dev/null
 8 | 
 9 | #Align one single copy orthogroup (chosen by array task ID) and trim it with trimAl and BMGE
10 | 
11 | ORTHO=$(sed 's/\.fa//' aln_list | sed -n ${SGE_TASK_ID}p)
12 | ORTHOFINDER_DIR=../orthology_inference/OrthoFinder/Results_Oct22/Single_Copy_Orthologue_Sequences
13 | 
14 | #Alignment with original sequence names, and a copy with shortened names
15 | ALN=gene_trees/${ORTHO}_aln.fa
16 | ALN_EDIT=gene_trees/${ORTHO}_aln_edit.fa
17 | BMGE="java -jar /data/home/btx494/Programmes/BMGE-1.12/BMGE.jar"
18 | 
19 | #Align single copy orthogroups
20 | module load mafft
21 | mafft ${ORTHOFINDER_DIR}/${ORTHO}.fa > ${ALN}
22 | 
23 | #Fix names (everything from ".faa" to the end of the line is removed)
24 | sed 's/.faa.*$//' ${ALN} > ${ALN_EDIT}
25 | 
26 | #Trim alignments: fasta output from the original names, phylip output from the shortened names
27 | module load anaconda3
28 | conda activate trimal
29 | 
30 | trimal -in ${ALN} -fasta -gappyout > gene_trees/${ORTHO}_alntrimmed.fa
31 | trimal -in ${ALN_EDIT} -phylip -gappyout > gene_trees/${ORTHO}_aln_edit_trimmed.phy
32 | 
33 | ${BMGE} -i ${ALN} -t AA -of gene_trees_bmge/${ORTHO}_alntrimmed.fa
34 | ${BMGE} -i ${ALN_EDIT} -t AA -o gene_trees_bmge/${ORTHO}_aln_edit_trimmed.phy
35 | 


--------------------------------------------------------------------------------
/phylogenomics/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 4 Phylogenomics
 4 | 
 5 | 1. `./submit_alignment.sh` - submits `alignment.sh` for alignment of single copy orthogroups from OrthoFinder with [MAFFT](https://mafft.cbrc.jp/alignment/software/) followed by trimming with [BMGE](https://bmcecolevol.biomedcentral.com/articles/10.1186/1471-2148-10-210) and [trimAl](http://trimal.cgenomics.org/).
 6 | 2. `./concat.sh` - concatenate single copy orthogroup alignments and prepare partition files using [AMAS](https://github.com/marekborowiec/AMAS).
 7 | 3. `./submit_modeltestng.sh` - submits `modeltest-ng/modeltestng.sh` to run [ModelTest-NG](https://github.com/ddarriba/modeltest) on all single copy orthogroups (in computationally tractable chunks).
 8 | 4. `./submit_speciestrees_concatenation.sh` - submits concatenation-based species tree methods - `species_tree/raxmlng.sh` ([RAxML-NG](https://github.com/amkozlov/raxml-ng)) and `species_tree/iqtree.sh` ([IQ-TREE](https://github.com/iqtree/iqtree2)).
 9 | 5. `./submit_RAxML-NG_genetrees.sh` - submits `RAxMLNG_genetrees.sh` to run RAxML-NG for individual gene trees.
10 | 6. `./submit_speciestrees_coalescent.sh` - submits coalescent-based species tree methods - `species_tree/astral.sh` ([ASTRAL-III](https://github.com/smirarab/ASTRAL)) and `species_tree/astral-pro.sh` ([ASTRAL-Pro](https://github.com/chaoszhang/A-pro)) using gene trees.
11 | 


--------------------------------------------------------------------------------
/lifestyle_comparison/lifestyle-test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1            # Request 1 core
 4 | #$ -l h_rt=00:30:00     # Request 30 minutes runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | module load anaconda3
 9 | conda activate lifestyle-test
10 | 
11 | module load R/3.6.1
12 | 
13 | mkdir CSEPs CAZymes orthogroups
14 | 
15 | python run_edited.py 	-i lifestyle-test-orthogroups.csv \
16 |                         -t species_tree_ingroup.tre \
17 |                         --colors endophyte:#009E73,insectmutualist:#56B4E9,plantpathogen:#696969,saprotroph:#0072B2,plantassociate:#9AE324,mycoparasite:#D55E00 \
18 |                         -o orthogroups
19 | 
20 | python run_edited.py  	-i lifestyle-test-CSEP.csv \
21 |                         -t species_tree_ingroup.tre \
22 |                         --colors endophyte:#009E73,insectmutualist:#56B4E9,plantpathogen:#696969,saprotroph:#0072B2,plantassociate:#9AE324,mycoparasite:#D55E00 \
23 |                         -o CSEPs
24 | 
25 | python run_edited.py    -i lifestyle-test-CAZyme.csv \
26 |                         -t species_tree_ingroup.tre \
27 |                         --colors endophyte:#009E73,insectmutualist:#56B4E9,plantpathogen:#696969,saprotroph:#0072B2,plantassociate:#9AE324,mycoparasite:#D55E00 \
28 |                         -o CAZymes
29 | 


--------------------------------------------------------------------------------
/selection/pal2nal.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1 		# Request 1 core
 4 | #$ -l h_rt=12:00:00 # Request 12 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | #Build a codon alignment with PAL2NAL for one orthogroup (chosen by array task ID) and reroot its gene tree
 9 | 
10 | ORTHO=$(ls -1 ../phylogenomics/gene_trees/*_aln.fa | sed 's#\.\./phylogenomics/gene_trees/##' | sed 's/_aln\.fa//' | sed -n ${SGE_TASK_ID}p)
11 | 
12 | #Remove jgi prefixes
13 | sed 's/jgi|*|.*|//' ../phylogenomics/gene_trees/${ORTHO}_aln.fa > alignments/${ORTHO}_aln.fa
14 | 
15 | #Make list of sequences
16 | grep ">" alignments/${ORTHO}_aln.fa | sed 's/>//g' > alignments/${ORTHO}_seqlist
17 | 
18 | module load anaconda3
19 | conda activate biopython
20 | 
21 | #Get nucleotides for genbank proteins (writes alignments/${ORTHO}_seqlist_nucl.fa)
22 | python pull_nucleotides.py alignments/${ORTHO}_seqlist
23 | 
24 | #Get nucleotides for own proteins (appended to the file written by the previous step)
25 | awk -F '>' 'NR==FNR{ids[$0]; next} NF>1{f=($2 in ids)} f' alignments/${ORTHO}_seqlist gbff_files/own_fus_transcripts.fa >> alignments/${ORTHO}_seqlist_nucl.fa
26 | 
27 | perl /data/home/btx494/Programmes/pal2nal.v14/pal2nal.pl alignments/${ORTHO}_aln.fa alignments/${ORTHO}_seqlist_nucl.fa -output fasta -nogap > alignments/codon/${ORTHO}_aln_nuc.fa
28 | 
29 | #Remove protein name
30 | sed -i 's/\.faa.*//g' alignments/codon/${ORTHO}_aln_nuc.fa
31 | 
32 | #Add orthogroup to list for manual checking when the codon alignment is missing or empty
33 | #(POSIX [ ] test rather than [[ ]], which is not available in every /bin/sh)
34 | if [ ! -s alignments/codon/${ORTHO}_aln_nuc.fa ]
35 | then
36 | 	echo ${ORTHO} >> pal2nal_check
37 | fi
38 | 
39 | module load R/4.0.2
40 | 
41 | #Reroot gene trees
42 | Rscript reroot.r ../phylogenomics/gene_trees/RAxML-NG/${ORTHO}.raxml.bestTree "Ilysp1_GeneCatalog_proteins_20121116" trees/
43 | 


--------------------------------------------------------------------------------
/selection/pull_nucleotides.py:
--------------------------------------------------------------------------------
 1 | ##Script to pull corresponding nucleotides from GBFF files
 2 | 
 3 | from Bio import SeqIO
 4 | import sys
 5 | 
 6 | def get_cds_feature_with_qualifier_value(record, name, value):
 7 |     """Function to look for CDS feature by annotation value in sequence record."""
 8 |     # Loop over the records and features
 9 |     for feature in record.features:
10 |         if feature.type == "CDS" and value in feature.qualifiers.get(name, []):
11 |     	    return feature
12 | 
13 |     # Could not find it
14 |     return None
15 | 
16 | with open(sys.argv[1], "r") as file:
17 |     proteins_id = file.read().splitlines()
18 | 
19 | with open(sys.argv[1] + "_nucl.fa", "w") as nt_output:
20 |     for unit in proteins_id:
21 | 
22 |         sample = unit.partition(".faa_")[-3] + ".faa_"
23 |         protein = unit.partition(".faa_")[-1]
24 |         genome_records = SeqIO.parse("gbff_files/concat.gbff", "genbank")	
25 |         print("Looking at " + protein)
26 |         found_feature = False
27 |         for record in genome_records:
28 |             cds_feature = get_cds_feature_with_qualifier_value(record, "protein_id", protein)
29 |             if cds_feature is None:
30 |                 continue # feature not found in this record - try the next record
31 | 	
32 |             found_feature = True
33 |             
34 |             gene_sequence = cds_feature.extract(record.seq)
35 |              
36 |             # Output FASTA records
37 |             nt_output.write(">%s%s\n%s\n" % (sample, protein, gene_sequence))
38 | 
39 |         if not found_feature:
40 |             print("Error: could not find feature")
41 | 
42 | print("Done")
43 | 


--------------------------------------------------------------------------------
/annotation/maker/round2/maker2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1	    	# Request 1 core
 4 | #$ -l h_rt=120:00:0 	# Request 120 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | #$ -t 1-5
 9 | 
10 | #Round 2 of MAKER for one strain per array task, then merging of the datastore outputs
11 | 
12 | #The strain is the line of ../../strains numbered by the array task ID
13 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../../strains)
14 | #Common prefix of the merged round 1 files
15 | RND1=../round1/fusotu${STRAIN}_abyss_rnd1.maker.output/fusotu${STRAIN}_abyss_rnd1.all.maker
16 | BASE=fusotu${STRAIN}_abyss_rnd2
17 | OUT=${BASE}.maker.output
18 | INDEX=${OUT}/${BASE}_master_datastore_index.log
19 | 
20 | module load maker
21 | export AUGUSTUS_CONFIG_PATH=/data/SBCS-BuggsLab/RowenaHill/genome_assemblies/augustus_config/config
22 | 
23 | #Split the round 1 GFF by source (column 2) into transcript and protein alignments
24 | awk '$2 == "est2genome"' ${RND1}.noseq.gff > ${RND1}.est2genome.gff
25 | awk '$2 == "protein2genome"' ${RND1}.noseq.gff > ${RND1}.protein2genome.gff
26 | 
27 | maker -base ${BASE} -RM_off fusotu${STRAIN}_abyss_round2_maker_opts.ctl
28 | 
29 | #Merged GFF, merged fasta files, and a second merged GFF written to the .noseq.gff file (-n)
30 | gff3_merge -s -d ${INDEX} > ${OUT}/${BASE}.all.maker.gff
31 | fasta_merge -d ${INDEX}
32 | gff3_merge -n -s -d ${INDEX} > ${OUT}/${BASE}.all.maker.noseq.gff
33 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/README.md:
--------------------------------------------------------------------------------
 1 | # *Fusarium* Lifestyles
 2 | 
 3 | ## 6 CSEP & CAZyme prediction
 4 | 
 5 | 1. `./submit_CSEPprediction.sh` - submits all programmes in the CSEP prediction pipeline - `signalp/signalp.sh` ([SignalP](https://services.healthtech.dtu.dk/service.php?SignalP-5.0)), `targetp/targetp.sh` ([TargetP](https://services.healthtech.dtu.dk/service.php?TargetP-2.0)), `phobius/phobius.sh` ([Phobius](https://phobius.sbc.su.se/instructions.html)), `tmhmm/tmhmm.sh` ([TMHMM](https://services.healthtech.dtu.dk/service.php?TMHMM-2.0)), `prosite/ps_scan.sh` ([ps_scan](https://prosite.expasy.org/scanprosite/)), `nucpred/nucpred.sh` ([NucPred](https://nucpred.bioinfo.se/nucpred/)), `predgpi/predgpi.sh` which in turn submits `predgpi/PredGPI.r` to use the R package [ragp](https://rdrr.io/github/missuse/ragp/man/get_pred_gpi.html) ([PredGPI](http://gpcr.biocomp.unibo.it/predgpi/)) and  `effectorp/effectorp.sh` ([EffectorP](https://github.com/JanaSperschneider/EffectorP-3.0)).
 6 | 2. `./submit_CSEPfilter.sh` - submits `CSEPfilter` to produce lists of CSEPs from all programme results.
 7 | 3. `./submit_CSEPblast.sh` - submits `blastp/blastp.sh` to BLAST the CSEPs against the [PHI-base database](http://www.phi-base.org/) (requires `phi-base_current.csv` and `phi-base_current.fas` to be downloaded from [here](http://www.phi-base.org/downloadLink.htm) into the `blastp` directory).
 8 | 4. `./submit_CAZymeprediction.sh` - submits `run_dbcan/run_dbcan.sh` to run [run_dbcan](https://github.com/linnabrown/run_dbcan). 
 9 | 5. `qsub submit_orthogroupparsing.sh` - submits `orthogroup_parser.r` to make abundance matrices of orthogroups for all strains and categorises whether they are CSEPs/CAZymes and core/accessory/specific.
10 | 


--------------------------------------------------------------------------------
/phylogenomics/concat.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | module load anaconda3
 4 | conda activate AMAS
 5 | 
 6 | #Concatenate gene alignments
 7 | AMAS.py concat -f phylip-int -d aa -i gene_trees/*_aln_edit_trimmed.phy -p fus_proteins_62T_partition.txt -t fus_proteins_62T_concat.phy -u phylip
 8 | AMAS.py concat -f phylip-int -d aa -i gene_trees_bmge/*_aln_edit_trimmed.phy -p fus_proteins_bmge_62T_partition.txt -t fus_proteins_bmge_62T_concat.phy -u phylip
 9 | 
10 | #Fix format
11 | sed -i 's/^/PROT, /' fus_proteins_62T_partition.txt
12 | sed -i 's/^/PROT, /' fus_proteins_bmge_62T_partition.txt
13 | 
14 | #Split partitions for ModelTest-NG
15 | split -l 100 --numeric-suffixes=1 fus_proteins_62T_partition.txt fus_proteins_62T_partition.num
16 | split -l 100 --numeric-suffixes=1 fus_proteins_bmge_62T_partition.txt fus_proteins_bmge_62T_partition.num
17 | rename num0 num *.num*
18 | 
19 | #Prepare partitioned ModelTest-NG folders (one folder per split partition file, each with both concatenated alignments)
20 | NUM=$(ls fus_proteins_62T_partition.num* | wc -l)
21 | 
22 | for i in $(seq 1 $NUM)
23 | do
24 | 	mkdir -p modeltest-ng/partition.${i}	#-p: create modeltest-ng if absent and do not fail when the script is rerun
25 | 	mv fus_proteins_62T_partition.num${i} modeltest-ng/partition.${i}
26 | 	mv fus_proteins_bmge_62T_partition.num${i} modeltest-ng/partition.${i}
27 | 	cp fus_proteins_62T_concat.phy modeltest-ng/partition.${i}
28 | 	cp fus_proteins_bmge_62T_concat.phy modeltest-ng/partition.${i}
29 | done
30 | 
31 | mkdir -p species_tree
32 | 
33 | #Prepare IQ-TREE partition file: move the partition files (renamed) and the concatenated alignments into species_tree/
34 | mv fus_proteins_62T_partition.txt species_tree/fus_proteins_62T_iqtreepartition.txt
35 | mv fus_proteins_bmge_62T_partition.txt species_tree/fus_proteins_bmge_62T_iqtreepartition.txt
36 | mv fus_proteins_62T_concat.phy species_tree/
37 | mv fus_proteins_bmge_62T_concat.phy species_tree/
38 | 
39 | conda deactivate
40 | 


--------------------------------------------------------------------------------
/annotation/maker/round2/training_snap/snap.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1	    # Request 1 core
 4 | #$ -l h_rt=12:00:0 	# Request 12 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | #$ -t 1-5
 9 | 
10 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../../../strains)
11 | 
12 | module load anaconda3
13 | conda activate snap
14 | module load maker
15 | 
16 | mkdir ${STRAIN}
17 | cd ${STRAIN}	
18 | 
19 | # export 'confident' gene models from MAKER and rename to something meaningful
20 | maker2zff -x 0.25 -l 50 -d ../../../round1/fusotu${STRAIN}_abyss_rnd1.maker.output/fusotu${STRAIN}_abyss_rnd1_master_datastore_index.log 
21 | rename genome fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25 *
22 | 
23 | # gather some stats and validate
24 | fathom fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25.ann fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25.dna -gene-stats > gene-stats.log 2>&1 
25 | fathom fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25.ann fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25.dna -validate > validate.log 2>&1
26 | 
27 | # collect the training sequences and annotations, plus 1000 surrounding bp for training
28 | fathom fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25.ann fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25.dna -categorize 1000 > categorise.log 2>&1
29 | fathom uni.ann uni.dna -export 1000 -plus > uni-plus.log 2>&1
30 | 	
31 | # create the training parameters
32 | mkdir params
33 | cd params
34 | forge ../export.ann ../export.dna > ../forge.log 2>&1
35 | cd ..
36 | 	
37 | # assemble the HMM
38 | hmm-assembler.pl fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25 params > fusotu${STRAIN}_abyss_rnd1.zff.length50_aed0.25.hmm
39 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/training_snap2/snap2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 1	    # Request 1 core    
 4 | #$ -l h_rt=12:00:0 	# Request 12 hours runtime
 5 | #$ -l h_vmem=1G   	# Request 1GB RAM
 6 | #$ -m bea
 7 | #$ -j y
 8 | #$ -t 1-5
 9 | 
10 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../../../strains)
11 | 
12 | module load anaconda3
13 | conda activate snap
14 | module load maker
15 | 
16 | mkdir ${STRAIN}
17 | cd ${STRAIN}	
18 | 
19 | # export 'confident' gene models from the round 2 MAKER run (three levels up from the per-strain directory entered above) and rename to something meaningful
20 | maker2zff -x 0.25 -l 50 -d ../../../round2/fusotu${STRAIN}_abyss_rnd2.maker.output/fusotu${STRAIN}_abyss_rnd2_master_datastore_index.log
21 | rename genome fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25 *
22 | 
23 | # gather some stats and validate
24 | fathom fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25.ann fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25.dna -gene-stats > gene-stats.log 2>&1 
25 | fathom fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25.ann fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25.dna -validate > validate.log 2>&1
26 | 
27 | # collect the training sequences and annotations, plus 1000 surrounding bp for training
28 | fathom fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25.ann fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25.dna -categorize 1000 > categorise.log 2>&1
29 | fathom uni.ann uni.dna -export 1000 -plus > uni-plus.log 2>&1
30 | 	
31 | # create the training parameters
32 | mkdir params
33 | cd params
34 | forge ../export.ann ../export.dna > ../forge.log 2>&1
35 | cd ..
36 | 	
37 | # assemble the HMM
38 | hmm-assembler.pl fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25 params > fusotu${STRAIN}_abyss_rnd2.zff.length50_aed0.25.hmm
39 | 


--------------------------------------------------------------------------------
/divergence_time_estimation/mcmctree/mcmctree_dating_step1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 1            # Request 1 core
 4 | #$ -l h_rt=01:00:00     # Request 1 hour runtime
 5 | #$ -l h_vmem=1G         # Request 1GB RAM
 6 | #$ -j y
 7 | 
 8 | ORTHOS=$(tail -n +2 ../sortadate/dating_orthogroups | sed 's/^.*OG000/OG000/' | sed 's/\.raxml\.bestTree_rooted.*//' | sed 's#^#\.\./\.\./phylogenomics/gene_trees/#' | sed 's/$/_aln_edit_trimmed\.phy/')
 9 | 
10 | module load anaconda3
11 | conda activate AMAS
12 | 
13 | #Concatenate top ten clock-like genes according to SortaDate
14 | AMAS.py concat -f phylip-int -d aa -i ${ORTHOS} -p fus_proteins_dating10_partition.txt -t fus_proteins_dating10_mcmctree.phy -u phylip
15 | 
16 | sed -i 's/[[:space:]]/  /' fus_proteins_dating10_mcmctree.phy
17 | sed -e 's/^\(.\{13\}\).*\( .*\)$/\1 \2/' fus_proteins_dating10_mcmctree.phy > fus_proteins_dating10_mcmctree_short.phy
18 | 
19 | module load R/4.0.2
20 | 
21 | #Format rooted species tree for MCMCTree
22 | Rscript blank_topology.r ../fus_proteins_62T.raxml.support_rooted ./
23 | 
24 | sed -i '1s/^/62 1\n/' fus_proteins_62T.raxml.support_rooted_blank
25 | #sed -i 's/Root//' fus_proteins_62T.raxml.support_rooted_blank
26 | sed "s/;/\'>0.9<1.35\';/" fus_proteins_62T.raxml.support_rooted_blank > fus_proteins_62T.raxml.support_dating
27 | sed -i "s/GCA_013266205)))/GCA_013266205)))'>0.5<0.9'/" fus_proteins_62T.raxml.support_dating
28 | 
29 | module load anaconda3
30 | conda activate paml
31 | 
32 | #Run MCMCTree
33 | mcmctree mcmctree_step1.ctl
34 | #Edit tmp0001.ctl before rerunning it with codeml: set the amino acid rate file and model, then append the alpha/ncatG settings
35 | sed -i 's/aaRatefile =/aaRatefile = wag.dat/' tmp0001.ctl
36 | sed -i 's/model = 0/model = 2/' tmp0001.ctl
37 | printf 'fix_alpha = 0\nalpha = 0.5\nncatG = 4\n' >> tmp0001.ctl
38 | 
39 | codeml tmp0001.ctl
40 | #Rename the rst2 output to in.BV (presumably the input expected by the next MCMCTree step - confirm against the step 2 script)
41 | mv rst2 in.BV
42 | 


--------------------------------------------------------------------------------
/assembly/polishing/polish.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd           	# Set the working directory for the job to the current directory
 3 | #$ -pe smp 4		# Request 4 cores
 4 | #$ -l h_rt=48:0:0 	# Request 48 hours runtime
 5 | #$ -l h_vmem=5G   	# Request 5GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | #$ -t 1-5
 9 | 
10 | STRAIN=$(sed -n ${SGE_TASK_ID}p ../strains)
11 | 
12 | module load bwa
13 | module load samtools
14 | module load seqtk
15 | 
16 | for ASSEMBLER in abyss megahit spades
17 | do
18 | 	#Index assembly
19 | 	bwa index ../denovo_assembly/${ASSEMBLER}/fusotu${STRAIN}/fusotu${STRAIN}-contigs.fa
20 | 	#Align reads to the assembly indexed above (same ../denovo_assembly path), coordinate sort and mark duplicates
21 | 	bwa mem ../denovo_assembly/${ASSEMBLER}/fusotu${STRAIN}/fusotu${STRAIN}-contigs.fa \
22 | 	../reads/FUS_OTU${STRAIN}_1_trimmedpaired.fastq.gz \
23 | 	../reads/FUS_OTU${STRAIN}_2_trimmedpaired.fastq.gz \
24 | 	-t ${NSLOTS} | \
25 | 	samtools fixmate -m -@ ${NSLOTS} - - | \
26 | 	samtools sort -@ ${NSLOTS} - | \
27 | 	samtools markdup -@ ${NSLOTS} - fusotu${STRAIN}_${ASSEMBLER}_mapped_coordinatesorted.bam
28 | 	
29 | 	#Calculate mapping statistics
30 | 	samtools flagstat fusotu${STRAIN}_${ASSEMBLER}_mapped_coordinatesorted.bam > fusotu${STRAIN}_${ASSEMBLER}_mapstats
31 | 	
32 | 	#Index BAM for polishing
33 | 	samtools index fusotu${STRAIN}_${ASSEMBLER}_mapped_coordinatesorted.bam
34 | 
35 | 	#Polish with pilon using the mapped reads
36 | 	java -jar /data/home/btx494/pilon/pilon-1.23.jar --genome ../denovo_assembly/${ASSEMBLER}/fusotu${STRAIN}/fusotu${STRAIN}-contigs.fa --frags fusotu${STRAIN}_${ASSEMBLER}_mapped_coordinatesorted.bam --output fusotu${STRAIN}_${ASSEMBLER}_pilon --changes --fix all --threads ${NSLOTS}
37 | 
38 | 	#Remove contigs <200bp (to be NCBI compliant)
39 | 	seqtk seq -L 200 fusotu${STRAIN}_${ASSEMBLER}_pilon.fasta > fusotu${STRAIN}_${ASSEMBLER}_pilon_filtered.fa
40 | done
41 | 
42 | mkdir fusotu${STRAIN}
43 | mv fusotu${STRAIN}*.bam* fusotu${STRAIN}
44 | 


--------------------------------------------------------------------------------
/lifestyle_comparison/lifestyle_v_phylogeny.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | ##Script to produce input files for lifestyle test:
 3 | ##https://github.com/fantin-mesny/Effect-Of-Biological-Categories-On-Genomes-Composition
 4 | 
 5 | library(ape)
 6 | library(MCMCtreeR)
 7 | library(dplyr)
 8 | 
 9 | args <- commandArgs(trailingOnly=TRUE)
10 | 
11 | #Test if there is one argument: if not, return an error
12 | if (length(args) != 1) {
13 |   stop("One argument must be supplied: the orthology parsing results RData file to be used", call.=FALSE)
14 | } 
15 | 
16 | #Load orthogroup presence absence matrices
17 | load(args[1])
18 | 
19 | ##LIFESTYLE VERSUS PHYLOGENY TEST
20 | 
21 | metadata <- read.csv("../metadata.csv")
22 | 
23 | #Read in dated tree
24 | phy <- readMCMCtree("../divergence_time_estimation/mcmctree/run1_independent/FigTree.tre", 
25 | 		    forceUltrametric=TRUE)$apePhy
26 | #Remove outgroup from tree and write to file
27 | outgroup <- "Ilysp1_GeneCa"
28 | phy.ingroup <- drop.tip(phy, outgroup)
29 | write.tree(phy.ingroup, "species_tree_ingroup.tre")
30 | 
31 | for (i in c("orthogroups", "CSEP", "CAZyme")) {
32 |   
33 |   #Transpose count dataframe (excluding outgroup)
34 |   lifestyle.test <- as.data.frame(t(get(paste0(i, ".count.ingroup1"))))
35 |   #Add column with names
36 |   lifestyle.test$genome <- rownames(lifestyle.test)
37 |   #Add column with lifestyle
38 |   lifestyle.test$lifestyle <- gsub(" ", "", metadata$lifestyle[match(rownames(lifestyle.test), metadata$file2)])
39 |   #Replace names to match dated tree labels
40 |   lifestyle.test$genome <- metadata$short.tip[match(lifestyle.test$genome, metadata$file2)]
41 |   #Reorder columns
42 |   lifestyle.test <- lifestyle.test %>% select(genome, lifestyle, everything())
43 |   
44 |   #Write to file
45 |   write.csv(lifestyle.test, paste0("lifestyle-test-", i, ".csv"), row.names=FALSE, quote=FALSE)
46 |   
47 | }
48 | 
49 | #Submit test
50 | system("qsub lifestyle-test.sh")
51 | 


--------------------------------------------------------------------------------
/orthology_inference/ncbi_ftp_links.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | ##Script to download ftp links for Fusarium strains with annotated proteins from NCBI##
 3 | 
 4 | #Download and read in ncbi genome data (< 3 MB file)
 5 | download.file("ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt", destfile=paste0(Sys.Date(), "_eukaryotes.txt"))
 6 | ncbi <- read.csv(paste0(Sys.Date(), "_eukaryotes.txt"), header=TRUE, sep="\t")
 7 | #Filter for assemblies with annotated proteins
 8 | ncbi <- ncbi[ncbi$Proteins != "-",]
 9 | #Filter for Fusarium taxa
10 | ncbi.filtered <- ncbi[grep("Fusarium", ncbi$X.Organism.Name),]
11 | ncbi.filtered <- ncbi.filtered[order(ncbi.filtered$X.Organism.Name),]
12 | #Select taxa
13 | ncbi.filtered <- ncbi.filtered[c(1:19, 34:36, 42, 44:54, 62, 65, 70, 76, 83, 84, 87:90, 92:96, 100:104, 108:109),]
14 | 
15 | #Download and read in file with ftp links to assemblies (< 300 MB file)
16 | download.file("ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt", destfile=paste0(Sys.Date(), "_assembly_summary_genbank.txt"))
17 | assembly.sum <- read.csv(paste0(Sys.Date(), "_assembly_summary_genbank.txt"), skip=1, header=TRUE, sep="\t", quote="")
18 | #Match to ncbi data
19 | assembly.sum <- assembly.sum[match(ncbi.filtered$Assembly.Accession, assembly.sum$X..assembly_accession),]
20 | 
21 | #Get FTP links and write to file
22 | genomic.ftp <- paste0(assembly.sum$ftp_path, "/", assembly.sum$X..assembly_accession, "_", assembly.sum$asm_name, "_genomic.gbff.gz")
23 | genomic.ftp <- gsub(" ", "_", genomic.ftp)
24 | write(genomic.ftp, file="../selection/gbff_files/fus_ncbi_genomic")
25 | 
26 | protein.ftp <- paste0(assembly.sum$ftp_path, "/", assembly.sum$X..assembly_accession, "_", assembly.sum$asm_name, "_protein.faa.gz")
27 | protein.ftp <- gsub(" ", "_", protein.ftp)
28 | write(protein.ftp, file="fus_ncbi_proteins")
29 | 
30 | #Write file with metadata
31 | write.csv(ncbi.filtered, "ncbi_metadata.csv")
32 | 


--------------------------------------------------------------------------------
/phylogenomics/species_tree/raxmlng.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #$ -cwd                 # Set the working directory for the job to the current directory
 3 | #$ -pe smp 20           # Request 20 cores
 4 | #$ -l h_rt=240:00:00    # Request max hours runtime
 5 | #$ -l h_vmem=2G         # Request 2GB RAM
 6 | #$ -j y
 7 | #$ -m bea
 8 | 
 9 | mkdir -p raxml-ng
10 | 
11 | #Combine ModelTest-NG partition results (count the partition.* folders, then concatenate their .part.aic files in numeric order)
12 | NUM=$(ls -d ../modeltest-ng/partition.* | wc -l)
13 | 
14 | rm -f fus_proteins_62T_raxmlpartition.txt
15 | rm -f fus_proteins_bmge_62T_raxmlpartition.txt
16 | 
17 | for i in $(seq 1 $NUM)
18 | do
19 | 	cat ../modeltest-ng/partition.${i}/fus_proteins_62T_concat.phy.part.aic >> fus_proteins_62T_raxmlpartition.txt
20 | 	cat ../modeltest-ng/partition.${i}/fus_proteins_bmge_62T_concat.phy.part.aic >> fus_proteins_bmge_62T_raxmlpartition.txt
21 | done
22 | 
23 | module load anaconda3
24 | conda activate raxml-ng
25 | 
26 | raxml-ng --parse \
27 |          --msa fus_proteins_62T_concat.phy \
28 |          --model fus_proteins_62T_raxmlpartition.txt \
29 |          --prefix raxml-ng/fus_proteins_62T
30 | 
31 | raxml-ng --all \
32 |          --msa fus_proteins_62T_concat.phy \
33 |          --model fus_proteins_62T_raxmlpartition.txt \
34 |          --prefix raxml-ng/fus_proteins_62T \
35 |          --seed 2 \
36 |          --threads ${NSLOTS} \
37 |          --bs-trees 100
38 | 
39 | raxml-ng --bsconverge \
40 |          --bs-trees raxml-ng/fus_proteins_62T.raxml.bootstraps \
41 |          --prefix raxml-ng/fus_proteins_62T_convergence_test \
42 |          --seed 2
43 | 
44 | raxml-ng --parse \
45 |          --msa fus_proteins_bmge_62T_concat.phy \
46 |          --model fus_proteins_bmge_62T_raxmlpartition.txt \
47 |          --prefix raxml-ng/fus_proteins_bmge_62T
48 | 
49 | raxml-ng --all \
50 |          --msa fus_proteins_bmge_62T_concat.phy \
51 |          --model fus_proteins_bmge_62T_raxmlpartition.txt \
52 |          --prefix raxml-ng/fus_proteins_bmge_62T \
53 |          --seed 2 \
54 |          --threads ${NSLOTS} \
55 |          --bs-trees 100
56 | 
57 | raxml-ng --bsconverge \
58 |          --bs-trees raxml-ng/fus_proteins_bmge_62T.raxml.bootstraps \
59 |          --prefix raxml-ng/fus_proteins_bmge_62T_convergence_test \
60 |          --seed 2
61 | 


--------------------------------------------------------------------------------
/cazyme_substrates.csv:
--------------------------------------------------------------------------------
  1 | Substrate,CAZy.Family
  2 | Cellulose,AA16
  3 | Cellulose,AA3
  4 | Cellulose,AA3_1
  5 | Cellulose,AA9
  6 | Cellulose,GH1
  7 | Cellulose,GH12
  8 | Cellulose,GH131
  9 | Cellulose,GH141
 10 | Cellulose,GH29
 11 | Cellulose,GH3
 12 | Cellulose,GH3_4
 13 | Cellulose,GH30_1
 14 | Cellulose,GH32
 15 | Cellulose,GH45
 16 | Cellulose,GH48
 17 | Cellulose,GH5
 18 | Cellulose,GH5_1
 19 | Cellulose,GH5_22
 20 | Cellulose,GH5_4
 21 | Cellulose,GH5_5
 22 | Cellulose,GH5_7
 23 | Cellulose,GH51
 24 | Cellulose,GH55
 25 | Cellulose,GH6
 26 | Cellulose,GH6_2
 27 | Cellulose,GH61
 28 | Cellulose,GH7
 29 | Cellulose,GH74
 30 | Cutin,CE5
 31 | Hemicellulose,AA14
 32 | Hemicellulose,AA3_1
 33 | Hemicellulose,CE1
 34 | Hemicellulose,CE15
 35 | Hemicellulose,CE16
 36 | Hemicellulose,CE2
 37 | Hemicellulose,CE3
 38 | Hemicellulose,CE4
 39 | Hemicellulose,CE5
 40 | Hemicellulose,CE6
 41 | Hemicellulose,CE7
 42 | Hemicellulose,GH10
 43 | Hemicellulose,GH11
 44 | Hemicellulose,GH113
 45 | Hemicellulose,GH115
 46 | Hemicellulose,GH120
 47 | Hemicellulose,GH13
 48 | Hemicellulose,GH141
 49 | Hemicellulose,GH16
 50 | Hemicellulose,GH2
 51 | Hemicellulose,GH26
 52 | Hemicellulose,GH27
 53 | Hemicellulose,GH29
 54 | Hemicellulose,GH3
 55 | Hemicellulose,GH30
 56 | Hemicellulose,GH30_1
 57 | Hemicellulose,GH30_7
 58 | Hemicellulose,GH31
 59 | Hemicellulose,GH35
 60 | Hemicellulose,GH36
 61 | Hemicellulose,GH39
 62 | Hemicellulose,GH43
 63 | Hemicellulose,GH44
 64 | Hemicellulose,GH5
 65 | Hemicellulose,GH5_1
 66 | Hemicellulose,GH5_2
 67 | Hemicellulose,GH5_22
 68 | Hemicellulose,GH5_7
 69 | Hemicellulose,GH51
 70 | Hemicellulose,GH52
 71 | Hemicellulose,GH53
 72 | Hemicellulose,GH54
 73 | Hemicellulose,GH55
 74 | Hemicellulose,GH62
 75 | Hemicellulose,GH67
 76 | Hemicellulose,GH72
 77 | Hemicellulose,GH74
 78 | Hemicellulose,GH79
 79 | Hemicellulose,GH8
 80 | Hemicellulose,GH93
 81 | Hemicellulose,GH95
 82 | Lignin,AA1_1
 83 | Lignin,AA1_3
 84 | Lignin,AA2
 85 | Lignin,AA3_1
 86 | Lignin,AA3_2
 87 | Lignin,AA5
 88 | Pectin,CE1
 89 | Pectin,CE12
 90 | Pectin,CE13
 91 | Pectin,CE8
 92 | Pectin,GH1
 93 | Pectin,GH105
 94 | Pectin,GH106
 95 | Pectin,GH16
 96 | Pectin,GH2
 97 | Pectin,GH27
 98 | Pectin,GH28
 99 | Pectin,GH3
100 | Pectin,GH30
101 | Pectin,GH35
102 | Pectin,GH36
103 | Pectin,GH4
104 | Pectin,GH42
105 | Pectin,GH43
106 | Pectin,GH49
107 | Pectin,GH5
108 | Pectin,GH51
109 | Pectin,GH53
110 | Pectin,GH54
111 | Pectin,GH62
112 | Pectin,GH78
113 | Pectin,GH79
114 | Pectin,GH88
115 | Pectin,PL1
116 | Pectin,PL1_4
117 | Pectin,PL1_7
118 | Pectin,PL11
119 | Pectin,PL2
120 | Pectin,PL26
121 | Pectin,PL3
122 | Pectin,PL3_2
123 | Pectin,PL4
124 | Pectin,PL9
125 | 


--------------------------------------------------------------------------------
/CSEP_CAZyme_prediction/CSEPfilter:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #Script to filter protein set for candidate effectors
 3 | 
 4 | display_usage() { #Print the usage message to stderr and abort with a non-zero status
 5 | 	echo "USAGE: $0 sample_prefix" >&2
 6 | 	exit 1
 7 | }
 8 | 
 9 | if [ -z "$1" ]
10 | then
11 |     display_usage
12 |     exit
13 | fi
14 | 
15 | #Function to check a file for DOS line endings: compares it with its dos2unix output and converts it in place if they differ (logs to CSEPfilter_${SAMPLE}.log)
16 | check_file() {
17 | 	local FILE=$1
18 | 
19 | 	if dos2unix < "$FILE" | cmp - "$FILE"; then
20 | 		:
21 | 	else
22 | 		echo "Provided file $FILE has wrong line endings" 2>&1 | tee -a "CSEPfilter_${SAMPLE}.log"
23 | 		dos2unix "${FILE}" 2>&1 | tee -a "CSEPfilter_${SAMPLE}.log"
24 | 	fi
25 | }
26 | 
27 | #Function to write to OUT_LIST every line of MAIN_LIST that is not in FILTER_LIST (an empty FILTER_LIST removes nothing)
28 | filter_out() {
29 | 	local FILTER_LIST=$1
30 | 	local MAIN_LIST=$2
31 | 	local OUT_LIST=$3
32 | 	#FILENAME==ARGV[1] rather than NR==FNR: with an empty filter file NR==FNR is also true for the main list and would drop every line
33 | 	awk 'FILENAME==ARGV[1]{a[$0]=1;next}!a[$0]' "$FILTER_LIST" "$MAIN_LIST" > "$OUT_LIST"
34 | }
35 | 
36 | #Function to write the lines present in both SECOND_LIST and MAIN_LIST (in sorted order) to OUT_LIST
37 | find_common() {
38 | 	local SECOND_LIST=$1
39 | 	local MAIN_LIST=$2
40 | 	local OUT_LIST=$3
41 | 	#comm needs sorted input, so both lists are sorted on the fly; -12 keeps only lines common to both
42 | 	comm -12 <(sort "$SECOND_LIST") <(sort "$MAIN_LIST") > "$OUT_LIST"
43 | }
44 | 
45 | SAMPLE=$1
46 | echo "--------------------------------------------------"
47 | echo "Running sample ${SAMPLE}" 2>&1 | tee CSEPfilter_${SAMPLE}.log
48 | 
49 | check_file signalp/${SAMPLE}_signalp_SPlist
50 | echo "Total number of proteins: `grep "^[^#;]" signalp/${SAMPLE}_signalp_summary.signalp5 | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
51 | echo "SignalP number of signal peptide proteins: `cat signalp/${SAMPLE}_signalp_SPlist | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
52 | echo "--------------------------------------------------"
53 | 
54 | echo ""
55 | echo "Cross-checking signal peptide prediction with TargetP and Phobius" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
56 | echo "-----------------------------------------------------------------"
57 | check_file targetp/${SAMPLE}_targetp_SPlist
58 | find_common targetp/${SAMPLE}_targetp_SPlist signalp/${SAMPLE}_signalp_SPlist filter1
59 | echo "TargetP also SP: `cat filter1 | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
60 | check_file phobius/${SAMPLE}_phobius_SPlist
61 | find_common phobius/${SAMPLE}_phobius_SPlist filter1 filter1.2
62 | echo "Phobius also SP: `cat filter1.2 | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
63 | 
64 | echo ""
65 | echo "Removing proteins with >1 transmembrane domains" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
66 | echo "-----------------------------------------------"
67 | check_file tmhmm/${SAMPLE}_tmhmm_TMlist
68 | filter_out tmhmm/${SAMPLE}_tmhmm_TMlist filter1.2 filter2
69 | echo "TMHMM >1 TMs removed: `cat filter2 | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
70 | check_file phobius/${SAMPLE}_phobius_TMlist
71 | filter_out phobius/${SAMPLE}_phobius_TMlist filter2 filter2.2
72 | echo "Phobius >1 TMs removed: `cat filter2.2 | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
73 | 
74 | echo ""
75 | echo "Removing proteins with cellular localisation contradicting secretion" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
76 | echo "--------------------------------------------------------------------"
77 | check_file prosite/${SAMPLE}_psscan_ERlist
78 | filter_out prosite/${SAMPLE}_psscan_ERlist filter2.2 filter3
79 | echo "Prosite ER localised removed: `cat filter3 | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
80 | check_file nucpred/${SAMPLE}_nucpred_list
81 | filter_out nucpred/${SAMPLE}_nucpred_list filter3 filter3.2
82 | echo "NucPred nucleus localised removed: `cat filter3.2 | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
83 | check_file predgpi/${SAMPLE}_predgpi_GPlist
84 | filter_out predgpi/${SAMPLE}_predgpi_GPlist filter3.2 filter3.3
85 | echo "PredGPI GPI anchored removed: `cat filter3.3 | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
86 | 
87 | echo ""
88 | echo "Cross-checking effector prediction with EffectorP" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
89 | echo "-----------------------------------------------------------------"
90 | check_file effectorp/${SAMPLE}_effectorlist
91 | find_common effectorp/${SAMPLE}_effectorlist filter3.3 ${SAMPLE}_candidate_effectors
92 | echo "EffectorP also effectors: `cat ${SAMPLE}_candidate_effectors | wc -l`" 2>&1 | tee -a CSEPfilter_${SAMPLE}.log
93 | echo ""
94 | 
95 | echo "Final set of `cat ${SAMPLE}_candidate_effectors | wc -l` potential effectors listed in file ${SAMPLE}_candidate_effectors"
96 | 
97 | rm filter1 filter1.2 filter2 filter2.2 filter3 filter3.2 filter3.3
98 | 


--------------------------------------------------------------------------------
/annotation/maker/round1/fusotu1_abyss_round1_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu1_abyss_masked/fusotu1_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fusoxy1_EST_20171014_cluster_consensi.fasta
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fusoxy1_GeneCatalog_proteins_20171014.aa.fasta
23 | protein_gff=  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm= #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species= #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=1 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=1 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon is enabled'
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round1/fusotu3_abyss_round1_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu3_abyss_masked/fusotu3_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fusoxy1_EST_20171014_cluster_consensi.fasta
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fusoxy1_GeneCatalog_proteins_20171014.aa.fasta
23 | protein_gff=  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm= #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species= #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=1 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=1 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round1/fusotu5_abyss_round1_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu5_abyss_masked/fusotu5_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fusoxy1_EST_20171014_cluster_consensi.fasta
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fusoxy1_GeneCatalog_proteins_20171014.aa.fasta
23 | protein_gff=  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm= #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species= #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=1 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=1 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round1/fusotu6_abyss_round1_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu6_abyss_masked/fusotu6_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fuseq1_EST_20180330_cluster_consensi.fasta
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fuseq1_GeneCatalog_proteins_20180330.aa.fasta
23 | protein_gff=  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm= #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species= #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=1 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=1 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round1/fusotu7_abyss_round1_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu7_abyss_masked/fusotu7_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fusoxy1_EST_20171014_cluster_consensi.fasta
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/Fusoxy1_GeneCatalog_proteins_20171014.aa.fasta
23 | protein_gff=  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm= #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species= #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=1 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=1 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round2/fusotu1_abyss_round2_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu1_abyss_masked/fusotu1_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu1_abyss_rnd1.maker.output/fusotu1_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu1_abyss_rnd1.maker.output/fusotu1_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round2/training_snap/1/fusotu1_abyss_rnd1.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round2/fusotu3_abyss_round2_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu3_abyss_masked/fusotu3_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu3_abyss_rnd1.maker.output/fusotu3_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu3_abyss_rnd1.maker.output/fusotu3_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round2/training_snap/3/fusotu3_abyss_rnd1.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round2/fusotu5_abyss_round2_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu5_abyss_masked/fusotu5_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu5_abyss_rnd1.maker.output/fusotu5_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu5_abyss_rnd1.maker.output/fusotu5_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round2/training_snap/5/fusotu5_abyss_rnd1.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round2/fusotu6_abyss_round2_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu6_abyss_masked/fusotu6_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu6_abyss_rnd1.maker.output/fusotu6_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu6_abyss_rnd1.maker.output/fusotu6_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round2/training_snap/6/fusotu6_abyss_rnd1.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round2/fusotu7_abyss_round2_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu7_abyss_masked/fusotu7_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu7_abyss_rnd1.maker.output/fusotu7_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu7_abyss_rnd1.maker.output/fusotu7_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round2/training_snap/7/fusotu7_abyss_rnd1.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/fusotu1_abyss_round3_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu1_abyss_masked/fusotu1_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu1_abyss_rnd1.maker.output/fusotu1_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu1_abyss_rnd1.maker.output/fusotu1_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round3/training_snap2/1/fusotu1_abyss_rnd2.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/fusotu3_abyss_round3_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu3_abyss_masked/fusotu3_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu3_abyss_rnd1.maker.output/fusotu3_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu3_abyss_rnd1.maker.output/fusotu3_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round3/training_snap2/3/fusotu3_abyss_rnd2.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/fusotu5_abyss_round3_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu5_abyss_masked/fusotu5_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu5_abyss_rnd1.maker.output/fusotu5_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu5_abyss_rnd1.maker.output/fusotu5_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round3/training_snap2/5/fusotu5_abyss_rnd2.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/fusotu6_abyss_round3_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu6_abyss_masked/fusotu6_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu6_abyss_rnd1.maker.output/fusotu6_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu6_abyss_rnd1.maker.output/fusotu6_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round3/training_snap2/6/fusotu6_abyss_rnd2.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/annotation/maker/round3/fusotu7_abyss_round3_maker_opts.ctl:
--------------------------------------------------------------------------------
 1 | #-----Genome (these are always required)
 2 | genome=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/repeat_masking/fusotu7_abyss_masked/fusotu7_abyss_pilon.fasta.masked
 3 | organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
 4 | 
 5 | #-----Re-annotation Using MAKER Derived GFF3
 6 | maker_gff= #MAKER derived GFF3 file
 7 | est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
 8 | altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
 9 | protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
10 | rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
11 | model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
12 | pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
13 | other_pass=0 #passthrough anything else in maker_gff: 1 = yes, 0 = no
14 | 
15 | #-----EST Evidence (for best results provide a file for at least one)
16 | est=
17 | altest= #EST/cDNA sequence file in fasta format from an alternate organism
18 | est_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu7_abyss_rnd1.maker.output/fusotu7_abyss_rnd1.all.maker.est2genome.gff #aligned ESTs or mRNA-seq from an external GFF3 file
19 | altest_gff= #aligned ESTs from a closely related species in GFF3 format
20 | 
21 | #-----Protein Homology Evidence (for best results provide a file for at least one)
22 | protein=
23 | protein_gff=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round1/fusotu7_abyss_rnd1.maker.output/fusotu7_abyss_rnd1.all.maker.protein2genome.gff  #aligned protein homology evidence from an external GFF3 file
24 | 
25 | #-----Repeat Masking (leave values blank to skip repeat masking)
26 | model_org= #select a model organism for RepBase masking in RepeatMasker
27 | rmlib=
28 | repeat_protein= #provide a fasta file of transposable element proteins for RepeatRunner
29 | rm_gff= #pre-identified repeat elements from an external GFF3 file
30 | prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
31 | softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
32 | 
33 | #-----Gene Prediction
34 | snaphmm=/data/SBCS-BuggsLab/RowenaHill/fus_comparison/annotation/maker/round3/training_snap2/7/fusotu7_abyss_rnd2.zff.length50_aed0.25.hmm #SNAP HMM file
35 | gmhmm= #GeneMark HMM file
36 | augustus_species=fusarium #Augustus gene prediction species model
37 | fgenesh_par_file= #FGENESH parameter file
38 | pred_gff= #ab-initio predictions from an external GFF3 file
39 | model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
40 | est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
41 | protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
42 | trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
43 | snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
44 | unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
45 | 
46 | #-----Other Annotation Feature Types (features MAKER doesn't recognize)
47 | other_gff= #extra features to pass-through to final MAKER generated GFF3 file
48 | 
49 | #-----External Application Behavior Options
50 | alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
51 | cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
52 | 
53 | #-----MAKER Behavior Options
54 | max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
55 | min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
56 | 
57 | pred_flank=200 #flank for extending evidence clusters sent to gene predictors
58 | pred_stats=0 #report AED and QI statistics for all predictions as well as models
59 | AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
60 | min_protein=0 #require at least this many amino acids in predicted proteins
61 | alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
62 | always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
63 | map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
64 | keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
65 | 
66 | split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
67 | single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
68 | single_length=250 #min length required for single exon ESTs if 'single_exon' is enabled
69 | correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
70 | 
71 | tries=2 #number of times to try a contig if there is a failure for some reason
72 | clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
73 | clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
74 | TMP= #specify a directory other than the system default temporary directory for temporary files
75 | 


--------------------------------------------------------------------------------
/assembly/polishing/mito_remove_1:
--------------------------------------------------------------------------------
  1 | 100_pilon	237	mitochondrion
  2 | 1010_pilon	247	mitochondrion
  3 | 1016_pilon	1007	mitochondrion
  4 | 1117_pilon	237	mitochondrion
  5 | 1119_pilon	236	mitochondrion
  6 | 1121_pilon	247	mitochondrion
  7 | 1123_pilon	705	mitochondrion
  8 | 1124_pilon	1097	mitochondrion
  9 | 1153_pilon	247	mitochondrion
 10 | 1180_pilon	247	mitochondrion
 11 | 1181_pilon	219	mitochondrion
 12 | 1182_pilon	237	mitochondrion
 13 | 1189_pilon	240	mitochondrion
 14 | 1192_pilon	247	mitochondrion
 15 | 119_pilon	247	mitochondrion
 16 | 1219_pilon	2381	mitochondrion
 17 | 1291_pilon	247	mitochondrion
 18 | 1326_pilon	238	mitochondrion
 19 | 1339_pilon	247	mitochondrion
 20 | 1369_pilon	238	mitochondrion
 21 | 1372_pilon	247	mitochondrion
 22 | 138_pilon	237	mitochondrion
 23 | 1468_pilon	247	mitochondrion
 24 | 1471_pilon	238	mitochondrion
 25 | 1479_pilon	247	mitochondrion
 26 | 1501_pilon	237	mitochondrion
 27 | 1502_pilon	240	mitochondrion
 28 | 1548_pilon	237	mitochondrion
 29 | 1564_pilon	240	mitochondrion
 30 | 1565_pilon	247	mitochondrion
 31 | 1566_pilon	347	mitochondrion
 32 | 1584_pilon	438	mitochondrion
 33 | 1585_pilon	445	mitochondrion
 34 | 1635_pilon	247	mitochondrion
 35 | 1643_pilon	238	mitochondrion
 36 | 1661_pilon	245	mitochondrion
 37 | 1669_pilon	245	mitochondrion
 38 | 1705_pilon	247	mitochondrion
 39 | 1736_pilon	246	mitochondrion
 40 | 1750_pilon	247	mitochondrion
 41 | 1807_pilon	237	mitochondrion
 42 | 1810_pilon	247	mitochondrion
 43 | 1857_pilon	530	mitochondrion
 44 | 1884_pilon	247	mitochondrion
 45 | 1920_pilon	237	mitochondrion
 46 | 1927_pilon	247	mitochondrion
 47 | 1932_pilon	589	mitochondrion
 48 | 1947_pilon	524	mitochondrion
 49 | 1949_pilon	407	mitochondrion
 50 | 1954_pilon	236	mitochondrion
 51 | 199_pilon	247	mitochondrion
 52 | 2012_pilon	247	mitochondrion
 53 | 202_pilon	240	mitochondrion
 54 | 2035_pilon	247	mitochondrion
 55 | 2056_pilon	238	mitochondrion
 56 | 2079_pilon	247	mitochondrion
 57 | 2087_pilon	247	mitochondrion
 58 | 2091_pilon	247	mitochondrion
 59 | 2102_pilon	247	mitochondrion
 60 | 2130_pilon	247	mitochondrion
 61 | 2138_pilon	247	mitochondrion
 62 | 2146_pilon	237	mitochondrion
 63 | 2219_pilon	238	mitochondrion
 64 | 2244_pilon	210	mitochondrion
 65 | 2287_pilon	238	mitochondrion
 66 | 2288_pilon	247	mitochondrion
 67 | 2323_pilon	247	mitochondrion
 68 | 2337_pilon	247	mitochondrion
 69 | 2371_pilon	357	mitochondrion
 70 | 237_pilon	237	mitochondrion
 71 | 2397_pilon	242	mitochondrion
 72 | 2401_pilon	247	mitochondrion
 73 | 2447_pilon	1012	mitochondrion
 74 | 2470_pilon	553	mitochondrion
 75 | 2483_pilon	573	mitochondrion
 76 | 2517_pilon	216	mitochondrion
 77 | 2524_pilon	247	mitochondrion
 78 | 2527_pilon	756	mitochondrion
 79 | 253_pilon	1783	mitochondrion
 80 | 2546_pilon	247	mitochondrion
 81 | 254_pilon	1983	mitochondrion
 82 | 2560_pilon	234	mitochondrion
 83 | 2588_pilon	247	mitochondrion
 84 | 2621_pilon	247	mitochondrion
 85 | 2622_pilon	247	mitochondrion
 86 | 2697_pilon	247	mitochondrion
 87 | 2701_pilon	574	mitochondrion
 88 | 2702_pilon	609	mitochondrion
 89 | 2781_pilon	238	mitochondrion
 90 | 2784_pilon	301	mitochondrion
 91 | 278_pilon	230	mitochondrion
 92 | 2799_pilon	237	mitochondrion
 93 | 2808_pilon	247	mitochondrion
 94 | 2812_pilon	247	mitochondrion
 95 | 2832_pilon	247	mitochondrion
 96 | 2879_pilon	247	mitochondrion
 97 | 2881_pilon	247	mitochondrion
 98 | 2949_pilon	247	mitochondrion
 99 | 2959_pilon	247	mitochondrion
100 | 2961_pilon	247	mitochondrion
101 | 2976_pilon	243	mitochondrion
102 | 2984_pilon	247	mitochondrion
103 | 3024_pilon	350	mitochondrion
104 | 3040_pilon	236	mitochondrion
105 | 3048_pilon	321	mitochondrion
106 | 3052_pilon	247	mitochondrion
107 | 3073_pilon	237	mitochondrion
108 | 3128_pilon	237	mitochondrion
109 | 3167_pilon	236	mitochondrion
110 | 3170_pilon	247	mitochondrion
111 | 3184_pilon	237	mitochondrion
112 | 3191_pilon	247	mitochondrion
113 | 3192_pilon	247	mitochondrion
114 | 3197_pilon	216	mitochondrion
115 | 3236_pilon	247	mitochondrion
116 | 3258_pilon	230	mitochondrion
117 | 3306_pilon	237	mitochondrion
118 | 3346_pilon	247	mitochondrion
119 | 3352_pilon	1037	mitochondrion
120 | 3401_pilon	247	mitochondrion
121 | 3404_pilon	247	mitochondrion
122 | 3426_pilon	233	mitochondrion
123 | 3491_pilon	247	mitochondrion
124 | 3501_pilon	396	mitochondrion
125 | 3505_pilon	1000	mitochondrion
126 | 355_pilon	247	mitochondrion
127 | 358_pilon	382	mitochondrion
128 | 391_pilon	524	mitochondrion
129 | 420_pilon	209	mitochondrion
130 | 430_pilon	237	mitochondrion
131 | 434_pilon	247	mitochondrion
132 | 44_pilon	247	mitochondrion
133 | 458_pilon	238	mitochondrion
134 | 459_pilon	237	mitochondrion
135 | 45_pilon	237	mitochondrion
136 | 461_pilon	247	mitochondrion
137 | 498_pilon	247	mitochondrion
138 | 530_pilon	217	mitochondrion
139 | 540_pilon	236	mitochondrion
140 | 542_pilon	247	mitochondrion
141 | 610_pilon	391	mitochondrion
142 | 64_pilon	247	mitochondrion
143 | 690_pilon	247	mitochondrion
144 | 702_pilon	237	mitochondrion
145 | 744_pilon	237	mitochondrion
146 | 746_pilon	247	mitochondrion
147 | 767_pilon	247	mitochondrion
148 | 773_pilon	247	mitochondrion
149 | 774_pilon	247	mitochondrion
150 | 778_pilon	238	mitochondrion
151 | 817_pilon	247	mitochondrion
152 | 820_pilon	721	mitochondrion
153 | 848_pilon	260	mitochondrion
154 | 851_pilon	223	mitochondrion
155 | 912_pilon	247	mitochondrion
156 | 936_pilon	247	mitochondrion
157 | 939_pilon	1031	mitochondrion
158 | 994_pilon	788	mitochondrion
159 | 1442_pilon	2565	1..1302,2353..2565	mitochondrion-not_cleaned
160 | 3508_pilon	2721	1..366,1873..2721	mitochondrion-not_cleaned
161 | 


--------------------------------------------------------------------------------
/assembly/polishing/mito_remove_7:
--------------------------------------------------------------------------------
  1 | 1006_pilon	245	mitochondrion
  2 | 1010_pilon	243	mitochondrion
  3 | 1052_pilon	244	mitochondrion
  4 | 1055_pilon	253	mitochondrion
  5 | 1070_pilon	1030	mitochondrion
  6 | 1079_pilon	243	mitochondrion
  7 | 1108_pilon	227	mitochondrion
  8 | 1110_pilon	242	mitochondrion
  9 | 1113_pilon	253	mitochondrion
 10 | 1118_pilon	253	mitochondrion
 11 | 1133_pilon	438	mitochondrion
 12 | 1134_pilon	243	mitochondrion
 13 | 1136_pilon	253	mitochondrion
 14 | 1170_pilon	243	mitochondrion
 15 | 1193_pilon	249	mitochondrion
 16 | 1194_pilon	244	mitochondrion
 17 | 1195_pilon	253	mitochondrion
 18 | 1221_pilon	253	mitochondrion
 19 | 1222_pilon	253	mitochondrion
 20 | 1248_pilon	253	mitochondrion
 21 | 1277_pilon	244	mitochondrion
 22 | 127_pilon	243	mitochondrion
 23 | 1282_pilon	253	mitochondrion
 24 | 1285_pilon	232	mitochondrion
 25 | 1287_pilon	253	mitochondrion
 26 | 128_pilon	245	mitochondrion
 27 | 1319_pilon	301	mitochondrion
 28 | 1323_pilon	2212	mitochondrion
 29 | 1346_pilon	211	mitochondrion
 30 | 1375_pilon	243	mitochondrion
 31 | 1421_pilon	243	mitochondrion
 32 | 1427_pilon	699	mitochondrion
 33 | 1463_pilon	253	mitochondrion
 34 | 146_pilon	244	mitochondrion
 35 | 1497_pilon	253	mitochondrion
 36 | 149_pilon	631	mitochondrion
 37 | 1500_pilon	253	mitochondrion
 38 | 1501_pilon	246	mitochondrion
 39 | 1505_pilon	1381	mitochondrion
 40 | 150_pilon	524	mitochondrion
 41 | 1524_pilon	253	mitochondrion
 42 | 1555_pilon	246	mitochondrion
 43 | 1566_pilon	242	mitochondrion
 44 | 1567_pilon	253	mitochondrion
 45 | 1571_pilon	656	mitochondrion
 46 | 1602_pilon	245	mitochondrion
 47 | 1634_pilon	253	mitochondrion
 48 | 1670_pilon	253	mitochondrion
 49 | 1673_pilon	253	mitochondrion
 50 | 1685_pilon	253	mitochondrion
 51 | 1688_pilon	253	mitochondrion
 52 | 1697_pilon	244	mitochondrion
 53 | 1733_pilon	243	mitochondrion
 54 | 1740_pilon	243	mitochondrion
 55 | 1758_pilon	358	mitochondrion
 56 | 1767_pilon	220	mitochondrion
 57 | 1779_pilon	1370	mitochondrion
 58 | 179_pilon	253	mitochondrion
 59 | 1803_pilon	243	mitochondrion
 60 | 1809_pilon	246	mitochondrion
 61 | 1810_pilon	253	mitochondrion
 62 | 1812_pilon	243	mitochondrion
 63 | 1832_pilon	339	mitochondrion
 64 | 185_pilon	244	mitochondrion
 65 | 1863_pilon	243	mitochondrion
 66 | 1867_pilon	253	mitochondrion
 67 | 1894_pilon	242	mitochondrion
 68 | 1898_pilon	246	mitochondrion
 69 | 1900_pilon	887	mitochondrion
 70 | 1925_pilon	244	mitochondrion
 71 | 1934_pilon	244	mitochondrion
 72 | 1949_pilon	2592	mitochondrion
 73 | 195_pilon	574	mitochondrion
 74 | 1967_pilon	253	mitochondrion
 75 | 196_pilon	253	mitochondrion
 76 | 1991_pilon	407	mitochondrion
 77 | 1994_pilon	253	mitochondrion
 78 | 2035_pilon	246	mitochondrion
 79 | 2065_pilon	243	mitochondrion
 80 | 2092_pilon	253	mitochondrion
 81 | 2096_pilon	253	mitochondrion
 82 | 2121_pilon	245	mitochondrion
 83 | 2123_pilon	253	mitochondrion
 84 | 2125_pilon	614	mitochondrion
 85 | 2126_pilon	834	mitochondrion
 86 | 2133_pilon	1119	mitochondrion
 87 | 2172_pilon	562	mitochondrion
 88 | 2178_pilon	253	mitochondrion
 89 | 2191_pilon	253	mitochondrion
 90 | 2192_pilon	253	mitochondrion
 91 | 21_pilon	253	mitochondrion
 92 | 2214_pilon	203	mitochondrion
 93 | 2241_pilon	243	mitochondrion
 94 | 2249_pilon	253	mitochondrion
 95 | 2250_pilon	1557	mitochondrion
 96 | 2287_pilon	243	mitochondrion
 97 | 2304_pilon	232	mitochondrion
 98 | 2329_pilon	234	mitochondrion
 99 | 2332_pilon	253	mitochondrion
100 | 2333_pilon	243	mitochondrion
101 | 2360_pilon	236	mitochondrion
102 | 2361_pilon	253	mitochondrion
103 | 236_pilon	237	mitochondrion
104 | 239_pilon	253	mitochondrion
105 | 2418_pilon	201	mitochondrion
106 | 2424_pilon	244	mitochondrion
107 | 2427_pilon	219	mitochondrion
108 | 242_pilon	253	mitochondrion
109 | 2432_pilon	243	mitochondrion
110 | 2434_pilon	294	mitochondrion
111 | 243_pilon	242	mitochondrion
112 | 2460_pilon	243	mitochondrion
113 | 2462_pilon	231	mitochondrion
114 | 2471_pilon	253	mitochondrion
115 | 2474_pilon	253	mitochondrion
116 | 2476_pilon	331	mitochondrion
117 | 24_pilon	444	mitochondrion
118 | 2518_pilon	253	mitochondrion
119 | 2528_pilon	253	mitochondrion
120 | 2529_pilon	253	mitochondrion
121 | 2530_pilon	253	mitochondrion
122 | 2539_pilon	253	mitochondrion
123 | 2552_pilon	693	mitochondrion
124 | 2577_pilon	253	mitochondrion
125 | 2581_pilon	243	mitochondrion
126 | 2617_pilon	244	mitochondrion
127 | 2620_pilon	243	mitochondrion
128 | 2630_pilon	253	mitochondrion
129 | 2633_pilon	244	mitochondrion
130 | 2653_pilon	253	mitochondrion
131 | 2663_pilon	3377	mitochondrion
132 | 267_pilon	973	mitochondrion
133 | 2684_pilon	243	mitochondrion
134 | 2689_pilon	253	mitochondrion
135 | 2690_pilon	253	mitochondrion
136 | 2719_pilon	253	mitochondrion
137 | 2723_pilon	253	mitochondrion
138 | 2756_pilon	253	mitochondrion
139 | 2757_pilon	855	mitochondrion
140 | 292_pilon	242	mitochondrion
141 | 346_pilon	253	mitochondrion
142 | 370_pilon	244	mitochondrion
143 | 458_pilon	253	mitochondrion
144 | 461_pilon	243	mitochondrion
145 | 465_pilon	253	mitochondrion
146 | 466_pilon	408	mitochondrion
147 | 518_pilon	253	mitochondrion
148 | 532_pilon	253	mitochondrion
149 | 536_pilon	209	mitochondrion
150 | 538_pilon	243	mitochondrion
151 | 565_pilon	253	mitochondrion
152 | 567_pilon	243	mitochondrion
153 | 576_pilon	253	mitochondrion
154 | 613_pilon	245	mitochondrion
155 | 646_pilon	253	mitochondrion
156 | 65_pilon	242	mitochondrion
157 | 677_pilon	253	mitochondrion
158 | 679_pilon	253	mitochondrion
159 | 681_pilon	244	mitochondrion
160 | 68_pilon	253	mitochondrion
161 | 746_pilon	253	mitochondrion
162 | 779_pilon	242	mitochondrion
163 | 814_pilon	253	mitochondrion
164 | 815_pilon	253	mitochondrion
165 | 868_pilon	585	mitochondrion
166 | 869_pilon	620	mitochondrion
167 | 890_pilon	253	mitochondrion
168 | 894_pilon	253	mitochondrion
169 | 903_pilon	253	mitochondrion
170 | 90_pilon	253	mitochondrion
171 | 91_pilon	350	mitochondrion
172 | 920_pilon	242	mitochondrion
173 | 924_pilon	253	mitochondrion
174 | 93_pilon	395	mitochondrion
175 | 968_pilon	243	mitochondrion
176 | 971_pilon	243	mitochondrion
177 | 999_pilon	246	mitochondrion
178 | 


--------------------------------------------------------------------------------
/lifestyle_comparison/run_edited.py:
--------------------------------------------------------------------------------
  1 | ## Developed by Fantin Mesny 
  2 | ## Max Planck Institute For Plant Breeding Research (Cologne, Germany)
  3 | 
  4 | # Added line to output phylPCA data on line 140 - Rowena Hill
  5 | # Added Bonferroni multiple testing correction to pairwise PERMANOVA on line 76 - Rowena Hill
  6 | 
  7 | import sys
  8 | import argparse
  9 | import pandas as pd
 10 | from Bio import Phylo
 11 | import matplotlib.pyplot as plt
 12 | import seaborn as sns
 13 | from sklearn.decomposition import PCA
 14 | from sklearn.metrics.pairwise import pairwise_distances
 15 | from itertools import combinations_with_replacement 
 16 | import subprocess
 17 | import networkx as nx
 18 | 
 19 | def get_params(argv):
 20 |     parser = argparse.ArgumentParser(description='Analyse the composition in genes of different genome categories, taking phylogeny into account')
 21 |     parser.add_argument('-t', '--t', help="Phylogenetic tree, with leaf labels matching the genome names in the data table", required=True)
 22 |     parser.add_argument('-i', '--i', help="Dataframe of gene counts (gene families as columns, and genomes as rows). See 'example.csv'", required=True)
 23 |     parser.add_argument('-o', '--o', help="Output directory", required=True)
 24 |     parser.add_argument('-colors', '--colors', help="Color to match each lifestyle category: 'lifestyleA:blue,lifestyleB:#00FF00,...'", default='')
 25 |     a = parser.parse_args()
 26 |     return a
 27 | 
 28 | def getDistMatrix(tree):
 29 |     df=pd.DataFrame()
 30 |     nodes=[a.name for a in tree.depths(unit_branch_lengths=True) if a.name!=None]
 31 |     comb=list(combinations_with_replacement(nodes, 2))
 32 |     for c in comb:
 33 |         df.loc[c[0],c[1]]=tree.distance(c[0],c[1])
 34 |         df.loc[c[1],c[0]]=tree.distance(c[1],c[0])
 35 |     return df
 36 | 
 37 | def doPCA(df):
 38 |     DF=pd.DataFrame()
 39 |     pca = PCA(n_components=2)
 40 |     pca_result = pca.fit_transform(df)
 41 |     DF['PC1'] = pca_result[:,0]
 42 |     DF['PC2'] = pca_result[:,1]
 43 |     DF['genome']=df.columns
 44 |     DF=DF.rename(index=str, columns={'PC1':'PC1 ('+str(pca.explained_variance_ratio_[0])[2:4].lstrip('0')+'%)','PC2':'PC2 ('+str(pca.explained_variance_ratio_[1])[2:4].lstrip('0')+'%)'})
 45 |     DF=DF.set_index('genome')
 46 |     return DF
 47 | 
 48 | def terminal(cmd):
 49 | 	p = subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
 50 | 	p.wait()
 51 | 
 52 | def plotPhylPCA(metadata, palet, order, output):
 53 |     fig,axes=plt.subplots(1,1, figsize=(12,8))
 54 |     sns.scatterplot(x=metadata.columns[0], y=metadata.columns[1], s=70, data=metadata, hue='lifestyle', palette=palet, hue_order=order, ax=axes)
 55 |     axes.set_title('PCA of pairwise phylogenetic distances')
 56 |     plt.tight_layout()
 57 |     plt.savefig(output+'phylogenetic_pca.pdf')    
 58 |     
 59 |     
 60 | def runStat(output):
 61 |     script="""
 62 |     args <- commandArgs(trailingOnly=TRUE)
 63 |     dir <- args[1]
 64 | 
 65 |     library(vegan)
 66 |     data<-read.csv(paste(dir,'data.csv',sep=''), row.names='genome')
 67 |     metadata<-read.csv(paste(dir,'metadata.csv',sep=''), row.names='genome')
 68 |     dist <- vegdist(data, method='jaccard')
 69 |     distMatrix <- as.data.frame(as.matrix(dist))
 70 |     perm <- adonis2(dist~PC1+PC2+lifestyle, data=metadata, permutations = 9999)
 71 |     capture.output(perm, file=paste(dir, 'permanova.txt', sep=''))
 72 |     write.csv(distMatrix,paste(dir,'distMatrix.csv',sep=''), row.names=TRUE)
 73 | 
 74 | 
 75 |     library(RVAideMemoire)
 76 |     permManova<-pairwise.perm.manova(dist,metadata$lifestyle, , nperm=9999, p.method="bonferroni")
 77 |     permManova <- as.data.frame(permManova[3])
 78 |     write.csv(permManova,paste(dir,'pairwiseComparisons.csv',sep=''), row.names=TRUE)
 79 |     
 80 |     """
 81 |     with open(output+'tmp.R','w+') as tmp:
 82 |         tmp.write(script)    
 83 |     terminal("Rscript "+output+"tmp.R '"+output+"'")
 84 |     terminal("rm "+output+"tmp.R")
 85 | 
 86 | def plotPCA(metadata, palet, order, output):
 87 |     distMatrix=pd.read_csv(output+'distMatrix.csv').set_index('Unnamed: 0')
 88 |     fig,axes=plt.subplots(1,1, figsize=(12,8))
 89 |     pca=doPCA(distMatrix).merge(metadata[['lifestyle']], left_index=True,right_index=True)
 90 |     pca.rename(index=str, columns={[c for c in pca.columns if 'PC1' in c][0]:'PC1',[c for c in pca.columns if 'PC2' in c][0]:'PC2'}).to_csv(a.o+'pca.csv')
 91 |     if palet=={}:
 92 |         ls=list(set(metadata['lifestyle']))
 93 |         palette=sns.color_palette(n_colors=len(ls))
 94 |         palet={ls[l]:palette[l] for l in range(len(ls))}
 95 |     sns.scatterplot(x=pca.columns[0], y=pca.columns[1], s=70, data=pca, hue='lifestyle', palette=palet, hue_order=order, ax=axes)
 96 |     axes.set_title('PCA of Jaccard distances calculated on genome compositions')
 97 |     plt.tight_layout()
 98 |     plt.savefig(output+'pca.pdf')
 99 |     return palet
100 |     
def plotPvalMatrix(output, palet):
    """Draw the matrix of pairwise PERMANOVA p-values and return it.

    Reads ``<output>pairwiseComparisons.csv``, strips the ``p.value.`` prefix
    from the column names and turns the remaining dots into spaces, then
    draws two overlaid heatmaps: one for the cells above 0.05 and one,
    annotated with the values, for the cells at or below 0.05. Tick labels
    are coloured with ``palet`` and the figure is saved as
    ``<output>pvalMatrix.pdf``.

    Returns
    -------
    pandas.DataFrame
        The p-value table indexed by ``'Lifestyles'``.
    """
    raw = pd.read_csv(output+'pairwiseComparisons.csv')
    raw = raw.rename(index=str, columns={'Unnamed: 0':'Lifestyles'})
    tidy_names = {}
    for col in raw.columns:
        tidy_names[col] = col.replace('p.value.','').replace('.',' ')
    permManova = raw.rename(index=str, columns=tidy_names).set_index('Lifestyles')

    fig, ax = plt.subplots(1, 1, figsize=(7,7))
    # First layer: cells above the 0.05 threshold (the others are masked).
    sns.heatmap(permManova, mask=permManova <= 0.05, square = True, linewidths = .5, cmap = 'Blues', cbar=False, vmin = -1000, vmax = 10000)
    # Second layer: cells at or below the threshold, annotated with the value.
    sns.heatmap(permManova, mask=permManova > 0.05, square = True, linewidths = .5, cmap = 'coolwarm_r', cbar_kws = {'shrink': .4, 'ticks' : [0.5, 0.33, 0]},vmin = -0.1, vmax=1,annot = True,annot_kws = {'size': 12},cbar=False)
    ax.xaxis.set_ticks_position('top')
    ax.xaxis.set_tick_params(rotation=45)
    # Colour each tick label with the colour of the lifestyle it names.
    for tick in ax.get_yticklabels():
        tick.set_color(palet[tick.get_text()])
    for tick in ax.get_xticklabels():
        tick.set_color(palet[tick.get_text()])
    plt.tight_layout()
    plt.savefig(output+'pvalMatrix.pdf')
    return permManova
114 | 
def plotNetwork(output, permManova, palet):
    """Draw a graph linking lifestyles whose pairwise p-value exceeds 0.05.

    Every row and column label of ``permManova`` becomes a node; an edge is
    added for each cell greater than 0.05. Nodes are coloured with ``palet``
    and the drawing is saved as ``<output>network.pdf``.
    """
    G = nx.Graph()
    G.add_nodes_from(set(list(permManova.columns)+list(permManova.index)))
    for col in permManova.columns:
        for row in permManova.index:
            if permManova.loc[row, col] > 0.05:
                G.add_edge(row, col)
    layout = nx.drawing.spring_layout(G)
    plt.figure(figsize=(5,5))
    node_colors = [palet[n] for n in G.nodes]
    nx.draw_networkx(
        G,
        pos=layout,
        with_labels=False,
        linewidths=1,
        alpha=1,
        node_color=node_colors,
        font_size=15,
    )
    plt.axis('off')
    plt.savefig(output+'network.pdf')
128 | 
129 | 
if __name__ == '__main__':
    import os

    # ``a`` stays a module-level name: other functions in this file may read it.
    a = get_params(sys.argv[1:])
    # All outputs are built by plain string concatenation, so the directory
    # must end with a slash.
    if a.o[-1]!='/':
        a.o=a.o+'/'
    # Create the output directory up front; otherwise the first write below
    # fails when it does not exist yet.
    os.makedirs(a.o, exist_ok=True)

    # Gene-count table: one row per genome, plus a 'lifestyle' column.
    data=pd.read_csv(a.i)
    data=data.set_index('genome')

    # Pairwise phylogenetic distances and their first two principal components.
    tree=Phylo.read(a.t, 'newick')
    phylDist=getDistMatrix(tree)
    phylDist.to_csv(a.o+'phyldistmatrix.csv')
    phylPCA=doPCA(phylDist)
    metadata=phylPCA.merge(data[['lifestyle']], left_index=True,right_index=True)
    # The R script refers to the axes as plain 'PC1'/'PC2', so the
    # percentage-annotated names are simplified in the exported copy only.
    metadata.rename(index=str, columns={[c for c in metadata.columns if 'PC1' in c][0]:'PC1',[c for c in metadata.columns if 'PC2' in c][0]:'PC2'}).to_csv(a.o+'metadata.csv')

    order=sorted(list(set(metadata['lifestyle'])))
    # Parse 'lifestyleA:blue,lifestyleB:#00FF00' into a dict. An empty dict
    # makes plotPCA pick a default palette.
    if a.colors=='':
        palet={}
    else:
        palet={c.split(':')[0]:c.split(':')[1] for c in a.colors.split(',')}

    # Export the counts without the lifestyle column, with rows in the same
    # order as metadata.
    data=data.drop(columns=['lifestyle'])
    data=data.reindex(metadata.index)
    data.to_csv(a.o+'data.csv')

    runStat(a.o)
    palet=plotPCA(metadata, palet, order, a.o)
    permManova=plotPvalMatrix(a.o, palet)
    plotNetwork(a.o, permManova, palet)
    plotPhylPCA(metadata, palet, order, a.o)
161 | 
162 | 
163 | 
164 | 
165 | 


--------------------------------------------------------------------------------
/assembly/polishing/mito_remove_5:
--------------------------------------------------------------------------------
  1 | 102_pilon	241	mitochondrion
  2 | 1045_pilon	241	mitochondrion
  3 | 1050_pilon	241	mitochondrion
  4 | 1077_pilon	231	mitochondrion
  5 | 1079_pilon	241	mitochondrion
  6 | 1083_pilon	241	mitochondrion
  7 | 1122_pilon	231	mitochondrion
  8 | 1131_pilon	241	mitochondrion
  9 | 1174_pilon	241	mitochondrion
 10 | 1176_pilon	241	mitochondrion
 11 | 1199_pilon	241	mitochondrion
 12 | 122_pilon	241	mitochondrion
 13 | 1240_pilon	241	mitochondrion
 14 | 1257_pilon	241	mitochondrion
 15 | 1333_pilon	232	mitochondrion
 16 | 1391_pilon	241	mitochondrion
 17 | 1394_pilon	241	mitochondrion
 18 | 1420_pilon	241	mitochondrion
 19 | 1436_pilon	241	mitochondrion
 20 | 1440_pilon	241	mitochondrion
 21 | 1527_pilon	241	mitochondrion
 22 | 154_pilon	232	mitochondrion
 23 | 1554_pilon	241	mitochondrion
 24 | 1555_pilon	241	mitochondrion
 25 | 1580_pilon	241	mitochondrion
 26 | 1581_pilon	233	mitochondrion
 27 | 1587_pilon	241	mitochondrion
 28 | 158_pilon	241	mitochondrion
 29 | 1613_pilon	241	mitochondrion
 30 | 162_pilon	241	mitochondrion
 31 | 1657_pilon	241	mitochondrion
 32 | 165_pilon	241	mitochondrion
 33 | 1678_pilon	249	mitochondrion
 34 | 1685_pilon	241	mitochondrion
 35 | 1715_pilon	241	mitochondrion
 36 | 1764_pilon	241	mitochondrion
 37 | 1804_pilon	231	mitochondrion
 38 | 1805_pilon	241	mitochondrion
 39 | 1806_pilon	241	mitochondrion
 40 | 1810_pilon	241	mitochondrion
 41 | 1828_pilon	241	mitochondrion
 42 | 1859_pilon	241	mitochondrion
 43 | 1860_pilon	241	mitochondrion
 44 | 1905_pilon	241	mitochondrion
 45 | 1906_pilon	241	mitochondrion
 46 | 1909_pilon	241	mitochondrion
 47 | 1911_pilon	241	mitochondrion
 48 | 1913_pilon	241	mitochondrion
 49 | 1937_pilon	356	mitochondrion
 50 | 1966_pilon	241	mitochondrion
 51 | 1982_pilon	231	mitochondrion
 52 | 2057_pilon	230	mitochondrion
 53 | 2084_pilon	514	mitochondrion
 54 | 2095_pilon	241	mitochondrion
 55 | 209_pilon	204	mitochondrion
 56 | 2105_pilon	241	mitochondrion
 57 | 2109_pilon	232	mitochondrion
 58 | 2142_pilon	241	mitochondrion
 59 | 2143_pilon	231	mitochondrion
 60 | 215_pilon	241	mitochondrion
 61 | 2211_pilon	241	mitochondrion
 62 | 2255_pilon	241	mitochondrion
 63 | 2263_pilon	231	mitochondrion
 64 | 232_pilon	241	mitochondrion
 65 | 233_pilon	241	mitochondrion
 66 | 2354_pilon	241	mitochondrion
 67 | 2406_pilon	241	mitochondrion
 68 | 2444_pilon	241	mitochondrion
 69 | 2500_pilon	259	mitochondrion
 70 | 2538_pilon	241	mitochondrion
 71 | 2591_pilon	241	mitochondrion
 72 | 2672_pilon	231	mitochondrion
 73 | 2696_pilon	241	mitochondrion
 74 | 2750_pilon	231	mitochondrion
 75 | 2791_pilon	241	mitochondrion
 76 | 2800_pilon	241	mitochondrion
 77 | 2813_pilon	226	mitochondrion
 78 | 2877_pilon	231	mitochondrion
 79 | 288_pilon	234	mitochondrion
 80 | 2940_pilon	241	mitochondrion
 81 | 3014_pilon	241	mitochondrion
 82 | 3023_pilon	241	mitochondrion
 83 | 3094_pilon	241	mitochondrion
 84 | 3167_pilon	234	mitochondrion
 85 | 3195_pilon	243	mitochondrion
 86 | 3224_pilon	241	mitochondrion
 87 | 3226_pilon	241	mitochondrion
 88 | 3272_pilon	232	mitochondrion
 89 | 3274_pilon	241	mitochondrion
 90 | 327_pilon	247	mitochondrion
 91 | 3306_pilon	241	mitochondrion
 92 | 3311_pilon	219	mitochondrion
 93 | 3337_pilon	241	mitochondrion
 94 | 3350_pilon	241	mitochondrion
 95 | 3430_pilon	241	mitochondrion
 96 | 3562_pilon	241	mitochondrion
 97 | 3591_pilon	241	mitochondrion
 98 | 3603_pilon	241	mitochondrion
 99 | 3608_pilon	241	mitochondrion
100 | 3609_pilon	241	mitochondrion
101 | 3613_pilon	1212	mitochondrion
102 | 3679_pilon	241	mitochondrion
103 | 367_pilon	241	mitochondrion
104 | 3684_pilon	238	mitochondrion
105 | 3686_pilon	241	mitochondrion
106 | 3688_pilon	231	mitochondrion
107 | 3695_pilon	721	mitochondrion
108 | 370_pilon	989	mitochondrion
109 | 3773_pilon	203	mitochondrion
110 | 3779_pilon	238	mitochondrion
111 | 3815_pilon	436	mitochondrion
112 | 3819_pilon	241	mitochondrion
113 | 3821_pilon	241	mitochondrion
114 | 3856_pilon	241	mitochondrion
115 | 3885_pilon	241	mitochondrion
116 | 3910_pilon	206	mitochondrion
117 | 3952_pilon	231	mitochondrion
118 | 3955_pilon	241	mitochondrion
119 | 3960_pilon	241	mitochondrion
120 | 3970_pilon	241	mitochondrion
121 | 3972_pilon	1209	mitochondrion
122 | 4029_pilon	241	mitochondrion
123 | 403_pilon	232	mitochondrion
124 | 404_pilon	241	mitochondrion
125 | 4058_pilon	241	mitochondrion
126 | 4060_pilon	241	mitochondrion
127 | 4066_pilon	241	mitochondrion
128 | 4070_pilon	1478	mitochondrion
129 | 4085_pilon	231	mitochondrion
130 | 4096_pilon	785	mitochondrion
131 | 4160_pilon	241	mitochondrion
132 | 4194_pilon	231	mitochondrion
133 | 4234_pilon	239	mitochondrion
134 | 4315_pilon	474	mitochondrion
135 | 4331_pilon	249	mitochondrion
136 | 4334_pilon	241	mitochondrion
137 | 4347_pilon	241	mitochondrion
138 | 4358_pilon	2191	mitochondrion
139 | 4398_pilon	241	mitochondrion
140 | 4405_pilon	241	mitochondrion
141 | 4444_pilon	241	mitochondrion
142 | 4520_pilon	241	mitochondrion
143 | 4521_pilon	241	mitochondrion
144 | 4548_pilon	241	mitochondrion
145 | 4549_pilon	241	mitochondrion
146 | 4552_pilon	241	mitochondrion
147 | 4586_pilon	230	mitochondrion
148 | 4597_pilon	223	mitochondrion
149 | 460_pilon	241	mitochondrion
150 | 461_pilon	241	mitochondrion
151 | 4628_pilon	241	mitochondrion
152 | 4630_pilon	1032	mitochondrion
153 | 465_pilon	241	mitochondrion
154 | 4661_pilon	231	mitochondrion
155 | 4663_pilon	241	mitochondrion
156 | 4731_pilon	462	mitochondrion
157 | 4734_pilon	241	mitochondrion
158 | 474_pilon	241	mitochondrion
159 | 4781_pilon	230	mitochondrion
160 | 4785_pilon	241	mitochondrion
161 | 4796_pilon	241	mitochondrion
162 | 484_pilon	241	mitochondrion
163 | 4857_pilon	262	mitochondrion
164 | 4864_pilon	631	mitochondrion
165 | 4901_pilon	241	mitochondrion
166 | 4902_pilon	241	mitochondrion
167 | 4970_pilon	241	mitochondrion
168 | 4983_pilon	585	mitochondrion
169 | 5021_pilon	241	mitochondrion
170 | 5052_pilon	241	mitochondrion
171 | 5072_pilon	241	mitochondrion
172 | 5075_pilon	241	mitochondrion
173 | 5087_pilon	241	mitochondrion
174 | 5095_pilon	319	mitochondrion
175 | 5157_pilon	241	mitochondrion
176 | 5196_pilon	241	mitochondrion
177 | 5235_pilon	241	mitochondrion
178 | 5237_pilon	241	mitochondrion
179 | 525_pilon	241	mitochondrion
180 | 5287_pilon	231	mitochondrion
181 | 5363_pilon	241	mitochondrion
182 | 5364_pilon	241	mitochondrion
183 | 53_pilon	307	mitochondrion
184 | 5415_pilon	241	mitochondrion
185 | 5428_pilon	241	mitochondrion
186 | 5436_pilon	440	mitochondrion
187 | 5447_pilon	241	mitochondrion
188 | 5482_pilon	237	mitochondrion
189 | 553_pilon	358	mitochondrion
190 | 5550_pilon	214	mitochondrion
191 | 5561_pilon	241	mitochondrion
192 | 5570_pilon	213	mitochondrion
193 | 5609_pilon	241	mitochondrion
194 | 5610_pilon	241	mitochondrion
195 | 562_pilon	241	mitochondrion
196 | 5646_pilon	241	mitochondrion
197 | 5655_pilon	241	mitochondrion
198 | 5681_pilon	231	mitochondrion
199 | 5693_pilon	526	mitochondrion
200 | 5694_pilon	241	mitochondrion
201 | 5696_pilon	232	mitochondrion
202 | 5734_pilon	232	mitochondrion
203 | 5737_pilon	241	mitochondrion
204 | 5754_pilon	241	mitochondrion
205 | 5768_pilon	241	mitochondrion
206 | 5785_pilon	241	mitochondrion
207 | 5802_pilon	246	mitochondrion
208 | 582_pilon	241	mitochondrion
209 | 5831_pilon	524	mitochondrion
210 | 5867_pilon	241	mitochondrion
211 | 5882_pilon	241	mitochondrion
212 | 5919_pilon	241	mitochondrion
213 | 5920_pilon	241	mitochondrion
214 | 592_pilon	241	mitochondrion
215 | 5932_pilon	241	mitochondrion
216 | 5957_pilon	312	mitochondrion
217 | 640_pilon	241	mitochondrion
218 | 644_pilon	241	mitochondrion
219 | 649_pilon	241	mitochondrion
220 | 701_pilon	241	mitochondrion
221 | 70_pilon	234	mitochondrion
222 | 733_pilon	230	mitochondrion
223 | 75_pilon	232	mitochondrion
224 | 76_pilon	241	mitochondrion
225 | 77_pilon	241	mitochondrion
226 | 78_pilon	241	mitochondrion
227 | 809_pilon	235	mitochondrion
228 | 810_pilon	241	mitochondrion
229 | 823_pilon	241	mitochondrion
230 | 850_pilon	473	mitochondrion
231 | 872_pilon	1515	mitochondrion
232 | 907_pilon	231	mitochondrion
233 | 965_pilon	241	mitochondrion
234 | 981_pilon	241	mitochondrion
235 | 


--------------------------------------------------------------------------------
/selection/codon_optimisation/codon_optimisation.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | ##Script to parse OrthoFinder and ribosomal protein BLAST results to calculate codon optimisation for core single copy genes##
  3 | 
  4 | library(seqinr)
  5 | library(coRdon)
  6 | library(tAI)
  7 | library(rstatix)
  8 | library(reshape2)
  9 | 
 10 | #Collect the command line arguments that follow the script name
 11 | args <- commandArgs(trailingOnly=TRUE)
 12 | 
 13 | #Exactly one argument is required: otherwise abort with a usage message
 14 | if (length(args) != 1)
 15 |   stop("One argument must be supplied: the OrthoFinder results directory (ending in a forward slash)", call.=FALSE)
 16 | 
 17 | dir <- args[[1]]
 18 | 
 19 | 
 20 | ##Codon optimisation##
 21 | 
 22 | #Read in orthogroups from OrthoFinder (tab-separated; the first column is used as row names)
 23 | orthogroups <- read.csv(paste0(dir, "Orthogroups/Orthogroups.tsv"), row.names=1, sep="\t", check.names=FALSE)
 24 | 
 25 | #Read in 'unassigned genes' i.e. species specific genes, to be combined with the orthogroups dataframe
 26 | unassigned <- read.csv(paste0(dir, "Orthogroups/Orthogroups_UnassignedGenes.tsv"),
 27 |                        row.names=1, sep="\t", check.names=FALSE)
 28 | #Append the unassigned genes as extra rows of the orthogroups dataframe
 29 | orthogroups <- rbind(orthogroups, unassigned)
 30 | 
 31 | #For each taxon (i.e. each column of the orthogroups dataframe)...
 32 | message("Reading in ribosomal proteins")
 33 | for (i in colnames(orthogroups)) {
 34 |   
 35 |   #Read in the list of ribosomal proteins from <taxon>.faa_ribosomes (path is relative to the working directory)
 36 |   ribosomes <- scan(paste0(i, ".faa_ribosomes"), character(), quote="")
 37 |   #Replace pipes (|) with hyphens
 38 |   ribosomes <- gsub("\\|", "-", ribosomes)
 39 |   assign(paste0(i, ".ribosomes"), ribosomes)
 40 |   #The list is now stored as the variable <taxon>.ribosomes, fetched again with get() when counting
 41 |   #Replace pipes (|) with hyphens in the orthogroup gene lists too, so both use the same ID format
 42 |   orthogroups[,i] <- gsub("\\|", "-", orthogroups[,i])
 43 |   
 44 | }
 45 | 
 46 | 
 47 | #Make dataframe of ribosomal protein counts (one row per orthogroup, one column per taxon), initialised to zero
 48 | ribosome.count <- data.frame(matrix(0, ncol=ncol(orthogroups), nrow=nrow(orthogroups)))
 49 | colnames(ribosome.count) <- colnames(orthogroups)
 50 | rownames(ribosome.count) <- rownames(orthogroups)
 51 | 
 52 | message("Counting number of ribosomal proteins in each orthogroup:")
 53 | 
 54 | #For each taxon (seq_along gives zero iterations for zero taxa, whereas 1:length() would loop over 1 and 0)...
 55 | for (i in seq_along(colnames(ribosome.count))) {
 56 |   
 57 |   #Print progress
 58 |   message((i - 1), "/", length(colnames(ribosome.count)))
 59 |   
 60 |   #Retrieve the list of ribosomal proteins stored earlier as <taxon>.ribosomes
 61 |   ribosomes <- get(paste0(colnames(ribosome.count)[i], ".ribosomes"))
 62 |   
 63 |   #For each entry in the list (i.e. protein); an empty list is skipped instead of failing...
 64 |   for (j in seq_along(ribosomes)) {
 65 |     #Find the rows (orthogroups) whose gene list for this taxon contains the protein ID;
 66 |     #fixed=TRUE matches the ID literally rather than as a regular expression
 67 |     ribosome <- grep(ribosomes[j], orthogroups[, i], fixed=TRUE)
 68 |     #NOTE(review): this is still a substring match, so an ID that is a prefix of another ID would also hit - confirm IDs cannot nest
 69 |     #Add 1 to the count of every matching orthogroup for this taxon
 70 |     ribosome.count[ribosome, i] <- ribosome.count[ribosome, i] + 1
 71 |     
 72 |   }
 73 |   
 74 | }
 75 | message(i, "/", length(colnames(ribosome.count)))
 76 | 
 77 | message("Generating codon tables and RSCU values:")
 78 | 
 79 | #For each taxon...
 80 | for (i in colnames(orthogroups)) {
 81 |   
 82 |   #Print progress (zero-based position of this taxon out of the total)
 83 |   message((which(colnames(orthogroups)  == i) - 1),
 84 |           "/", length(colnames(orthogroups)))
 85 |   
 86 |   #Read in core single-copy proteins from <taxon>_coreSC.fa (path is relative to the working directory)
 87 |   prots <- readSet(file=paste0(i, "_coreSC.fa"))
 88 |   #Make table of codon counts for core SC proteins
 89 |   codon.table <- codonTable(prots)
 90 |   #Calculate relative synonymous codon usage: all sequences are pasted into one string and split into single characters for uco()
 91 |   rscu <- uco(unlist(strsplit(paste(as.vector(prots), collapse=""), "")), index="rscu")
 92 |   #Store the results as rscu.<taxon> and codon.table.<taxon> so later steps can fetch them by name
 93 |   assign(paste0("rscu.", i), rscu)
 94 |   assign(paste0("codon.table.", i), codon.table)
 95 |   
 96 | }
 97 | message((which(colnames(orthogroups)  == i)),
 98 |         "/", length(colnames(orthogroups)))
100 | #Make logical vector (initially all FALSE) to label which core SC proteins are ribosomal
101 | test.set <- rep(FALSE, length(codon.table))
102 | ribosome.orthos <- rownames(ribosome.count)[which(rowSums(ribosome.count) > 0)] #orthogroups with a ribosomal hit in at least one taxon
103 | test.set[na.omit(match(ribosome.orthos, names(prots)[which(lengths(prots) > 0)]))] <- TRUE #positions are taken among the non-empty sequences of prots; unmatched orthogroups are dropped
104 | #NOTE(review): codon.table and prots are those of the last taxon processed above - presumably every taxon lists the same core SC orthogroups in the same order; confirm
105 | #Read in orthogroup data (hard-coded relative path to a dated .RData file)
106 | load("../../CSEP_CAZyme_prediction/orthogroup-matrices-2022-02-10.RData")
107 | #NOTE(review): orthogroups.stats.ingroup0 is not created in this script, so it presumably comes from the file loaded above - confirm
108 | #Core, single-copy CSEPs: orthogroups that are single-copy AND core AND have a non-NA CSEP entry
109 | core.SC.csepmixed <- Reduce(intersect,
110 |                             list(orthogroups.stats.ingroup0$orthogroup[which(
111 |                               orthogroups.stats.ingroup0$copy_number == "single")],
112 |                               orthogroups.stats.ingroup0$orthogroup[which(
113 |                                 orthogroups.stats.ingroup0$category == "core")],
114 |                               orthogroups.stats.ingroup0$orthogroup[which(
115 |                                 !is.na(orthogroups.stats.ingroup0$CSEP))]))
116 | 
117 | #Core, single-copy CAZymes: orthogroups that are single-copy AND core AND have a non-NA CAZyme entry
118 | core.SC.cazymemixed <- Reduce(intersect,
119 |                               list(orthogroups.stats.ingroup0$orthogroup[which(
120 |                                 orthogroups.stats.ingroup0$copy_number == "single")],
121 |                                 orthogroups.stats.ingroup0$orthogroup[which(
122 |                                   orthogroups.stats.ingroup0$category == "core")],
123 |                                 orthogroups.stats.ingroup0$orthogroup[which(
124 |                                   !is.na(orthogroups.stats.ingroup0$CAZyme))]))
125 | 
126 | #Make empty list for GC3 content (one named vector per taxon, filled in the loop below)
127 | gc3.list <- list()
128 | 
129 | #Make empty dataframe for codon optimisation (S) results (one row per taxon)
130 | s.df <- data.frame(taxon=colnames(orthogroups),
131 |                    S=NA,
132 |                    S.CSEP=NA,
133 |                    S.CAZyme=NA,
134 |                    S.other=NA)
135 | #Per-taxon CAI and ENC values; kept in the workspace only (not written by save() at the end)
136 | cai.list <- list()
137 | enc.list <- list()
138 | 
139 | message("Calculating codon statistics for each core SC orthogroup:")
140 | 
141 | #For each taxon...
142 | for (i in colnames(orthogroups)) {
143 |   
144 |   #Print progress
145 |   message((which(colnames(orthogroups) == i) - 1), "/",
146 |           length(colnames(orthogroups)))
147 |   #Fetch the codon table stored earlier as codon.table.<taxon>
148 |   codon.table <- get(paste0("codon.table.", i))
149 |   
150 |   #Calculate codon adaptation index, using the ribosomal proteins flagged in test.set as the reference subset
151 |   cai <- CAI(codon.table, subsets=list(ribosomes=test.set), stop.rm=TRUE)
152 |   
153 |   cai.list[[i]] <- cai
154 |   
155 |   #Calculate effective number of codons
156 |   enc <- ENC(codon.table)
157 |   
158 |   enc.list[[i]] <- enc
159 |   
160 |   #Read in core SC sequences; a read failure is reported as a warning and leaves the taxon without GC3 values
161 |   fasta <- tryCatch(read.fasta(file=paste0(i, "_coreSC.fa")), error=function(e) { warning("Could not read ", i, "_coreSC.fa: ", conditionMessage(e), call.=FALSE); NULL })
162 |   
163 |   #For each core SC orthogroup...
164 |   for (j in names(fasta)) {
165 |     if (length(fasta[[j]]) > 0) {
166 |       
167 |       #Calculate GC3 content (empty sequences are skipped)
168 |       gc3.list[[i]][j] <- GC3(fasta[[j]])
169 |       
170 |     }
171 |     
172 |   }
173 |   #NOTE(review): gc3.list[[i]] only holds non-empty sequences but is indexed below by codon table position - confirm the two stay aligned
174 |   #Calculate S
175 |   s.df$S[s.df$taxon == i] <- get.s(cai, enc, as.vector(gc3.list[[i]]))
176 |   #...for CSEPs
177 |   s.df$S.CSEP[s.df$taxon == i] <-
178 |     get.s(cai[match(core.SC.csepmixed, getID(codon.table))],
179 |           enc[match(core.SC.csepmixed, getID(codon.table))],
180 |           as.vector(gc3.list[[i]])[match(core.SC.csepmixed,
181 |                                          getID(codon.table))])
182 |   #...for CAZymes
183 |   s.df$S.CAZyme[s.df$taxon == i] <-
184 |     get.s(cai[match(core.SC.cazymemixed, getID(codon.table))],
185 |           enc[match(core.SC.cazymemixed, getID(codon.table))],
186 |           as.vector(gc3.list[[i]])[match(core.SC.cazymemixed,
187 |                                          getID(codon.table))])
188 |   #...for non-CSEPs/CAZymes: keep the positions not matched by either set (safe when IDs are unmatched or the sets are empty, unlike x[-match()])
189 |   other <- setdiff(seq_along(getID(codon.table)),
190 |                    match(union(core.SC.csepmixed, core.SC.cazymemixed),
191 |                          getID(codon.table)))
192 |   s.df$S.other[s.df$taxon == i] <-
193 |     get.s(cai[other],
194 |           enc[other],
195 |           as.vector(gc3.list[[i]])[other])
196 |   
197 | }
198 | message((which(colnames(orthogroups) == i)), "/",
199 |         length(colnames(orthogroups)))
200 | out.file <- paste0("codon_optimisation-", Sys.Date(), ".RData") #build the dated name once so the save and the message always agree
201 | save(list=c(ls(pattern="rscu\\."), "s.df"), file=out.file) #writes every rscu.<taxon> object plus s.df
202 | message(paste0("Results saved in ", out.file)) #only reported once the save has succeeded
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # *Fusarium* Lifestyles
  2 |  
  3 | ![Pipeline workflow](pipeline.png)
  4 | 
  5 | Bioinformatics analysis pipeline for:
  6 | >Hill et al. (2022) Lifestyle transitions in fusarioid fungi are frequent and lack clear genomic signatures. Molecular Biology and Evolution 39(4):msac085. [doi:10.1093/molbev/msac085](https://doi.org/10.1093/molbev/msac085).
  7 | 
  8 | The pipeline was written for and run on Queen Mary University of London's [Apocrita HPC facility](http://doi.org/10.5281/zenodo.438045) which uses the Univa Grid Engine batch-queue system. This means that many of the bash scripts (`.sh` file endings) specify core allocation, run times and memory usage allocation that may need to be adapted for different platforms.
  9 | 
 10 | Associated data files: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6353640.svg)](https://doi.org/10.5281/zenodo.6353640)
 11 | 
 12 | 
 13 | 
 14 | ---
 15 | 
 16 | ## 1 Assembly
 17 | 
 18 | `cd assembly`
 19 | 
 20 | ### 1.1 Read quality control
 21 | 
 22 | `cd assembly/reads`
 23 | 
 24 | Requires raw `fastq.gz` paired-end reads in this directory as well as the `TruSeq3-PE.fa` file with adapter sequences downloaded from [here](https://github.com/timflutre/trimmomatic/blob/master/adapters/TruSeq3-PE.fa) (for Illumina NovaSeq 6000 151bp paired-end reads).
 25 | 
 26 | 1. `qsub trimmomatic.sh` - trims raw reads using [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic).
 27 | 2. `qsub fastqc.sh` - after trimming, checks read quality with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
 28 | 
 29 | ### 1.2 *De novo* genome assembly
 30 | 
 31 | `cd assembly/denovo_assembly`
 32 | 
 33 | 1. `./submit_assembly.sh` - makes new directory and submits job scripts for each assembly tool - `abyss.sh` ([ABySS](https://github.com/bcgsc/abyss)), `megahit.sh` ([MEGAHIT](https://github.com/voutcn/megahit)) and `spades.sh` ([SPAdes](https://github.com/ablab/spades)).
 34 | 2. `./abyss_comp.sh` - compares the assembly stats to choose 'best' kmer size for ABySS (must be done after `abyss.sh` has finished for all kmer sizes and strains).
 35 | 
 36 | ### 1.3 Polishing
 37 | 
 38 | `cd assembly/polishing`
 39 | 
 40 | 1. `qsub polish.sh` - for each strain and assembly tool, maps raw reads to assembly and calculates mapping statistics with [BWA-MEM](https://github.com/lh3/bwa) and [SAMtools](http://www.htslib.org/) and polishes the assembly with [Pilon](https://github.com/broadinstitute/pilon). Also removes sequences <200bp using [Seqtk](https://github.com/lh3/seqtk) for NCBI compliance.
 41 | 
 42 | After completing [1.4 Assessment](https://github.com/Rowena-h/FusariumLifestyles/tree/main/assembly/assessment) and uploading to NCBI:
 43 | 
 44 | 2. `./ncbi_filter.sh` - removes sequences identified as mitochondrial or duplicates by NCBI (listed in files saved as `duplicates_*` and `mito_remove_*` for each strain) and trims sequences identified as having mitochondrial contaminants (listed in files saved as `mito_trim_*.bed`) using [bedtools](https://bedtools.readthedocs.io/en/latest/).
 45 | 
 46 | ### 1.4 Assessment
 47 | 
 48 | `cd assembly/assessment`
 49 | 
 50 | 1. `./submit_assessment.sh` - submits `quast.sh` ([QUAST](https://github.com/ablab/quast)), `busco.sh` ([BUSCO](https://busco.ezlab.org/)) and `blast.sh` ([BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi)) scripts for assembly quality statistics. `busco.sh` requires the Hypocreales BUSCO dataset downloaded from [here](https://busco-data.ezlab.org/v4/data/lineages/).
 51 | 2. `qsub blobtools.sh` - submits `blobtools.sh` to run [BlobTools](https://github.com/DRL/blobtools) (must be done after `blast.sh` has finished for all strains).
 52 | 
 53 | ---
 54 | 
 55 | ## 2 Annotation
 56 | 
 57 | `cd annotation`
 58 | 
 59 | ### 2.1 Repeatmasking
 60 | 
 61 | `cd annotation/repeat_masking`
 62 | 
 63 | 1. `qsub repeatmodeler.sh` - makes custom repeat library for each strain using [RepeatModeler](https://www.repeatmasker.org/RepeatModeler/).
 64 | 2. `qsub repeatmasker.sh` - uses the custom repeat libraries to softmask assemblies using [RepeatMasker](https://www.repeatmasker.org/RepeatMasker/).
 65 | 
 66 | ### 2.2 MAKER pipeline
 67 | 
 68 | `cd annotation/maker`
 69 | 
 70 | Informed by [this](https://gist.github.com/darencard/bb1001ac1532dd4225b030cf0cd61ce2) tutorial. Requires ESTs and proteins from [Fusoxy1](https://mycocosm.jgi.doe.gov/Fusoxy1/Fusoxy1.home.html) and [Fuseq1](https://mycocosm.jgi.doe.gov/Fuseq1/Fuseq1.home.html) ([Mesny et al. 2021](https://doi.org/10.1038/s41467-021-27479-y)) downloaded from Mycocosm in this directory.
 71 | 
 72 | #### Round 1
 73 | 
 74 | `cd annotation/maker/round1`
 75 | 
 76 | `qsub maker.sh` - submits first run of [MAKER](http://www.yandell-lab.org/software/maker.html) using ESTs and proteins, as indicated in `.ctl` files.
 77 | 
 78 | #### Round 2
 79 | 
 80 | `cd annotation/maker/round2`
 81 | 
 82 | 1. `qsub training_snap/snap.sh` - trains [SNAP](https://github.com/KorfLab/SNAP) using gene models from the first MAKER round.
 83 | 2. `qsub maker2.sh` - submits second run of MAKER using trained SNAP (as indicated in `.ctl` files).
 84 | 
 85 | #### Round 3
 86 | 
 87 | `cd annotation/maker/round3`
 88 | 
 89 | 1. `qsub training_snap2/snap2.sh` - trains SNAP again using gene models from the second MAKER round.
 90 | 2. `qsub maker3.sh` - submits third run of MAKER using second trained SNAP (as indicated in `.ctl` files).
 91 | 3. `qsub rename.sh` - after obtaining unique locus tags from e.g. NCBI (see `locus_tags.txt`), renames IDs in gff and fasta files.
 92 | 4. `qsub gag.sh` - runs [GAG](https://github.com/genomeannotation/GAG/) to remove introns <10bp, remove terminal Ns and correct start and stop codons in gff file for NCBI compliance.
 93 | 
 94 | ---
 95 | 
 96 | ## 3 Orthology inference
 97 | 
 98 | `cd orthology_inference`
 99 | 
100 | 1. `./submit_protein_download.sh` - submits `ncbi_ftp_links.r` and `protein_download.sh` to download the predicted protein sets of *Fusarium* strains from NCBI.
101 | 2. `qsub orthofinder.sh` - submits orthology inference using [OrthoFinder](https://github.com/davidemms/OrthoFinder).
102 | 
103 | ---	
104 | 
105 | ## 4 Phylogenomics
106 | 
107 | `cd phylogenomics`
108 | 
109 | 1. `./submit_alignment.sh` - submits `alignment.sh` for alignment of single copy orthogroups from OrthoFinder with [MAFFT](https://mafft.cbrc.jp/alignment/software/) followed by trimming with [BMGE](https://bmcecolevol.biomedcentral.com/articles/10.1186/1471-2148-10-210) and [trimAl](http://trimal.cgenomics.org/).
110 | 2. `./concat.sh` - concatenate single copy orthogroup alignments and prepare partition files using [AMAS](https://github.com/marekborowiec/AMAS).
111 | 3. `./submit_modeltestng.sh` - submits `modeltest-ng/modeltestng.sh` to run [ModelTest-NG](https://github.com/ddarriba/modeltest) on all single copy orthogroups (in computationally tractable chunks).
112 | 4. `./submit_speciestrees_concatenation.sh` - submits concatenation-based species tree methods - `species_tree/raxmlng.sh` ([RAxML-NG](https://github.com/amkozlov/raxml-ng)) and `species_tree/iqtree.sh` ([IQ-TREE](https://github.com/iqtree/iqtree2)).
113 | 5. `./submit_RAxML-NG_genetrees.sh` - submits `RAxMLNG_genetrees.sh` to run RAxML-NG for individual gene trees.
114 | 6. `./submit_speciestrees_coalescent.sh` - submits coalescent-based species tree methods - `species_tree/astral.sh` ([ASTRAL-III](https://github.com/smirarab/ASTRAL)) and `species_tree/astral-pro.sh` ([ASTRAL-Pro](https://github.com/chaoszhang/A-pro)) using gene trees.
115 | 
116 | ---
117 | 
118 | ## 5 Divergence time estimation
119 | 
120 | `cd divergence_time_estimation`
121 | 
122 | ### 5.1 SortaDate
123 | 
124 | `cd divergence_time_estimation/sortadata`
125 | 
126 | `qsub sortadate` - reroots the gene trees and the RAxML-NG species tree and runs them through [SortaDate](https://github.com/FePhyFoFum/SortaDate) to filter for the top ten 'clock-like' genes.
127 | 
128 | ### 5.2 MCMCTree
129 | 
130 | `cd divergence_time_estimation/mcmctree`
131 | 
132 | 1. `qsub mcmctree_dating_step1.sh` - adds secondary time calibrations to species tree nodes and submits first step of approximate likelihood divergence time estimation with protein data using MCMCTree from [PAML](http://abacus.gene.ucl.ac.uk/software/paml.html) (see [tutorial](http://abacus.gene.ucl.ac.uk/software/MCMCtree.Tutorials.pdf)).
133 | 2. `Rscript estimate_rate.r` - estimates the scaling parameter for the substitution rate prior to be added to `mcmctree_step2_independent.ctl` and `mcmctree_step2_correlated.ctl`.
134 | 3. `./submit_mcmctree_dating_step2.sh` - submits `mcmctree_independent.sh` and `mcmctree_correlated.sh` for second step of approximate likelihood estimation for both independent and correlated rates relaxed clock models.
135 | 
136 | ---
137 | 
138 | ## 6 CSEP & CAZyme prediction
139 | 
140 | `cd CSEP_CAZyme_prediction`
141 | 
142 | 1. `./submit_CSEPprediction.sh` - submits all programmes in the CSEP prediction pipeline - `signalp/signalp.sh` ([SignalP](https://services.healthtech.dtu.dk/service.php?SignalP-5.0)), `targetp/targetp.sh` ([TargetP](https://services.healthtech.dtu.dk/service.php?TargetP-2.0)), `phobius/phobius.sh` ([Phobius](https://phobius.sbc.su.se/instructions.html)), `tmhmm/tmhmm.sh` ([TMHMM](https://services.healthtech.dtu.dk/service.php?TMHMM-2.0)), `prosite/ps_scan.sh` ([ps_scan](https://prosite.expasy.org/scanprosite/)), `nucpred/nucpred.sh` ([NucPred](https://nucpred.bioinfo.se/nucpred/)), `predgpi/predgpi.sh` which in turn submits `predgpi/PredGPI.r` to use the R package [ragp](https://rdrr.io/github/missuse/ragp/man/get_pred_gpi.html) ([PredGPI](http://gpcr.biocomp.unibo.it/predgpi/)) and  `effectorp/effectorp.sh` ([EffectorP](https://github.com/JanaSperschneider/EffectorP-3.0)).
143 | 2. `./submit_CSEPfilter.sh` - submits `CSEPfilter` to produce lists of CSEPs from all programme results.
144 | 3. `./submit_CSEPblast.sh` - submits `blastp/blastp.sh` to run a BLAST search of CSEPs against the [PHI-base database](http://www.phi-base.org/) (requires `phi-base_current.csv` and `phi-base_current.fas` to be downloaded from [here](http://www.phi-base.org/downloadLink.htm) into the `blastp` directory).
145 | 4. `./submit_CAZymeprediction.sh` - submits `run_dbcan/run_dbcan.sh` to run [run_dbcan](https://github.com/linnabrown/run_dbcan). 
146 | 5. `qsub submit_orthogroupparsing.sh` - submits `orthogroup_parser.r` to make abundance matrices of orthogroups for all strains and categorise whether they are CSEPs/CAZymes and core/accessory/specific.
147 | 
148 | ---
149 | 
150 | ## 7 Lifestyle comparison
151 | 
152 | `cd lifestyle_comparison`
153 | 
154 | `./submit_lifestyletest.sh` - submits `lifestyle_v_phylogeny.r` to prepare input file for `lifestyle-test.sh` which runs PERMANOVA-based lifestyle test on orthogroup and CSEP presence-absence matrices; `run_edited.py` is modified from the original script `run.py` by [Mesny & Vannier](https://github.com/fantin-mesny/Effect-Of-Biological-Categories-On-Genomes-Composition).
155 | 
156 | ---
157 | 
158 | ## 8 Selection
159 | 
160 | `cd selection`
161 | 
162 | ### 8.1 dN/dS analysis
163 | 
164 | 1. `qsub gbff_files/ncbi_gbff_download.sh` - downloads GBFF files for the strains used in this study from NCBI; also need [Ilysp1 transcripts downloaded from Mycocosm](https://mycocosm.jgi.doe.gov/Ilysp1/Ilysp1.home.html) in `gbff_files` directory.
165 | 2. `./submit_pal2nal.sh` - submits `pal2nal.sh` script to pull corresponding nucleotides for all proteins using `pull_nucleotides.py` and prepares codon alignments using [PAL2NAL](http://www.bork.embl.de/pal2nal/).
166 | 3. `./submit_hyphy.sh` - prepares file inputs and submits scripts for [HyPhy](https://github.com/veg/hyphy) dN/dS methods - `hyphy/BUSTED.sh`, `hyphy/aBSREL.sh` and `hyphy/Contrast-FEL.sh`.
167 | 
168 | ### 8.2 Codon optimisation
169 | 
170 | `cd selection/codon_optimisation`
171 | 
172 | 1. `./pull_ribosomes.sh` - extracts ribosomal protein encoding genes from [Fusgr1](https://mycocosm.jgi.doe.gov/Fusgr1/Fusgr1.home.html) and submits `blast.sh` to run BLAST search against all strains in this study.
173 | 2. `./submit_codon_optimisation.sh` - submits `codon_optimisation.r` script to estimate various codon usage bias statistics and codon optimisation values.
174 | 
175 | ---
176 | 
177 | ## 9 Statistics and data visualisation
178 | 
179 | `Rscript stats_and_plots.r`
180 | 
181 | ---
182 | 
183 | ## Citation
184 | 
185 | >Hill et al. (2022) Lifestyle transitions in fusarioid fungi are frequent and lack clear genomic signatures. Molecular Biology and Evolution 39(4):msac085. [doi:10.1093/molbev/msac085](https://doi.org/10.1093/molbev/msac085).
186 | 


--------------------------------------------------------------------------------