├── reproducibility └── All-samples │ ├── sylph │ ├── prefetch_array_job.sh │ ├── sylph_prefetch_array.sbatch │ ├── sylph_array.sbatch │ ├── README.md │ └── prefetch_to_sylph_batch.py │ ├── checkm2 │ ├── README.md │ ├── checkm2.sbatch │ └── checkm2_batch.py │ └── assembly-stats │ ├── README.md │ └── assembly_stats_batch.py ├── LICENSE ├── README.md ├── meetings └── 2024 │ └── zoom_20240322.md └── guide.md /reproducibility/All-samples/sylph/prefetch_array_job.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | set -eu 4 | 5 | split_files_dir=/FIX_PATH/ 6 | root_out=/FIX_PATH/ 7 | 8 | index=$SLURM_ARRAY_TASK_ID 9 | outdir=$root_out/$index 10 | ids_file=$split_files_dir/$index 11 | 12 | echo "Running prefetch on batch number $index. Outdir: $outdir" 13 | 14 | if [ ! -d $outdir ] 15 | then 16 | mkdir $outdir 17 | fi 18 | 19 | cd $outdir 20 | prefetch --option-file $ids_file &> prefetch.stdouterr 21 | 22 | -------------------------------------------------------------------------------- /reproducibility/All-samples/checkm2/README.md: -------------------------------------------------------------------------------- 1 | # checkm2 2 | 3 | Version of checkm2: 1.0.1. The singularity container used is here: 4 | https://osf.io/7vpy3 5 | 6 | Checkm2 database: uniref100.KO.1.dmnd. A copy of this is here: 7 | https://osf.io/x5vtj 8 | 9 | 10 | This was all run on the EBI SLURM compute cluster. Some paths 11 | were hard-coded. You will need to change them to run on your own 12 | data. Look for `FIX_PATH` in the python script. 13 | 14 | A SLURM job array was used, which was submitted using 15 | ``` 16 | sbatch checkm2.sbatch 17 | ``` 18 | Each element of the array ran a batch of samples in serial using the 19 | script `checkm2_batch.py`. See inside that script for notes on what would 20 | need changing if you want to run this script yourself. 21 | 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 iqbal-lab-org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /reproducibility/All-samples/checkm2/checkm2.sbatch: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name=checkm2 3 | #SBATCH --output=/FIX_PATH/Logs/o/%a.o 4 | #SBATCH --error=/FIX_ATH/Logs/e/%a.e 5 | #SBATCH --mem=2867M 6 | #SBATCH --time=1800 7 | #SBATCH --cpus-per-task=1 8 | #SBATCH --signal=B:SIGUSR1@60 9 | #SBATCH --array=1-38657%1000 10 | 11 | 12 | start_time=$(date +"%Y-%m-%dT%H:%M:%S") 13 | start_seconds=$(date +%s) 14 | 15 | end_time=RUNNING 16 | exit_code=UNKNOWN 17 | 18 | gather_stats() { 19 | # unset the trap otherwise this function can get called more than once 20 | trap - EXIT SIGUSR1 21 | end_time=$(date +"%Y-%m-%dT%H:%M:%S") 22 | end_seconds=$(date +%s) 23 | wall_clock_s=$(($end_seconds-$start_seconds)) 24 | echo -e "SLURM_STATS_BEGIN 25 | SLURM_STATS job_id $SLURM_JOB_ID 26 | SLURM_STATS command /FIX_PATH/run_checkm2_batch.py 27 | SLURM_STATS requested_ram 2.8 28 | SLURM_STATS requested_time 1800 29 | SLURM_STATS job_name checkm2 30 | SLURM_STATS start_time $start_time 31 | SLURM_STATS end_time $end_time 32 | SLURM_STATS wall_clock_s $wall_clock_s 33 | SLURM_STATS exit_code $exit_code" 34 | slurmzy jobinfo $SLURM_JOB_ID | awk '{print "SLURM_STATS_JOBINFO "$0}' 35 | 36 | if [ $exit_code = "UNKNOWN" ] 37 | then 38 | exit 1 39 | else 40 | exit $exit_code 41 | fi 42 | } 43 | 44 | trap gather_stats EXIT SIGUSR1 45 | 46 | /usr/bin/time -a -o /FIX_PATH/o/$SLURM_ARRAY_TASK_ID.o -v $SHELL -c "$(cat << 'EOF' 47 | /FIX_PATH/checkm2_batch.py 48 | EOF 49 | )" 50 | 51 | exit_code=$? 52 | gather_stats 53 | 54 | -------------------------------------------------------------------------------- /reproducibility/All-samples/sylph/sylph_prefetch_array.sbatch: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name=prefetch 3 | #SBATCH --output=/FIX_PATH/o/%a.o 4 | #SBATCH --error=/FIX_PATH/e/%a.e 5 | #SBATCH --mem=1G 6 | #SBATCH --time=300 7 | #SBATCH --cpus-per-task=1 8 | #SBATCH --signal=B:SIGUSR1@60 9 | #SBATCH --array=1-1000%40 10 | 11 | start_time=$(date +"%Y-%m-%dT%H:%M:%S") 12 | start_seconds=$(date +%s) 13 | 14 | end_time=RUNNING 15 | exit_code=UNKNOWN 16 | 17 | gather_stats() { 18 | # unset the trap otherwise this function can get called more than once 19 | trap - EXIT SIGUSR1 20 | end_time=$(date +"%Y-%m-%dT%H:%M:%S") 21 | end_seconds=$(date +%s) 22 | wall_clock_s=$(($end_seconds-$start_seconds)) 23 | echo -e "SLURM_STATS_BEGIN 24 | SLURM_STATS job_id $SLURM_JOB_ID 25 | SLURM_STATS command /FIX_PATH/prefetch_array_job.sh 26 | SLURM_STATS requested_ram 1.0 27 | SLURM_STATS requested_time 240 28 | SLURM_STATS job_name prefetch 29 | SLURM_STATS start_time $start_time 30 | SLURM_STATS end_time $end_time 31 | SLURM_STATS wall_clock_s $wall_clock_s 32 | SLURM_STATS exit_code $exit_code" 33 | slurmzy jobinfo $SLURM_JOB_ID | awk '{print "SLURM_STATS_JOBINFO "$0}' 34 | 35 | exit 0 36 | 37 | #if [ $exit_code = "UNKNOWN" ] 38 | #then 39 | # exit 1 40 | #else 41 | # exit $exit_code 42 | #fi 43 | } 44 | 45 | trap gather_stats EXIT SIGUSR1 46 | 47 | /usr/bin/time -a -o /FIX_PATH/o/$SLURM_ARRAY_TASK_ID.o -v $SHELL -c "$(cat << 'EOF' 48 | /FIX_PATH/prefetch_array_job.sh 49 | EOF 50 | )" 51 | 52 | exit_code=$? 
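# Note (added commentary, not part of the original script): the exit status of
# the timed batch command above is captured in $exit_code, and gather_stats is
# then called explicitly. gather_stats unsets the EXIT/SIGUSR1 trap first, so it
# runs only once, whether the job finishes normally, exits early, or is
# signalled by SLURM shortly before hitting its time limit.
# In this prefetch script the exit-code check inside gather_stats is commented
# out and the function always exits 0, so a failed prefetch does not fail the
# array element; failed downloads are instead detected later by
# prefetch_to_sylph_batch.py, which parses prefetch.stdouterr.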
53 | gather_stats 54 | 55 | -------------------------------------------------------------------------------- /reproducibility/All-samples/sylph/sylph_array.sbatch: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name=sylph 3 | #SBATCH --output=/FIX_PATH/o/%a.o 4 | #SBATCH --error=/FIX_PATH/e/%a.e 5 | #SBATCH --mem=13G 6 | #SBATCH --time=2000 7 | #SBATCH --cpus-per-task=1 8 | #SBATCH --signal=B:SIGUSR1@60 9 | #SBATCH --array=1-1000%500 10 | #SBATCH --dependency=aftercorr: 11 | 12 | start_time=$(date +"%Y-%m-%dT%H:%M:%S") 13 | start_seconds=$(date +%s) 14 | 15 | end_time=RUNNING 16 | exit_code=UNKNOWN 17 | 18 | gather_stats() { 19 | # unset the trap otherwise this function can get called more than once 20 | trap - EXIT SIGUSR1 21 | end_time=$(date +"%Y-%m-%dT%H:%M:%S") 22 | end_seconds=$(date +%s) 23 | wall_clock_s=$(($end_seconds-$start_seconds)) 24 | echo -e "SLURM_STATS_BEGIN 25 | SLURM_STATS job_id $SLURM_JOB_ID 26 | SLURM_STATS command /FIX_PATH/prefetch_to_sylph_batch.py 27 | SLURM_STATS requested_ram 13.0 28 | SLURM_STATS requested_time 2000 29 | SLURM_STATS job_name sylph 30 | SLURM_STATS start_time $start_time 31 | SLURM_STATS end_time $end_time 32 | SLURM_STATS wall_clock_s $wall_clock_s 33 | SLURM_STATS exit_code $exit_code" 34 | slurmzy jobinfo $SLURM_JOB_ID | awk '{print "SLURM_STATS_JOBINFO "$0}' 35 | 36 | if [ $exit_code = "UNKNOWN" ] 37 | then 38 | exit 1 39 | else 40 | exit $exit_code 41 | fi 42 | } 43 | 44 | trap gather_stats EXIT SIGUSR1 45 | 46 | /usr/bin/time -a -o /FIX_PATH/$SLURM_ARRAY_TASK_ID.o -v $SHELL -c "$(cat << 'EOF' 47 | /FIX_PATH/prefetch_to_sylph_batch.py 48 | EOF 49 | )" 50 | 51 | exit_code=$? 52 | gather_stats 53 | 54 | -------------------------------------------------------------------------------- /reproducibility/All-samples/sylph/README.md: -------------------------------------------------------------------------------- 1 | # Sylph 2 | 3 | Version sylph used: 0.5.1. 4 | 5 | The pre-built GTDB-R214 database was used: https://storage.googleapis.com/sylph-stuff/v0.3-c200-gtdb-r214.syldb 6 | 7 | This was run on the EBI SLURM cluster. 8 | Do not expect these scripts to work on your cluster without 9 | editing. For example: they use hard-coded paths, which have here 10 | been replaced with `FIX_PATH` 11 | 12 | This was run in two stages: 13 | 1. Download the reads using prefetch 14 | 2. Run Sylph on the reads (and delete the reads afterwards). 15 | 16 | Each stage was run as a job array. Each element of the job array processed 17 | a batch of sequencing runs. Each sylph job was pointed to the output of a 18 | prefetch batch. In other words, each prefetch array element N downloads batch 19 | N of reads, and then sylph array element N runs sylph on that batch of reads, 20 | then deletes the reads. 21 | 22 | The batches of samples need to be defined by having a directory 23 | of files called 1, 2, 3, 4, etc. Each file should have one run accession 24 | per line. File number N corresponds to job array element N. 25 | 26 | These arrays were arranged by setting the sylph array to depend on the 27 | prefetch array, so that sylph job N would start when prefetch job N finished. 28 | 29 | The prefetch job array was submitted with `sylph_prefetch_array.sbatch`. 30 | Each element of the array runs the script `prefetch_array_job.sh`, which 31 | uses prefetch to download the batch of reads. 32 | 33 | The sylph job array was submitted with `sylph_array.sbatch`. 
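As a rough sketch (not the exact commands used here), the numbered batch files and the chained submission could be set up as follows. `run_accessions.txt` is a hypothetical file with one run accession per line, and the job ID for the `aftercorr` dependency can either be filled into the `#SBATCH --dependency` line of `sylph_array.sbatch` or passed on the command line as below:
```
# Split accessions round-robin into files named 1..1000, one accession per
# line (gawk recommended, since ~1000 output files are kept open at once).
awk '{ print > (((NR - 1) % 1000) + 1) }' run_accessions.txt

# Submit the prefetch array, then make sylph array task N wait for
# prefetch array task N to finish successfully.
prefetch_job=$(sbatch --parsable sylph_prefetch_array.sbatch)
sbatch --dependency=aftercorr:"$prefetch_job" sylph_array.sbatch
```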
34 | Each array element runs the script `prefetch_to_sylph_batch.py`, which 35 | runs sylph on each read run, then deletes the reads. 36 | -------------------------------------------------------------------------------- /reproducibility/All-samples/assembly-stats/README.md: -------------------------------------------------------------------------------- 1 | # Assembly stats 2 | 3 | The final output file of assembly statistics is available here: 4 | https://osf.io/h7g42. 5 | 6 | Version of `assembly-stats` used: git commit 7bdb58b from 7 | https://github.com/sanger-pathogens/assembly-stats. 8 | 9 | The main script is `assembly_stats_batch.py`, which runs `assembly-stats` 10 | on a batch of assemblies, outputting a single TSV file of results. 11 | 12 | The whole process was run as follows on the EBI compute slurm cluster. 13 | If you want to run yourself, then you will need to fix the hard-coded 14 | paths. Look for `FIX_ME` in the python script. 15 | 16 | The script has a hard-coded path to a TSV file that looked like this: 17 | ``` 18 | $ head -n3 sample_path.tsv 19 | Sample Path 20 | SAMD00075885 1300k/batch_68/ilmn-SAMD00075885_contigs.fa.gz 21 | SAMN16231665 1300k/batch_68/ilmn-SAMN16231665_contigs.fa.gz 22 | ``` 23 | It had 1943494 lines. 24 | 25 | The SLURM jobs were submitted with: 26 | 27 | ``` 28 | mkdir Splits 29 | seq 1 10000 1943494 | awk '{s="slurmzy run 0.2 Splits/stats."$1" ./assembly_stats_batch.py "$1" "($1+9999)" Splits/stats."$1".tsv"; print s; system(s)}' 30 | ``` 31 | 32 | Note: `surmzy` can be obtained from https://github.com/martinghunt/slurmzy. 33 | It's a wrapper for running `srun`. 34 | 35 | There was an off-by-one error meaning that the first sample needed to be 36 | run manually. And then gather all the results into one file: 37 | 38 | ``` 39 | assembly-stats -t /FIX_PATH/ilmn-SAMD00075885_contigs.fa.gz | awk 'NR>1 {OFS="\t"; $1="SAMD00075885"} 1' | sed 's/filename/sample/' > assembly-stats.tsv 40 | for x in `seq 1 10000 1943494 `; do awk 'NR>1' Splits/stats.$x.tsv >> assembly-stats.tsv ; done 41 | ``` 42 | -------------------------------------------------------------------------------- /reproducibility/All-samples/assembly-stats/assembly_stats_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import csv 5 | import os 6 | import sys 7 | import subprocess 8 | 9 | 10 | STATS_COLS = [ 11 | "sample", 12 | "total_length", 13 | "number", 14 | "mean_length", 15 | "longest", 16 | "shortest", 17 | "N_count", 18 | "Gaps", 19 | "N50", 20 | "N50n", 21 | "N70", 22 | "N70n", 23 | "N90", 24 | "N90n", 25 | ] 26 | 27 | 28 | def parse_stats_stdout(p, sample, filename): 29 | fields = p.stdout.strip().split("\t") 30 | assert fields[0] == filename 31 | assert len(fields) == len(STATS_COLS) 32 | fields[0] = sample 33 | return "\t".join(fields) 34 | 35 | 36 | RELEASE_ROOT = "FIX_PATH" 37 | TSV = f"FIX_PATH/sample_path.tsv" 38 | 39 | parser = argparse.ArgumentParser( 40 | description="description", 41 | usage="%(prog)s ", 42 | ) 43 | parser.add_argument("start", type=int, help="start line of sample_path.tsv file") 44 | parser.add_argument("end", type=int, help="end line of sample_path.tsv file") 45 | parser.add_argument("outfile", help="output file") 46 | 47 | options = parser.parse_args() 48 | 49 | with open(TSV) as f_in, open(options.outfile, "w") as f_out: 50 | print(*STATS_COLS, sep="\t", file=f_out) 51 | 52 | for i, d in enumerate(csv.DictReader(f_in, delimiter="\t")): 53 | if i < 
options.start: 54 | continue 55 | if i > options.end: 56 | break 57 | 58 | fa = os.path.join(RELEASE_ROOT, d["Path"]) 59 | p = subprocess.run(["assembly-stats", "-u", fa], stdout=subprocess.PIPE, universal_newlines=True) 60 | if p.returncode != 0: 61 | print("Error", d, file=sys.stderr) 62 | continue 63 | 64 | try: 65 | to_print = parse_stats_stdout(p, d["Sample"], fa) 66 | except: 67 | print("Error", d, file=sys.stderr) 68 | continue 69 | 70 | print(to_print, file=f_out) 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AllTheBacteria 2 | All WGS isolate bacterial INSDC data to June 2023, uniformly assembled, QC-ed, annotated, searchable. 3 | 4 | Follow-up to Grace Blackwell's 661k dataset (which covered everything to Nov 2018). 5 | 6 | Preprint: https://doi.org/10.1101/2024.03.08.584059 7 | 8 | ## Latest Release 0.2 9 | The data are here: https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/ 10 | 11 | Changes from release 0.1 are documented in detail in the release 0.2 readme: https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/README.md 12 | 13 | Summary of changes: 14 | * Approximately 12k contigs were removed, due to matching the human genome 15 | * Reran assembly stats and checkm2 on the changed assemblies 16 | * The "high quality" dataset changed slightly because of the assemblies changing 17 | * Species call for each sample tidied up 18 | * Added phylign indexes for searching/aligning query sequences (see https://github.com/AllTheBacteria/Phylign/blob/main/README.md) 19 | * Updated sketchlib indexes 20 | * Added file of md5sum of all files in the release 21 | 22 | 23 | ## Release 0.1 24 | Full details here: https://www.biorxiv.org/content/10.1101/2024.03.08.584059v1 25 | The data were here: https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.1/ 26 | but have been deleted due to human contamination. Please use release 0.2 instead. 27 | 28 | The first release contains: 29 | 1. About 2 million Shovill assemblies, identified by ENA sample id 30 | 2. Summary of assembly statistics 31 | 3. File(s) summarising taxonomic and contamination statistics based on sylph taxonomic abundance estimation (GTDB r214), and CheckM2 32 | 4. A filelist specifying "high quality" assemblies 33 | 5. A README describing all this. 34 | The assembly workflow is on GitHub, but we don't have a distributable container for it yet. 35 | 36 | ## Further releases 37 | Future releases will include: 38 | 1. More search indexes to come. 39 | 2. Annotation (bakta at least) 40 | 3. Pan-genomes and harmonised gene names within species (for the top N species) for representative genomes chosen using poppunk clusters and QC metrics. 41 | 4. MLST, various species-specific typing, AMR 42 | 43 | 44 | ## Distribution 45 | Data will be distributed at least by: 46 | 1. EBI FTP, which is simultaneously accessible by Globus and Aspera. 47 | 2. Zenodo would be good to add 48 | 49 | 50 | ## Rules of Engagement with the data 51 | Once Release 0.1 is out, anyone/everyone is welcome to use the data and publish with it. There is no expectation that the people who made the release/data should be co-authors on these publications, but we would appreciate citation of the preprint (https://www.biorxiv.org/content/10.1101/2024.03.08.584059v1). 52 | 53 | ## Rules of Involvement with the project 54 | All welcome, contact us via GitHub, Slack or the monthly zoom calls.
Anyone who contributes to the project, through analysis, project management or any other means, ought to be an author of the paper. 55 | 56 | ## Next zoom calls 57 | 22nd March 2024, 9am and 4pm GMT 58 | 59 | 60 | 61 | ## FAQ 62 | 1. What happens if two people want to run their competing methods (bad example, prokka versus bakta or one AMR tool versus another). First, anyone can do anything they like, but to get into the releases, we should discuss on a zoom call and make a decision. We shall tend towards allowing multiple analyses (eg we intend to run bakta on everything but if someone wants to run prokka too, we should we ok to add that to the release too). However, if it starts to get silly with people wanting 4 tools each run with 3 parameters, then I think we get a lot stricter - this compute isn't free (in terms of carbon, or money), so we'll make a decision and do something limited. 63 | 64 | -------------------------------------------------------------------------------- /meetings/2024/zoom_20240322.md: -------------------------------------------------------------------------------- 1 | **Meeting notes from Zoom call at 9am GMT 22nd March 2024. Second meeting at 3pm is further down** 2 | 3 | Present: 4 | Zam Iqbal, Martin Hunt, Boas van der Putten, Jane Hawkey, Laura Carroll, Liz Batty, George Bouras, Gerry Tonkin- 5 | 6 | JaneH and Dan Anderson - will split running of AMRFinder+ 7 | Laura Carroll - already running BGC from GECCO. plus GTDB Bacillus_A BTyper A 8 | Gerry volunteers E coli ST 9 | 10 | Jane suggests use pathogen.watch MLST (they have a docker container, Martin will heklp turning it into singularity) 11 | 12 | George Bouras will do prophage annotation, needs to wait for bakta, then some harmonising so consistent with bakta. 13 | I've pointed him at Wendy Figueroa and Daniel Cazares to combine forces. 14 | 15 | Boas will have a think about Strep pyogenes 16 | 17 | Matthew Croxen has offered to run any of 18 | 19 | Streptococcus pneumoniae Pneumokitty, Streptococcus agalactiae GBS-SBG, Streptococcus pyogenes Emmtyper, Haemophilus influenzae Hicap, Escherichia coli Ec_typer, Shigella spp. Shigatyper 20 | [Jane Hawkey says there is a more recent tool from FX], Bordetella pertussis/parapertussis BPAgST, Salmonella enterica SISTR, Neisseria gonorrhoea NG MAST, NG STAR, Klebsiella pneumoniae (lots of "complex species here so may wish to go to genus to cast a broader net) Kleborate, Listeria monocytogenes Lissero, Neisseria meningitidis meningotype, Legionella pneumophila Legsta 21 | 22 | ZAM FORGOT to mention Lesley Hoyles (Klebsiella oxytoca-related genomes (grimontii, michiganensis, pasteurii, oxytoca), and Klebsiella ornithinolytica and Klebsiella planticola) and 23 | and Jonathan Thomas (Staphylococcus epidermidis and/or other coagulase-negative/non-aureus staphylococci) 24 | 25 | Slightly rambling discussion about ways to 26 | a) coordinate who is doing what/what is being done for each species 27 | b) Gerry had an idea of allowing people to do their own typing of things and publish their results (eg on their own github) but have that centrally "advertised"/linked on an AllTheBacteria website. 28 | 29 | Last year Nabil had plans to use a github mod/thing called Zen for project management, will see if he still has energy for this. 30 | Generally people happy with Oliver Schwengers suggestion of a directory structure constructed from sample id, with species/tool/taxid injected in there perhaps. 
31 | 32 | 33 | **Meeting notes from Zoom call at 1500 GMT 22nd March 2024.** 34 | 35 | Present: Finlay Macguire, Maria Luisa Andreani, Martin Hunt, Robert Petit, Nabil-Fareed Alikhan 36 | 37 | Finlay would like to talk to Adrian Cazares about plasmid contig identification 38 | Finlay would also like to use this data for a wider AMR database harmonisation project 39 | Rob Petit will happily test the mof-search and convert to nextflow 40 | Rob also has a lot of compute power available and will be contacting Oliver Schwengers in case he needs help with processing Bakta 41 | 42 | Bakta apparently runs AMRFinder+, so we need to make sure Oliver and Dan Anderson talk, otherwise we will run AMRFinder+ twice. 43 | 44 | We started setting up some project management on the AllTheBacteria github, Zam needs to finish this off 45 | (sorting out allowing others to contribute to it). 46 | 47 | Discussed potentially using OSF as a way to distribute the analytic products 48 | 49 | 50 | **Actions** 51 | 1. Zam, Martin, Dan are going to make a 0.2 release in the next few weeks. This will remove contigs from some assemblies that map very well to human. We will in that process remove release 0.1 assemblies (as there is human contamination there). The release will also have search indexes and a snakemake workflow for aligning sequences to the full set. 52 | 2. Zam will share the indexes and snakemake with Rob Petit for external testing prior to release. 53 | 3. Zam to sort out the project management aspect on the github and to make a (google? or in the repo?) sheet with species and volunteers and tools they will run. 54 | 4. We need a checklist of what you need to do for your contribution to be accepted. One key thing is you use precisely identical identifiers to those we have for the assemblies. No switching of . to + or whatever, and Nabil/Fin advocate for a simple csv with all the data or pointing to filenames. We should formalise this. 55 | 5. Zam needs to followup with various volunteers who contacted over email but are not on Slack, and who he forgot to contact. 56 | We don't need to sort out 4 before people start processing I suppose, but sooner is better 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /guide.md: -------------------------------------------------------------------------------- 1 | **XXX This is likely 90% correct at best. Meant as a first attempt to get it right eventually** 2 | 3 | Recently a group of scientists associated with the European Molecular Biology Laboratory released [a compilation of 1,932,812 assemblies of bacterial chromosomes](https://www.biorxiv.org/content/10.1101/2024.03.08.584059v1.full), building on earlier work by [Grace Blackwell et al](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3001421). 4 | 5 | This assembly was performed on FASTQ reads available on the European Nucleotide Archive as of May 2023. Only eubacterial (so not archaeal) data was included, and only Illumina produced reads were processed. [After I asked](https://github.com/AllTheBacteria/AllTheBacteria/issues/28), it now appears that archaeal chromosomes could now also be included in upcoming versions, which is wonderful news. 6 | 7 | It is important to note that this project is not a strict alternative to the selection of complete chromosomes that you can find in the [NCBI Reference Sequence Database](https://www.ncbi.nlm.nih.gov/refseq/). AllTheBacteria delivers contigs, not reference genomes or scaffolds. 
8 | 9 | The project very much aims to benefit from community effort, and in this spirit, I want to explain a bit what is actually in the AllTheBacteria data, and how to use it. 10 | 11 | # AllTheBacteria releases 12 | The project has a [GitHub page](https://github.com/AllTheBacteria/AllTheBacteria) through which we can find links to releases. The current release is [0.2](https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/). 13 | 14 | In there we find a bunch of directories, as follows. 15 | 16 | ## assembly/ 17 | [This directory](https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/assembly/) has a cleverly arranged set of tar.xz files, grouped by bacterial species, containing FASTA files. If you know what you are looking for, you can find the right tar.xz files. The FASTA files within these tar files have been arranged using [smart software called MiniPhy](https://github.com/karel-brinda/MiniPhy) so that they compress really well. 18 | 19 | If you are looking for a popular organism, say E. coli, you'll still have to pick from many dozens of tar files though, and see below on how the 'metadata/' directory will help you there. 20 | 21 | In addition to tar.xz files named after popular organisms, there are also 'dustbin' files containing all remaining chromosomes. 22 | 23 | It is important to note that the FASTA files contain contigs, they are not reference sequences. As an example, let's look at `escherichia_coli__01/SAMD00075771.fa`, this contains entries like: 24 | 25 | ``` 26 | >SAMD00075771.contig00001 len=256900 cov=25.5 corr=0 origname=NODE_1_length_256900_cov_25.545194_pilon sw=shovill-spades/1.1.0 date=20230709 27 | CTCCGCTATGTCCTTGACGTCATAGCCGACTGGCCGATAAACCGGGTCGGCGAACTGCTCC.... 28 | ``` 29 | We can look up SAMD00075771 over at the ENA, on [https://www.ebi.ac.uk/ena/browser/view/SAMD00075771?dataType=BIOSAMPLE](https://www.ebi.ac.uk/ena/browser/view/SAMD00075771?dataType=BIOSAMPLE) and there we learn that this data was submitted as 'ASM276431v1 assembly for Escherichia coli O26:H11'. We can also download the paired FASTQ there, plus the originally submitted assembly. 30 | 31 | ## metadata/ 32 | In here we find a whole bunch of files: 33 | 34 | * assembly-stats.tsv.gz: per sample how many contigs there are, the N50, N70, N90 35 | * checkm2.tsv.gz: CheckM2 is a tool for assessing microbial genome quality using machine learning. This file features estimates for completeness, contamination, number of detected coding sequences 36 | * ena_metadata.tsv.gz: All in one view of the many accession numbers and identifiers from ENA. This is all input related, where did the data come from 37 | * hq_set.sample_list.txt.gz: A list of samples that pass more stringent quality checks, as outlined in the AllTheBacteria preprint. 38 | * hq_set.removed_samples.tsv.gz: Samples that did not make it to the high-quality list, together with reasons why not 39 | * nucmer_human.gz: reads from samples that were identified as human in origin 40 | * sample_list.txt.gz: names of all ENA samples included, presumably the sum of hq_set.sample_list and hq_set.removed_samples above 41 | * species_calls.tsv.gz: automated majority species call per sample, plus indication if this is a 42 | high quality sample or not 43 | * sylph.tsv.gz: the Sylph program compares samples to GTDB reference genomes, and this file contains the best fits for each sample. The top call ends up in species_calls.tsv above. 
44 | * sylph.no_matches.txt.gz: files for which Sylph had no idea 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /reproducibility/All-samples/sylph/prefetch_to_sylph_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import json 5 | import os 6 | import re 7 | import subprocess 8 | 9 | success_re = re.compile(r""" '(?P.*)\.sralite' was downloaded successfully""") 10 | success_re2 = re.compile(r""" '(?P.*)\.lite' was downloaded successfully""") 11 | success_re3 = re.compile(r""" '(?P.*)' was downloaded successfully""") 12 | success_re4 = re.compile(r""" '(?P.*)\.sralite' is found locally""") 13 | success_re5 = re.compile(r""" '(?P.*)\.lite' is found locally""") 14 | success_re6 = re.compile(r""" '(?P.*)' is found locally""") 15 | failed_acc_re = re.compile(r"""err: name not found while resolving query within virtual file system module - failed to resolve accession '(?P.*)' - no data""") 16 | 17 | REGEXES = { 18 | success_re: {"success": True, "fail_reason": "NA"}, 19 | success_re2: {"success": True, "fail_reason": "NA"}, 20 | success_re3: {"success": True, "fail_reason": "NA"}, 21 | success_re4: {"success": True, "fail_reason": "NA"}, 22 | success_re5: {"success": True, "fail_reason": "NA"}, 23 | success_re6: {"success": True, "fail_reason": "NA"}, 24 | failed_acc_re: {"success": False, "fail_reason": "failed to resolve accession"}, 25 | } 26 | 27 | 28 | SYLPH_DB = "/FIX_PATH/v0.3-c200-gtdb-r214.syldb" 29 | OUT_ROOT = "/FIX_PATH/" 30 | SPLIT_ROOT = "/FIX_PATH/" 31 | 32 | def parse_stdouterr_file(infile): 33 | results = {} 34 | with open(prefetch_e) as f: 35 | for line in f: 36 | for regex, d in REGEXES.items(): 37 | match = regex.search(line) 38 | if match is not None: 39 | print("MATCH!", line) 40 | run = match.group("run") 41 | results[run] = [d["success"], d["fail_reason"]] 42 | break 43 | return results 44 | 45 | 46 | 47 | def process_one_run(indir, run_id): 48 | command = f"fasterq-dump --fasta --split-3 {run_id}" 49 | print("command", command) 50 | 51 | try: 52 | print("to_fasta", run_id, command, flush=True) 53 | subprocess.check_output(command, shell=True, cwd=indir) 54 | except: 55 | return False, "fail_fasterq-dump" 56 | 57 | fasta_1 = os.path.join(indir, f"{run_id}_1.fasta") 58 | fasta_2 = os.path.join(indir, f"{run_id}_2.fasta") 59 | if not (os.path.exists(fasta_1) and os.path.exists(fasta_2)): 60 | return False, "fail_not_all_fasta_files_made" 61 | 62 | try: 63 | tmp_out = os.path.join(indir, f"{run_id}.tmp.sketch") 64 | subprocess.check_output(f"rm -rf {tmp_out}", shell=True) 65 | subprocess.check_output(f"sylph sketch -1 {fasta_1} -2 {fasta_2} -d {tmp_out}", shell=True) 66 | outfile = os.path.join(indir, f"{run_id}.sylph.tsv") 67 | subprocess.check_output(f"sylph profile -t 1 {SYLPH_DB} {tmp_out}/*.sylsp > {outfile}", shell=True) 68 | subprocess.check_output(f"rm -rf {tmp_out}", shell=True) 69 | except: 70 | return False, "error_sylph" 71 | 72 | print("Done OK", run_id) 73 | return True, "NA" 74 | 75 | 76 | 77 | parser = argparse.ArgumentParser( 78 | description="Run sylph on dir from running prefetch on a file of run IDs", 79 | usage="%(prog)s", 80 | ) 81 | options = parser.parse_args() 82 | 83 | job_array_index = os.environ.get("LSB_JOBINDEX", None) 84 | if job_array_index is None: 85 | job_array_index = os.environ.get("SLURM_ARRAY_TASK_ID", None) 86 | 87 | if job_array_index is None: 88 | raise 
Exception("LSB_JOBINDEX/SLURM_ARRAY_TASK_ID not in env. Cannot continue") 89 | options.ids_file = os.path.join(SPLIT_ROOT, f"{job_array_index}") 90 | 91 | indir = os.path.join(OUT_ROOT, job_array_index) 92 | assert os.path.exists(indir) 93 | 94 | assert os.path.exists(options.ids_file) 95 | with open(options.ids_file) as f: 96 | all_runs = [x.rstrip() for x in f] 97 | all_runs.sort() 98 | 99 | 100 | status_file = os.path.join(indir, "sylph_status.json") 101 | if os.path.exists(status_file): 102 | with open(status_file) as f: 103 | sylph_results = json.load(f) 104 | else: 105 | sylph_results = {} 106 | 107 | 108 | prefetch_e = os.path.join(indir, "prefetch.stdouterr") 109 | assert os.path.exists(prefetch_e) 110 | 111 | print("Total runs:", len(all_runs), flush=True) 112 | prefetch_results = parse_stdouterr_file(prefetch_e) 113 | print(prefetch_results) 114 | 115 | for run in all_runs: 116 | sylph_status = sylph_results.get(run, (False, "unknown")) 117 | prefetch_status = prefetch_results.get(run, (False, "unknown")) 118 | 119 | if sylph_status[0]: 120 | print(run, "already done", flush=True) 121 | elif prefetch_status[0]: 122 | sylph_results[run] = process_one_run(indir, run) 123 | else: # failed for some reason 124 | sylph_results[run] = False, prefetch_status[1] 125 | 126 | sylph_status = sylph_results.get(run, (False, "unknown")) 127 | if sylph_status[0]: 128 | try: 129 | subprocess.check_output(f"rm -rf {run} {run}_?.fasta", cwd=indir, shell=True) 130 | except: 131 | sylph_results[run] = False, "error_cleaning_files" 132 | 133 | 134 | with open(status_file, "w") as f: 135 | json.dump(sylph_results, f, indent=2) 136 | -------------------------------------------------------------------------------- /reproducibility/All-samples/checkm2/checkm2_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import subprocess 6 | import sys 7 | import time 8 | 9 | # SPLIT_ROOT = full path to the directory of "split input files". 10 | # Files must be called 1, 2, 3, 4, ... 11 | # File N is processed by element N of the job array. 12 | # Each file is tab-delimited, no header line, and has two fields: 13 | # 1) sample name 2) full path to assembly fasta file 14 | SPLIT_ROOT = "FIX_PATH" 15 | 16 | ROOT_OUT = "FIX_PATH" # root directory of output checkm2 files 17 | IMG = "/FIX_PATH/checkm2.1.0.1--pyh7cba7a3_0.img" # checkm2 singularity container 18 | DB = "/FIX_PATH/uniref100.KO.1.dmnd" # checkm2 database 19 | CHECKM2 = f"singularity exec {IMG} checkm2 predict --allmodels --lowmem --database_path {DB} --remove_intermediates" 20 | CHECKM2_COLS = [ 21 | "Name", 22 | "Completeness_General", 23 | "Contamination", 24 | "Completeness_Specific", 25 | "Completeness_Model_Used", 26 | "Translation_Table_Used", 27 | "Coding_Density", 28 | "Contig_N50", 29 | "Average_Gene_Length", 30 | "Genome_Size", 31 | "GC_Content", 32 | "Total_Coding_Sequences", 33 | "Additional_Notes", 34 | ] 35 | 36 | def get_array_index(): 37 | array_index = os.environ.get("LSB_JOBINDEX", None) 38 | if array_index is None: 39 | array_index = os.environ.get("SLURM_ARRAY_TASK_ID", None) 40 | if array_index is None: 41 | raise Exception("LSB_JOBINDEX/SLURM_ARRAY_TASK_ID not in env. 
Cannot continue") 42 | return array_index 43 | 44 | 45 | def fix_sample_name_in_report_tsv(sample, infile, outfile): 46 | with open(infile) as f: 47 | results = [x.rstrip().split("\t") for x in f] 48 | if len(results) != 2: 49 | print("ERROR LINES!=2 IN RESULTS", sample, flush=True) 50 | return False 51 | if results[0] != CHECKM2_COLS: 52 | print("ERROR Unexpected column names", sample, flush=True) 53 | return False 54 | if len(results[1]) != len(CHECKM2_COLS): 55 | print("ERROR wrong number of fields in second line of results", sample, flush=True) 56 | return False 57 | 58 | with open(outfile, "w") as f: 59 | print(sample, *results[1][1:], sep="\t", file=f) 60 | 61 | return True 62 | 63 | 64 | def run_one_sample(sample, fasta_file): 65 | done_file = f"{sample}.done" 66 | if os.path.exists(done_file): 67 | print("Already done", sample, flush=True) 68 | return True 69 | 70 | fail_file = f"{sample}.fail" 71 | if os.path.exists(fail_file): 72 | print("Already fail", sample, flush=True) 73 | return False 74 | 75 | subprocess.check_output(f"rm -rf {sample} {sample}.tsv", shell=True) 76 | 77 | command = f"{CHECKM2} -i {fasta_file} -o {sample}" 78 | print(command, flush=True) 79 | try: 80 | subprocess.check_output(command, shell=True, timeout=2400) 81 | except: 82 | print("ERROR RUNNING CHECKM", sample, flush=True) 83 | return False 84 | 85 | result_file = os.path.join(sample, "quality_report.tsv") 86 | if not os.path.exists(result_file): 87 | print("ERROR NO RESULT FILE", sample, flush=True) 88 | return False 89 | 90 | outfile = f"{sample}.tsv" 91 | try: 92 | ok = fix_sample_name_in_report_tsv(sample, result_file, outfile) 93 | except: 94 | print("ERROR parsing checkm output file", sample, flush=True) 95 | return False 96 | 97 | if not ok: 98 | print("ERROR parsing checkm output file", sample, flush=True) 99 | return False 100 | 101 | subprocess.check_output(f"rm -rf {sample}", shell=True) 102 | subprocess.check_output(f"touch {sample}.done", shell=True) 103 | return True 104 | 105 | 106 | 107 | 108 | job_array_index = get_array_index() 109 | 110 | 111 | samples_file = os.path.join(SPLIT_ROOT, job_array_index) 112 | with open(samples_file) as f: 113 | samples = [x.rstrip().split() for x in f] 114 | 115 | 116 | outdir = os.path.join(ROOT_OUT, job_array_index) 117 | if not os.path.exists(outdir): 118 | os.mkdir(outdir) 119 | 120 | os.chdir(outdir) 121 | 122 | all_done_file = "all.done" 123 | if os.path.exists(all_done_file): 124 | print("All done already") 125 | sys.exit() 126 | 127 | results_files = [] 128 | fails = [] 129 | 130 | for sample, query_file in samples: 131 | assert sample != "all" 132 | try: 133 | ok = run_one_sample(sample, query_file) 134 | except: 135 | print("ERROR OTHER", sample, flush=True) 136 | ok = False 137 | 138 | if not ok: 139 | fails.append(sample) 140 | continue 141 | 142 | results_file = f"{sample}.tsv" 143 | if os.path.exists(results_file): 144 | results_files.append(results_file) 145 | else: 146 | fails.append(sample) 147 | 148 | 149 | if len(fails): 150 | with open("fails.txt", "w") as f_out: 151 | print(*fails, sep="\n", file=f_out) 152 | 153 | 154 | 155 | if len(results_files) > 0: 156 | with open("all.tsv", "w") as f_out: 157 | print(*CHECKM2_COLS, sep="\t", file=f_out) 158 | for filename in results_files: 159 | with open(filename) as f_in: 160 | for line in f_in: 161 | print(line, end="", file=f_out) 162 | 163 | 164 | with open(all_done_file, "w") as f: 165 | pass 166 | 167 | for sample, query_file in samples: 168 | print("deleting intermediate files", sample) 
169 | command = f"rm -rf {sample} {sample}.bin_input {sample}.tsv {sample}.done" 170 | try: 171 | subprocess.check_output(command, shell=True) 172 | except: 173 | time.sleep(5) 174 | subprocess.run(command, shell=True) 175 | 176 | --------------------------------------------------------------------------------