├── reproducibility └── All-samples │ ├── sylph │ ├── prefetch_array_job.sh │ ├── sylph_prefetch_array.sbatch │ ├── sylph_array.sbatch │ ├── README.md │ └── prefetch_to_sylph_batch.py │ ├── checkm2 │ ├── README.md │ ├── checkm2.sbatch │ └── checkm2_batch.py │ └── assembly-stats │ ├── README.md │ └── assembly_stats_batch.py ├── LICENSE ├── README.md ├── meetings └── 2024 │ └── zoom_20240322.md └── guide.md /reproducibility/All-samples/sylph/prefetch_array_job.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | set -eu 4 | 5 | split_files_dir=/FIX_PATH/ 6 | root_out=/FIX_PATH/ 7 | 8 | index=$SLURM_ARRAY_TASK_ID 9 | outdir=$root_out/$index 10 | ids_file=$split_files_dir/$index 11 | 12 | echo "Running prefetch on batch number $index. Outdir: $outdir" 13 | 14 | if [ ! -d $outdir ] 15 | then 16 | mkdir $outdir 17 | fi 18 | 19 | cd $outdir 20 | prefetch --option-file $ids_file &> prefetch.stdouterr 21 | 22 | -------------------------------------------------------------------------------- /reproducibility/All-samples/checkm2/README.md: -------------------------------------------------------------------------------- 1 | # checkm2 2 | 3 | Version of checkm2: 1.0.1. The singularity container used is here: 4 | https://osf.io/7vpy3 5 | 6 | Checkm2 database: uniref100.KO.1.dmnd. A copy of this is here: 7 | https://osf.io/x5vtj 8 | 9 | 10 | This was all run on the EBI SLURM compute cluster. Some paths 11 | were hard-coded. You will need to change them to run on your own 12 | data. Look for `FIX_PATH` in the python script. 13 | 14 | A SLURM job array was used, which was submitted using 15 | ``` 16 | sbatch checkm2.sbatch 17 | ``` 18 | Each element of the array ran a batch of samples in serial using the 19 | script `checkm2_batch.py`. See inside that script for notes on what would 20 | need changing if you want to run this script yourself. 21 | 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 iqbal-lab-org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /reproducibility/All-samples/checkm2/checkm2.sbatch: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name=checkm2 3 | #SBATCH --output=/FIX_PATH/Logs/o/%a.o 4 | #SBATCH --error=/FIX_ATH/Logs/e/%a.e 5 | #SBATCH --mem=2867M 6 | #SBATCH --time=1800 7 | #SBATCH --cpus-per-task=1 8 | #SBATCH --signal=B:SIGUSR1@60 9 | #SBATCH --array=1-38657%1000 10 | 11 | 12 | start_time=$(date +"%Y-%m-%dT%H:%M:%S") 13 | start_seconds=$(date +%s) 14 | 15 | end_time=RUNNING 16 | exit_code=UNKNOWN 17 | 18 | gather_stats() { 19 | # unset the trap otherwise this function can get called more than once 20 | trap - EXIT SIGUSR1 21 | end_time=$(date +"%Y-%m-%dT%H:%M:%S") 22 | end_seconds=$(date +%s) 23 | wall_clock_s=$(($end_seconds-$start_seconds)) 24 | echo -e "SLURM_STATS_BEGIN 25 | SLURM_STATS job_id $SLURM_JOB_ID 26 | SLURM_STATS command /FIX_PATH/run_checkm2_batch.py 27 | SLURM_STATS requested_ram 2.8 28 | SLURM_STATS requested_time 1800 29 | SLURM_STATS job_name checkm2 30 | SLURM_STATS start_time $start_time 31 | SLURM_STATS end_time $end_time 32 | SLURM_STATS wall_clock_s $wall_clock_s 33 | SLURM_STATS exit_code $exit_code" 34 | slurmzy jobinfo $SLURM_JOB_ID | awk '{print "SLURM_STATS_JOBINFO "$0}' 35 | 36 | if [ $exit_code = "UNKNOWN" ] 37 | then 38 | exit 1 39 | else 40 | exit $exit_code 41 | fi 42 | } 43 | 44 | trap gather_stats EXIT SIGUSR1 45 | 46 | /usr/bin/time -a -o /FIX_PATH/o/$SLURM_ARRAY_TASK_ID.o -v $SHELL -c "$(cat << 'EOF' 47 | /FIX_PATH/checkm2_batch.py 48 | EOF 49 | )" 50 | 51 | exit_code=$? 52 | gather_stats 53 | 54 | -------------------------------------------------------------------------------- /reproducibility/All-samples/sylph/sylph_prefetch_array.sbatch: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name=prefetch 3 | #SBATCH --output=/FIX_PATH/o/%a.o 4 | #SBATCH --error=/FIX_PATH/e/%a.e 5 | #SBATCH --mem=1G 6 | #SBATCH --time=300 7 | #SBATCH --cpus-per-task=1 8 | #SBATCH --signal=B:SIGUSR1@60 9 | #SBATCH --array=1-1000%40 10 | 11 | start_time=$(date +"%Y-%m-%dT%H:%M:%S") 12 | start_seconds=$(date +%s) 13 | 14 | end_time=RUNNING 15 | exit_code=UNKNOWN 16 | 17 | gather_stats() { 18 | # unset the trap otherwise this function can get called more than once 19 | trap - EXIT SIGUSR1 20 | end_time=$(date +"%Y-%m-%dT%H:%M:%S") 21 | end_seconds=$(date +%s) 22 | wall_clock_s=$(($end_seconds-$start_seconds)) 23 | echo -e "SLURM_STATS_BEGIN 24 | SLURM_STATS job_id $SLURM_JOB_ID 25 | SLURM_STATS command /FIX_PATH/prefetch_array_job.sh 26 | SLURM_STATS requested_ram 1.0 27 | SLURM_STATS requested_time 240 28 | SLURM_STATS job_name prefetch 29 | SLURM_STATS start_time $start_time 30 | SLURM_STATS end_time $end_time 31 | SLURM_STATS wall_clock_s $wall_clock_s 32 | SLURM_STATS exit_code $exit_code" 33 | slurmzy jobinfo $SLURM_JOB_ID | awk '{print "SLURM_STATS_JOBINFO "$0}' 34 | 35 | exit 0 36 | 37 | #if [ $exit_code = "UNKNOWN" ] 38 | #then 39 | # exit 1 40 | #else 41 | # exit $exit_code 42 | #fi 43 | } 44 | 45 | trap gather_stats EXIT SIGUSR1 46 | 47 | /usr/bin/time -a -o /FIX_PATH/o/$SLURM_ARRAY_TASK_ID.o -v $SHELL -c "$(cat << 'EOF' 48 | /FIX_PATH/prefetch_array_job.sh 49 | EOF 50 | )" 51 | 52 | exit_code=$? 
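# Note (added commentary, not part of the original script): the exit status of
# the timed batch command above is captured in $exit_code, and gather_stats is
# then called explicitly. gather_stats unsets the EXIT/SIGUSR1 trap first, so it
# runs only once, whether the job finishes normally, exits early, or is
# signalled by SLURM shortly before hitting its time limit.
# In this prefetch script the exit-code check inside gather_stats is commented
# out and the function always exits 0, so a failed prefetch does not fail the
# array element; failed downloads are instead detected later by
# prefetch_to_sylph_batch.py, which parses prefetch.stdouterr.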
53 | gather_stats 54 | 55 | -------------------------------------------------------------------------------- /reproducibility/All-samples/sylph/sylph_array.sbatch: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name=sylph 3 | #SBATCH --output=/FIX_PATH/o/%a.o 4 | #SBATCH --error=/FIX_PATH/e/%a.e 5 | #SBATCH --mem=13G 6 | #SBATCH --time=2000 7 | #SBATCH --cpus-per-task=1 8 | #SBATCH --signal=B:SIGUSR1@60 9 | #SBATCH --array=1-1000%500 10 | #SBATCH --dependency=aftercorr: 11 | 12 | start_time=$(date +"%Y-%m-%dT%H:%M:%S") 13 | start_seconds=$(date +%s) 14 | 15 | end_time=RUNNING 16 | exit_code=UNKNOWN 17 | 18 | gather_stats() { 19 | # unset the trap otherwise this function can get called more than once 20 | trap - EXIT SIGUSR1 21 | end_time=$(date +"%Y-%m-%dT%H:%M:%S") 22 | end_seconds=$(date +%s) 23 | wall_clock_s=$(($end_seconds-$start_seconds)) 24 | echo -e "SLURM_STATS_BEGIN 25 | SLURM_STATS job_id $SLURM_JOB_ID 26 | SLURM_STATS command /FIX_PATH/prefetch_to_sylph_batch.py 27 | SLURM_STATS requested_ram 13.0 28 | SLURM_STATS requested_time 2000 29 | SLURM_STATS job_name sylph 30 | SLURM_STATS start_time $start_time 31 | SLURM_STATS end_time $end_time 32 | SLURM_STATS wall_clock_s $wall_clock_s 33 | SLURM_STATS exit_code $exit_code" 34 | slurmzy jobinfo $SLURM_JOB_ID | awk '{print "SLURM_STATS_JOBINFO "$0}' 35 | 36 | if [ $exit_code = "UNKNOWN" ] 37 | then 38 | exit 1 39 | else 40 | exit $exit_code 41 | fi 42 | } 43 | 44 | trap gather_stats EXIT SIGUSR1 45 | 46 | /usr/bin/time -a -o /FIX_PATH/$SLURM_ARRAY_TASK_ID.o -v $SHELL -c "$(cat << 'EOF' 47 | /FIX_PATH/prefetch_to_sylph_batch.py 48 | EOF 49 | )" 50 | 51 | exit_code=$? 52 | gather_stats 53 | 54 | -------------------------------------------------------------------------------- /reproducibility/All-samples/sylph/README.md: -------------------------------------------------------------------------------- 1 | # Sylph 2 | 3 | Version sylph used: 0.5.1. 4 | 5 | The pre-built GTDB-R214 database was used: https://storage.googleapis.com/sylph-stuff/v0.3-c200-gtdb-r214.syldb 6 | 7 | This was run on the EBI SLURM cluster. 8 | Do not expect these scripts to work on your cluster without 9 | editing. For example: they use hard-coded paths, which have here 10 | been replaced with `FIX_PATH` 11 | 12 | This was run in two stages: 13 | 1. Download the reads using prefetch 14 | 2. Run Sylph on the reads (and delete the reads afterwards). 15 | 16 | Each stage was run as a job array. Each element of the job array processed 17 | a batch of sequencing runs. Each sylph job was pointed to the output of a 18 | prefetch batch. In other words, each prefetch array element N downloads batch 19 | N of reads, and then sylph array element N runs sylph on that batch of reads, 20 | then deletes the reads. 21 | 22 | The batches of samples need to be defined by having a directory 23 | of files called 1, 2, 3, 4, etc. Each file should have one run accession 24 | per line. File number N corresponds to job array element N. 25 | 26 | These arrays were arranged by setting the sylph array to depend on the 27 | prefetch array, so that sylph job N would start when prefetch job N finished. 28 | 29 | The prefetch job array was submitted with `sylph_prefetch_array.sbatch`. 30 | Each element of the array runs the script `prefetch_array_job.sh`, which 31 | uses prefetch to download the batch of reads. 32 | 33 | The sylph job array was submitted with `sylph_array.sbatch`. 
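As a rough sketch (not the exact commands used here), the numbered batch files and the chained submission could be set up as follows. `run_accessions.txt` is a hypothetical file with one run accession per line, and the job ID for the `aftercorr` dependency can either be filled into the `#SBATCH --dependency` line of `sylph_array.sbatch` or passed on the command line as below:
```
# Split accessions round-robin into files named 1..1000, one accession per
# line (gawk recommended, since ~1000 output files are kept open at once).
awk '{ print > (((NR - 1) % 1000) + 1) }' run_accessions.txt

# Submit the prefetch array, then make sylph array task N wait for
# prefetch array task N to finish successfully.
prefetch_job=$(sbatch --parsable sylph_prefetch_array.sbatch)
sbatch --dependency=aftercorr:"$prefetch_job" sylph_array.sbatch
```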
34 | Each array element runs the script `prefetch_to_sylph_batch.py`, which 35 | runs sylph on each read run, then deletes the reads. 36 | -------------------------------------------------------------------------------- /reproducibility/All-samples/assembly-stats/README.md: -------------------------------------------------------------------------------- 1 | # Assembly stats 2 | 3 | The final output file of assembly statistics is available here: 4 | https://osf.io/h7g42. 5 | 6 | Version of `assembly-stats` used: git commit 7bdb58b from 7 | https://github.com/sanger-pathogens/assembly-stats. 8 | 9 | The main script is `assembly_stats_batch.py`, which runs `assembly-stats` 10 | on a batch of assemblies, outputting a single TSV file of results. 11 | 12 | The whole process was run as follows on the EBI compute slurm cluster. 13 | If you want to run yourself, then you will need to fix the hard-coded 14 | paths. Look for `FIX_ME` in the python script. 15 | 16 | The script has a hard-coded path to a TSV file that looked like this: 17 | ``` 18 | $ head -n3 sample_path.tsv 19 | Sample Path 20 | SAMD00075885 1300k/batch_68/ilmn-SAMD00075885_contigs.fa.gz 21 | SAMN16231665 1300k/batch_68/ilmn-SAMN16231665_contigs.fa.gz 22 | ``` 23 | It had 1943494 lines. 24 | 25 | The SLURM jobs were submitted with: 26 | 27 | ``` 28 | mkdir Splits 29 | seq 1 10000 1943494 | awk '{s="slurmzy run 0.2 Splits/stats."$1" ./assembly_stats_batch.py "$1" "($1+9999)" Splits/stats."$1".tsv"; print s; system(s)}' 30 | ``` 31 | 32 | Note: `surmzy` can be obtained from https://github.com/martinghunt/slurmzy. 33 | It's a wrapper for running `srun`. 34 | 35 | There was an off-by-one error meaning that the first sample needed to be 36 | run manually. And then gather all the results into one file: 37 | 38 | ``` 39 | assembly-stats -t /FIX_PATH/ilmn-SAMD00075885_contigs.fa.gz | awk 'NR>1 {OFS="\t"; $1="SAMD00075885"} 1' | sed 's/filename/sample/' > assembly-stats.tsv 40 | for x in `seq 1 10000 1943494 `; do awk 'NR>1' Splits/stats.$x.tsv >> assembly-stats.tsv ; done 41 | ``` 42 | -------------------------------------------------------------------------------- /reproducibility/All-samples/assembly-stats/assembly_stats_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import csv 5 | import os 6 | import sys 7 | import subprocess 8 | 9 | 10 | STATS_COLS = [ 11 | "sample", 12 | "total_length", 13 | "number", 14 | "mean_length", 15 | "longest", 16 | "shortest", 17 | "N_count", 18 | "Gaps", 19 | "N50", 20 | "N50n", 21 | "N70", 22 | "N70n", 23 | "N90", 24 | "N90n", 25 | ] 26 | 27 | 28 | def parse_stats_stdout(p, sample, filename): 29 | fields = p.stdout.strip().split("\t") 30 | assert fields[0] == filename 31 | assert len(fields) == len(STATS_COLS) 32 | fields[0] = sample 33 | return "\t".join(fields) 34 | 35 | 36 | RELEASE_ROOT = "FIX_PATH" 37 | TSV = f"FIX_PATH/sample_path.tsv" 38 | 39 | parser = argparse.ArgumentParser( 40 | description="description", 41 | usage="%(prog)s ", 42 | ) 43 | parser.add_argument("start", type=int, help="start line of sample_path.tsv file") 44 | parser.add_argument("end", type=int, help="end line of sample_path.tsv file") 45 | parser.add_argument("outfile", help="output file") 46 | 47 | options = parser.parse_args() 48 | 49 | with open(TSV) as f_in, open(options.outfile, "w") as f_out: 50 | print(*STATS_COLS, sep="\t", file=f_out) 51 | 52 | for i, d in enumerate(csv.DictReader(f_in, delimiter="\t")): 53 | if i < 
options.start: 54 | continue 55 | if i > options.end: 56 | break 57 | 58 | fa = os.path.join(RELEASE_ROOT, d["Path"]) 59 | p = subprocess.run(["assembly-stats", "-u", fa], stdout=subprocess.PIPE, universal_newlines=True) 60 | if p.returncode != 0: 61 | print("Error", d, file=sys.stderr) 62 | continue 63 | 64 | try: 65 | to_print = parse_stats_stdout(p, d["Sample"], fa) 66 | except: 67 | print("Error", d, file=sys.stderr) 68 | continue 69 | 70 | print(to_print, file=f_out) 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AllTheBacteria 2 | All WGS isolate bacterial INSDC data to June 2023, uniformly assembled, QC-ed, annotated, searchable. 3 | 4 | Follow-up to Grace Blackwell's 661k dataset (which covered everything to Nov 2018). 5 | 6 | Preprint: https://doi.org/10.1101/2024.03.08.584059 7 | 8 | ## Latest Release 0.2 9 | The data are here: https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/ 10 | 11 | Changes from release 0.1 are documented in detail in the release 0.2 readme: https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/README.md 12 | 13 | Summary of changes: 14 | * Approximately 12k contigs were removed, due to matching the human genome 15 | * Reran assembly stats and checkm2 on the changed assemblies 16 | * The "high quality" dataset changed slightly because of the assemblies changing 17 | * Species call for each sample tidied up 18 | * Added phylign indexes for searching/aligning query sequences (see https://github.com/AllTheBacteria/Phylign/blob/main/README.md) 19 | * Updated sketchlib indexes 20 | * Added file of md5sum of all files in the release 21 | 22 | 23 | ## Release 0.1 24 | Full details here: https://www.biorxiv.org/content/10.1101/2024.03.08.584059v1 25 | The data were here: https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.1/ 26 | but have been deleted due to human contamination. Please use release 0.2 instead. 27 | 28 | The first release contains: 29 | 1. About 2 million Shovill assemblies, identified by ENA sample id 30 | 2. Summary of assembly statistics 31 | 3. File(s) summarising taxonomic and contamination statistics based on sylph taxonomic abundance estimation (GTDB r214), and CheckM2 32 | 4. A filelist specifying "high quality" assemblies 33 | 5. A README describing all this. 34 | The assembly workflow is on GitHub, but we don't have a distributable container for it yet. 35 | 36 | ## Further releases 37 | Future releases will include: 38 | 1. More search indexes to come. 39 | 2. Annotation (bakta at least) 40 | 3. Pan-genomes and harmonised gene names within species (for the top N species) for representative genomes chosen using poppunk clusters and QC metrics. 41 | 4. MLST, various species-specific typing, AMR 42 | 43 | 44 | ## Distribution 45 | Data will be distributed at least by: 46 | 1. EBI FTP, which is simultaneously accessible by Globus and Aspera. 47 | 2. Zenodo would be good to add 48 | 49 | 50 | ## Rules of Engagement with the data 51 | Once Release 0.1 is out, anyone/everyone is welcome to use the data and publish with it. There is no expectation that the people who made the release/data should be co-authors on these publications, but we would appreciate citation of the preprint (https://www.biorxiv.org/content/10.1101/2024.03.08.584059v1). 52 | 53 | ## Rules of Involvement with the project 54 | All welcome, contact us via GitHub, Slack or the monthly zoom calls.
Anyone who contributes to the project, through analysis, project management or any other means, ought to be an author of the paper. 55 | 56 | ## Next zoom calls 57 | 22nd March 2024, 9am and 4pm GMT 58 | 59 | 60 | 61 | ## FAQ 62 | 1. What happens if two people want to run their competing methods (bad example, prokka versus bakta or one AMR tool versus another). First, anyone can do anything they like, but to get into the releases, we should discuss on a zoom call and make a decision. We shall tend towards allowing multiple analyses (eg we intend to run bakta on everything but if someone wants to run prokka too, we should we ok to add that to the release too). However, if it starts to get silly with people wanting 4 tools each run with 3 parameters, then I think we get a lot stricter - this compute isn't free (in terms of carbon, or money), so we'll make a decision and do something limited. 63 | 64 | -------------------------------------------------------------------------------- /meetings/2024/zoom_20240322.md: -------------------------------------------------------------------------------- 1 | **Meeting notes from Zoom call at 9am GMT 22nd March 2024. Second meeting at 3pm is further down** 2 | 3 | Present: 4 | Zam Iqbal, Martin Hunt, Boas van der Putten, Jane Hawkey, Laura Carroll, Liz Batty, George Bouras, Gerry Tonkin- 5 | 6 | JaneH and Dan Anderson - will split running of AMRFinder+ 7 | Laura Carroll - already running BGC from GECCO. plus GTDB Bacillus_A BTyper A 8 | Gerry volunteers E coli ST 9 | 10 | Jane suggests use pathogen.watch MLST (they have a docker container, Martin will heklp turning it into singularity) 11 | 12 | George Bouras will do prophage annotation, needs to wait for bakta, then some harmonising so consistent with bakta. 13 | I've pointed him at Wendy Figueroa and Daniel Cazares to combine forces. 14 | 15 | Boas will have a think about Strep pyogenes 16 | 17 | Matthew Croxen has offered to run any of 18 | 19 | Streptococcus pneumoniae Pneumokitty, Streptococcus agalactiae GBS-SBG, Streptococcus pyogenes Emmtyper, Haemophilus influenzae Hicap, Escherichia coli Ec_typer, Shigella spp. Shigatyper 20 | [Jane Hawkey says there is a more recent tool from FX], Bordetella pertussis/parapertussis BPAgST, Salmonella enterica SISTR, Neisseria gonorrhoea NG MAST, NG STAR, Klebsiella pneumoniae (lots of "complex species here so may wish to go to genus to cast a broader net) Kleborate, Listeria monocytogenes Lissero, Neisseria meningitidis meningotype, Legionella pneumophila Legsta 21 | 22 | ZAM FORGOT to mention Lesley Hoyles (Klebsiella oxytoca-related genomes (grimontii, michiganensis, pasteurii, oxytoca), and Klebsiella ornithinolytica and Klebsiella planticola) and 23 | and Jonathan Thomas (Staphylococcus epidermidis and/or other coagulase-negative/non-aureus staphylococci) 24 | 25 | Slightly rambling discussion about ways to 26 | a) coordinate who is doing what/what is being done for each species 27 | b) Gerry had an idea of allowing people to do their own typing of things and publish their results (eg on their own github) but have that centrally "advertised"/linked on an AllTheBacteria website. 28 | 29 | Last year Nabil had plans to use a github mod/thing called Zen for project management, will see if he still has energy for this. 30 | Generally people happy with Oliver Schwengers suggestion of a directory structure constructed from sample id, with species/tool/taxid injected in there perhaps. 
31 | 32 | 33 | **Meeting notes from Zoom call at 1500 GMT 22nd March 2024.** 34 | 35 | Present: Finlay Macguire, Maria Luisa Andreani, Martin Hunt, Robert Petit, Nabil-Fareed Alikhan 36 | 37 | Finlay would like to talk to Adrian Cazares about plasmid contig identification 38 | Finlay would also like to use this data for a wider AMR database harmonisation project 39 | Rob Petit will happily test the mof-search and convert to nextflow 40 | Rob also has a lot of compute power available and will be contacting Oliver Schwengers in case he needs help with processing Bakta 41 | 42 | Bakta apparently runs AMRFinder+, so we need to make sure Oliver and Dan Anderson talk, otherwise we will run AMRFinder+ twice. 43 | 44 | We started setting up some project management on the AllTheBacteria github, Zam needs to finish this off 45 | (sorting out allowing others to contribute to it). 46 | 47 | Discussed potentially using OSF as a way to distribute the analytic products 48 | 49 | 50 | **Actions** 51 | 1. Zam, Martin, Dan are going to make a 0.2 release in the next few weeks. This will remove contigs from some assemblies that map very well to human. We will in that process remove release 0.1 assemblies (as there is human contamination there). The release will also have search indexes and a snakemake workflow for aligning sequences to the full set. 52 | 2. Zam will share the indexes and snakemake with Rob Petit for external testing prior to release. 53 | 3. Zam to sort out the project management aspect on the github and to make a (google? or in the repo?) sheet with species and volunteers and tools they will run. 54 | 4. We need a checklist of what you need to do for your contribution to be accepted. One key thing is you use precisely identical identifiers to those we have for the assemblies. No switching of . to + or whatever, and Nabil/Fin advocate for a simple csv with all the data or pointing to filenames. We should formalise this. 55 | 5. Zam needs to followup with various volunteers who contacted over email but are not on Slack, and who he forgot to contact. 56 | We don't need to sort out 4 before people start processing I suppose, but sooner is better 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /guide.md: -------------------------------------------------------------------------------- 1 | **XXX This is likely 90% correct at best. Meant as a first attempt to get it right eventually** 2 | 3 | Recently a group of scientists associated with the European Molecular Biology Laboratory released [a compilation of 1,932,812 assemblies of bacterial chromosomes](https://www.biorxiv.org/content/10.1101/2024.03.08.584059v1.full), building on earlier work by [Grace Blackwell et al](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3001421). 4 | 5 | This assembly was performed on FASTQ reads available on the European Nucleotide Archive as of May 2023. Only eubacterial (so not archaeal) data was included, and only Illumina produced reads were processed. [After I asked](https://github.com/AllTheBacteria/AllTheBacteria/issues/28), it now appears that archaeal chromosomes could now also be included in upcoming versions, which is wonderful news. 6 | 7 | It is important to note that this project is not a strict alternative to the selection of complete chromosomes that you can find in the [NCBI Reference Sequence Database](https://www.ncbi.nlm.nih.gov/refseq/). AllTheBacteria delivers contigs, not reference genomes or scaffolds. 
8 | 9 | The project very much aims to benefit from community effort, and in this spirit, I want to explain a bit what is actually in the AllTheBacteria data, and how to use it. 10 | 11 | # AllTheBacteria releases 12 | The project has a [GitHub page](https://github.com/AllTheBacteria/AllTheBacteria) through which we can find links to releases. The current release is [0.2](https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/). 13 | 14 | In there we find a bunch of directories, as follows. 15 | 16 | ## assembly/ 17 | [This directory](https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/assembly/) has a cleverly arranged set of tar.xz files, grouped by bacterial species, containing FASTA files. If you know what you are looking for, you can find the right tar.xz files. The FASTA files within these tar files have been arranged using [smart software called MiniPhy](https://github.com/karel-brinda/MiniPhy) so that they compress really well. 18 | 19 | If you are looking for a popular organism, say E. coli, you'll still have to pick from many dozens of tar files though, and see below on how the 'metadata/' directory will help you there. 20 | 21 | In addition to tar.xz files named after popular organisms, there are also 'dustbin' files containing all remaining chromosomes. 22 | 23 | It is important to note that the FASTA files contain contigs, they are not reference sequences. As an example, let's look at `escherichia_coli__01/SAMD00075771.fa`, this contains entries like: 24 | 25 | ``` 26 | >SAMD00075771.contig00001 len=256900 cov=25.5 corr=0 origname=NODE_1_length_256900_cov_25.545194_pilon sw=shovill-spades/1.1.0 date=20230709 27 | CTCCGCTATGTCCTTGACGTCATAGCCGACTGGCCGATAAACCGGGTCGGCGAACTGCTCC.... 28 | ``` 29 | We can look up SAMD00075771 over at the ENA, on [https://www.ebi.ac.uk/ena/browser/view/SAMD00075771?dataType=BIOSAMPLE](https://www.ebi.ac.uk/ena/browser/view/SAMD00075771?dataType=BIOSAMPLE) and there we learn that this data was submitted as 'ASM276431v1 assembly for Escherichia coli O26:H11'. We can also download the paired FASTQ there, plus the originally submitted assembly. 30 | 31 | ## metadata/ 32 | In here we find a whole bunch of files: 33 | 34 | * assembly-stats.tsv.gz: per sample how many contigs there are, the N50, N70, N90 35 | * checkm2.tsv.gz: CheckM2 is a tool for assessing microbial genome quality using machine learning. This file features estimates for completeness, contamination, number of detected coding sequences 36 | * ena_metadata.tsv.gz: All in one view of the many accession numbers and identifiers from ENA. This is all input related, where did the data come from 37 | * hq_set.sample_list.txt.gz: A list of samples that pass more stringent quality checks, as outlined in the AllTheBacteria preprint. 38 | * hq_set.removed_samples.tsv.gz: Samples that did not make it to the high-quality list, together with reasons why not 39 | * nucmer_human.gz: reads from samples that were identified as human in origin 40 | * sample_list.txt.gz: names of all ENA samples included, presumably the sum of hq_set.sample_list and hq_set.removed_samples above 41 | * species_calls.tsv.gz: automated majority species call per sample, plus indication if this is a 42 | high quality sample or not 43 | * sylph.tsv.gz: the Sylph program compares samples to GTDB reference genomes, and this file contains the best fits for each sample. The top call ends up in species_calls.tsv above. 
44 | * sylph.no_matches.txt.gz: files for which Sylph had no idea 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /reproducibility/All-samples/sylph/prefetch_to_sylph_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import json 5 | import os 6 | import re 7 | import subprocess 8 | 9 | success_re = re.compile(r""" '(?P.*)\.sralite' was downloaded successfully""") 10 | success_re2 = re.compile(r""" '(?P.*)\.lite' was downloaded successfully""") 11 | success_re3 = re.compile(r""" '(?P.*)' was downloaded successfully""") 12 | success_re4 = re.compile(r""" '(?P.*)\.sralite' is found locally""") 13 | success_re5 = re.compile(r""" '(?P.*)\.lite' is found locally""") 14 | success_re6 = re.compile(r""" '(?P.*)' is found locally""") 15 | failed_acc_re = re.compile(r"""err: name not found while resolving query within virtual file system module - failed to resolve accession '(?P.*)' - no data""") 16 | 17 | REGEXES = { 18 | success_re: {"success": True, "fail_reason": "NA"}, 19 | success_re2: {"success": True, "fail_reason": "NA"}, 20 | success_re3: {"success": True, "fail_reason": "NA"}, 21 | success_re4: {"success": True, "fail_reason": "NA"}, 22 | success_re5: {"success": True, "fail_reason": "NA"}, 23 | success_re6: {"success": True, "fail_reason": "NA"}, 24 | failed_acc_re: {"success": False, "fail_reason": "failed to resolve accession"}, 25 | } 26 | 27 | 28 | SYLPH_DB = "/FIX_PATH/v0.3-c200-gtdb-r214.syldb" 29 | OUT_ROOT = "/FIX_PATH/" 30 | SPLIT_ROOT = "/FIX_PATH/" 31 | 32 | def parse_stdouterr_file(infile): 33 | results = {} 34 | with open(prefetch_e) as f: 35 | for line in f: 36 | for regex, d in REGEXES.items(): 37 | match = regex.search(line) 38 | if match is not None: 39 | print("MATCH!", line) 40 | run = match.group("run") 41 | results[run] = [d["success"], d["fail_reason"]] 42 | break 43 | return results 44 | 45 | 46 | 47 | def process_one_run(indir, run_id): 48 | command = f"fasterq-dump --fasta --split-3 {run_id}" 49 | print("command", command) 50 | 51 | try: 52 | print("to_fasta", run_id, command, flush=True) 53 | subprocess.check_output(command, shell=True, cwd=indir) 54 | except: 55 | return False, "fail_fasterq-dump" 56 | 57 | fasta_1 = os.path.join(indir, f"{run_id}_1.fasta") 58 | fasta_2 = os.path.join(indir, f"{run_id}_2.fasta") 59 | if not (os.path.exists(fasta_1) and os.path.exists(fasta_2)): 60 | return False, "fail_not_all_fasta_files_made" 61 | 62 | try: 63 | tmp_out = os.path.join(indir, f"{run_id}.tmp.sketch") 64 | subprocess.check_output(f"rm -rf {tmp_out}", shell=True) 65 | subprocess.check_output(f"sylph sketch -1 {fasta_1} -2 {fasta_2} -d {tmp_out}", shell=True) 66 | outfile = os.path.join(indir, f"{run_id}.sylph.tsv") 67 | subprocess.check_output(f"sylph profile -t 1 {SYLPH_DB} {tmp_out}/*.sylsp > {outfile}", shell=True) 68 | subprocess.check_output(f"rm -rf {tmp_out}", shell=True) 69 | except: 70 | return False, "error_sylph" 71 | 72 | print("Done OK", run_id) 73 | return True, "NA" 74 | 75 | 76 | 77 | parser = argparse.ArgumentParser( 78 | description="Run sylph on dir from running prefetch on a file of run IDs", 79 | usage="%(prog)s", 80 | ) 81 | options = parser.parse_args() 82 | 83 | job_array_index = os.environ.get("LSB_JOBINDEX", None) 84 | if job_array_index is None: 85 | job_array_index = os.environ.get("SLURM_ARRAY_TASK_ID", None) 86 | 87 | if job_array_index is None: 88 | raise 
Exception("LSB_JOBINDEX/SLURM_ARRAY_TASK_ID not in env. Cannot continue") 89 | options.ids_file = os.path.join(SPLIT_ROOT, f"{job_array_index}") 90 | 91 | indir = os.path.join(OUT_ROOT, job_array_index) 92 | assert os.path.exists(indir) 93 | 94 | assert os.path.exists(options.ids_file) 95 | with open(options.ids_file) as f: 96 | all_runs = [x.rstrip() for x in f] 97 | all_runs.sort() 98 | 99 | 100 | status_file = os.path.join(indir, "sylph_status.json") 101 | if os.path.exists(status_file): 102 | with open(status_file) as f: 103 | sylph_results = json.load(f) 104 | else: 105 | sylph_results = {} 106 | 107 | 108 | prefetch_e = os.path.join(indir, "prefetch.stdouterr") 109 | assert os.path.exists(prefetch_e) 110 | 111 | print("Total runs:", len(all_runs), flush=True) 112 | prefetch_results = parse_stdouterr_file(prefetch_e) 113 | print(prefetch_results) 114 | 115 | for run in all_runs: 116 | sylph_status = sylph_results.get(run, (False, "unknown")) 117 | prefetch_status = prefetch_results.get(run, (False, "unknown")) 118 | 119 | if sylph_status[0]: 120 | print(run, "already done", flush=True) 121 | elif prefetch_status[0]: 122 | sylph_results[run] = process_one_run(indir, run) 123 | else: # failed for some reason 124 | sylph_results[run] = False, prefetch_status[1] 125 | 126 | sylph_status = sylph_results.get(run, (False, "unknown")) 127 | if sylph_status[0]: 128 | try: 129 | subprocess.check_output(f"rm -rf {run} {run}_?.fasta", cwd=indir, shell=True) 130 | except: 131 | sylph_results[run] = False, "error_cleaning_files" 132 | 133 | 134 | with open(status_file, "w") as f: 135 | json.dump(sylph_results, f, indent=2) 136 | -------------------------------------------------------------------------------- /reproducibility/All-samples/checkm2/checkm2_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import subprocess 6 | import sys 7 | import time 8 | 9 | # SPLIT_ROOT = full path to the directory of "split input files". 10 | # Files must be called 1, 2, 3, 4, ... 11 | # File N is processed by element N of the job array. 12 | # Each file is tab-delimited, no header line, and has two fields: 13 | # 1) sample name 2) full path to assembly fasta file 14 | SPLIT_ROOT = "FIX_PATH" 15 | 16 | ROOT_OUT = "FIX_PATH" # root directory of output checkm2 files 17 | IMG = "/FIX_PATH/checkm2.1.0.1--pyh7cba7a3_0.img" # checkm2 singularity container 18 | DB = "/FIX_PATH/uniref100.KO.1.dmnd" # checkm2 database 19 | CHECKM2 = f"singularity exec {IMG} checkm2 predict --allmodels --lowmem --database_path {DB} --remove_intermediates" 20 | CHECKM2_COLS = [ 21 | "Name", 22 | "Completeness_General", 23 | "Contamination", 24 | "Completeness_Specific", 25 | "Completeness_Model_Used", 26 | "Translation_Table_Used", 27 | "Coding_Density", 28 | "Contig_N50", 29 | "Average_Gene_Length", 30 | "Genome_Size", 31 | "GC_Content", 32 | "Total_Coding_Sequences", 33 | "Additional_Notes", 34 | ] 35 | 36 | def get_array_index(): 37 | array_index = os.environ.get("LSB_JOBINDEX", None) 38 | if array_index is None: 39 | array_index = os.environ.get("SLURM_ARRAY_TASK_ID", None) 40 | if array_index is None: 41 | raise Exception("LSB_JOBINDEX/SLURM_ARRAY_TASK_ID not in env. 
Cannot continue") 42 | return array_index 43 | 44 | 45 | def fix_sample_name_in_report_tsv(sample, infile, outfile): 46 | with open(infile) as f: 47 | results = [x.rstrip().split("\t") for x in f] 48 | if len(results) != 2: 49 | print("ERROR LINES!=2 IN RESULTS", sample, flush=True) 50 | return False 51 | if results[0] != CHECKM2_COLS: 52 | print("ERROR Unexpected column names", sample, flush=True) 53 | return False 54 | if len(results[1]) != len(CHECKM2_COLS): 55 | print("ERROR wrong number of fields in second line of results", sample, flush=True) 56 | return False 57 | 58 | with open(outfile, "w") as f: 59 | print(sample, *results[1][1:], sep="\t", file=f) 60 | 61 | return True 62 | 63 | 64 | def run_one_sample(sample, fasta_file): 65 | done_file = f"{sample}.done" 66 | if os.path.exists(done_file): 67 | print("Already done", sample, flush=True) 68 | return True 69 | 70 | fail_file = f"{sample}.fail" 71 | if os.path.exists(fail_file): 72 | print("Already fail", sample, flush=True) 73 | return False 74 | 75 | subprocess.check_output(f"rm -rf {sample} {sample}.tsv", shell=True) 76 | 77 | command = f"{CHECKM2} -i {fasta_file} -o {sample}" 78 | print(command, flush=True) 79 | try: 80 | subprocess.check_output(command, shell=True, timeout=2400) 81 | except: 82 | print("ERROR RUNNING CHECKM", sample, flush=True) 83 | return False 84 | 85 | result_file = os.path.join(sample, "quality_report.tsv") 86 | if not os.path.exists(result_file): 87 | print("ERROR NO RESULT FILE", sample, flush=True) 88 | return False 89 | 90 | outfile = f"{sample}.tsv" 91 | try: 92 | ok = fix_sample_name_in_report_tsv(sample, result_file, outfile) 93 | except: 94 | print("ERROR parsing checkm output file", sample, flush=True) 95 | return False 96 | 97 | if not ok: 98 | print("ERROR parsing checkm output file", sample, flush=True) 99 | return False 100 | 101 | subprocess.check_output(f"rm -rf {sample}", shell=True) 102 | subprocess.check_output(f"touch {sample}.done", shell=True) 103 | return True 104 | 105 | 106 | 107 | 108 | job_array_index = get_array_index() 109 | 110 | 111 | samples_file = os.path.join(SPLIT_ROOT, job_array_index) 112 | with open(samples_file) as f: 113 | samples = [x.rstrip().split() for x in f] 114 | 115 | 116 | outdir = os.path.join(ROOT_OUT, job_array_index) 117 | if not os.path.exists(outdir): 118 | os.mkdir(outdir) 119 | 120 | os.chdir(outdir) 121 | 122 | all_done_file = "all.done" 123 | if os.path.exists(all_done_file): 124 | print("All done already") 125 | sys.exit() 126 | 127 | results_files = [] 128 | fails = [] 129 | 130 | for sample, query_file in samples: 131 | assert sample != "all" 132 | try: 133 | ok = run_one_sample(sample, query_file) 134 | except: 135 | print("ERROR OTHER", sample, flush=True) 136 | ok = False 137 | 138 | if not ok: 139 | fails.append(sample) 140 | continue 141 | 142 | results_file = f"{sample}.tsv" 143 | if os.path.exists(results_file): 144 | results_files.append(results_file) 145 | else: 146 | fails.append(sample) 147 | 148 | 149 | if len(fails): 150 | with open("fails.txt", "w") as f_out: 151 | print(*fails, sep="\n", file=f_out) 152 | 153 | 154 | 155 | if len(results_files) > 0: 156 | with open("all.tsv", "w") as f_out: 157 | print(*CHECKM2_COLS, sep="\t", file=f_out) 158 | for filename in results_files: 159 | with open(filename) as f_in: 160 | for line in f_in: 161 | print(line, end="", file=f_out) 162 | 163 | 164 | with open(all_done_file, "w") as f: 165 | pass 166 | 167 | for sample, query_file in samples: 168 | print("deleting intermediate files", sample) 
169 | command = f"rm -rf {sample} {sample}.bin_input {sample}.tsv {sample}.done" 170 | try: 171 | subprocess.check_output(command, shell=True) 172 | except: 173 | time.sleep(5) 174 | subprocess.run(command, shell=True) 175 | 176 | --------------------------------------------------------------------------------