Quality score is calculated as: Completeness - 5 x Contamination.
"
73 | )
74 | add_stats("Quality score >50 ", df.query("Quality_score>50"))
75 | add_stats("Good quality", df.query("Completeness>90 & Contamination <5"))
76 | add_stats("Quality score >90 ", df.query("Quality_score>90"))
77 |
78 | div["table"] = st.to_html()
79 |
80 | logging.info(df.describe())
81 |
82 | # Bin Id Completeness completeness_general Contamination completeness_specific completeness_model_used translation_table_used coding_density contig_n50 average_gene_length genome_size gc_content total_coding_sequences additional_notes quality_score sample Ambigious_bases Length_contigs Length_scaffolds N50 N_contigs N_scaffolds logN50
83 | hover_data = [
84 | "Completeness_Model_Used",
85 | "Coding_Density",
86 | "N50",
87 | "GC_Content",
88 | ]
89 | size_name = "Genome_Size"
90 |
91 | lineage_name = "Species"
92 |
93 | # 2D plot
94 |
95 | logging.info("make 2d plot")
96 | fig = px.scatter(
97 | data_frame=df,
98 | y="Completeness",
99 | x="Contamination",
100 | color=lineage_name,
101 | size=size_name,
102 | hover_data=hover_data,
103 | hover_name="Bin Id",
104 | )
105 | fig.update_yaxes(range=(50, 102))
106 | fig.update_xaxes(range=(-0.2, 10.1))
107 | div["2D"] = fig.to_html(**HTML_PARAMS)
108 |
109 | # 2D plot
110 |
111 | logging.info("make 2d plot species")
112 | fig = px.scatter(
113 | data_frame=df.loc[df.Representative.unique()],
114 | y="Completeness",
115 | x="Contamination",
116 | color=lineage_name,
117 | size=size_name,
118 | hover_data=hover_data,
119 | hover_name="Bin Id",
120 | )
121 | fig.update_yaxes(range=(50, 102))
122 | fig.update_xaxes(range=(-0.2, 10.1))
123 | div["2Dsp"] = fig.to_html(**HTML_PARAMS)
124 |
125 | ## By sample
126 | logging.info("plot by sample")
127 | fig = px.strip(
128 | data_frame=df,
129 | y="Quality_score",
130 | x="Sample",
131 | color=lineage_name,
132 | hover_data=hover_data,
133 | hover_name="Bin Id",
134 | )
135 | fig.update_yaxes(range=(50, 102))
136 | div["bySample"] = fig.to_html(**HTML_PARAMS)
137 |
138 | # # By species
139 | # logging.info("plot by species")
140 | # fig = px.strip(
141 | # data_frame=df,
142 | # y="Quality_score",
143 | # x=lineage_name,
144 | # hover_data=hover_data,
145 | # hover_name="Bin Id",
146 | # )
147 | # fig.update_yaxes(range=(50, 102))
148 | # div["byPhylum"] = fig.to_html(**HTML_PARAMS)
149 |
150 | return div
151 |
152 |
153 | # main
154 |
155 |
156 | div = make_plots(bin_info=snakemake.input.bin_info)
157 |
158 |
159 | make_html(
160 | div=div,
161 | report_out=snakemake.output.report,
162 | html_template_file=os.path.join(reports_dir, "template_bin_report.html"),
163 | wildcards=snakemake.wildcards,
164 | )
165 |
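
Note (illustration only, not part of the workflow file above): the Quality_score column used in the queries above corresponds to the definition quoted at the top of this excerpt, Completeness - 5 x Contamination. A minimal pandas sketch, assuming a toy bin table with the same column names:

import pandas as pd

# Hypothetical toy bin table using the column names from the script above.
df = pd.DataFrame(
    {
        "Bin Id": ["S1_bin1", "S1_bin2", "S2_bin1"],
        "Completeness": [98.5, 76.0, 92.3],
        "Contamination": [0.8, 9.5, 1.2],
    }
).set_index("Bin Id")

# Quality score as stated in the report text: Completeness - 5 x Contamination.
df["Quality_score"] = df["Completeness"] - 5 * df["Contamination"]

# Counts analogous to the add_stats(...) calls above.
print(df.query("Quality_score > 50").shape[0])                      # 2
print(df.query("Completeness > 90 & Contamination < 5").shape[0])   # 2
print(df.query("Quality_score > 90").shape[0])                      # 1
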
--------------------------------------------------------------------------------
/workflow/report/common_report.py:
--------------------------------------------------------------------------------
1 | import plotly.io as pio
2 | import os, sys
3 |
4 | atlas_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
5 |
6 | reports_dir = os.path.join(atlas_dir, "report")
7 |
8 | sys.path.append(os.path.join(atlas_dir, "scripts"))
9 |
10 |
11 | pio.templates.default = "simple_white"
12 | HTML_PARAMS = dict(
13 | include_plotlyjs=False,
14 | full_html=False,
15 | )
16 |
17 |
18 | ## make html report
19 |
20 |
21 | def make_html(
22 | html_template_file,
23 | report_out,
24 | div,
25 | css_file=os.path.join(reports_dir, "report.css"),
26 | wildcards={},
27 | ):
28 | html_template = open(html_template_file).read()
29 | css_content = open(css_file).read()
30 |
31 | html_string = html_template.format(div=div, css_content=css_content, **wildcards)
32 |
33 | with open(report_out, "w") as outf:
34 | outf.write(html_string)
35 |
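
Usage illustration (not part of the file above): make_html fills placeholders such as {div[Reads]} or {div[2D]} in the HTML templates below via str.format, with plotly figures exported as HTML fragments according to HTML_PARAMS. A hedged sketch, assuming make_html and HTML_PARAMS from the module above and using hypothetical temporary paths and a throwaway template:

import os, tempfile
import plotly.express as px

tmp = tempfile.mkdtemp()
template_file = os.path.join(tmp, "template.html")
css_file = os.path.join(tmp, "style.css")
report_file = os.path.join(tmp, "report.html")

# A minimal template: the {css_content} slot, one wildcard, and one {div[...]} slot.
with open(template_file, "w") as f:
    f.write("<style>{css_content}</style><h1>Bins from {binner}</h1>\n{div[Scatter]}\n")
with open(css_file, "w") as f:
    f.write("body { font-size: 14pt; }")

# Figures are exported as fragments (full_html=False, include_plotlyjs=False),
# matching HTML_PARAMS defined above, and collected in the div dictionary.
div = {"Scatter": px.scatter(x=[1, 2, 3], y=[3, 1, 2]).to_html(**HTML_PARAMS)}

make_html(
    html_template_file=template_file,
    report_out=report_file,
    div=div,
    css_file=css_file,
    wildcards={"binner": "DASTool"},
)
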
--------------------------------------------------------------------------------
/workflow/report/report.css:
--------------------------------------------------------------------------------
1 | /* Overrides of notebook CSS for static HTML export */
2 | body {
3 | overflow: visible;
4 | font-size: 14pt;
5 | padding: 8px;
6 | margin:0 100;
7 | background:whitesmoke;
8 | }
9 |
10 | h1 {
11 | text-align: center
12 | }
13 |
14 | p {
15 | font-size: 14pt;
16 | }
17 |
18 | .float-container {
19 | padding: 2px;
20 | height:100%;
21 | width:100%;
22 | }
23 |
24 | .float-child {
25 | width: 50%;
26 | float: left;
27 | padding: 2px;
28 | }
29 |
30 | @media not print {
31 | #notebook-container {
32 | padding: 15px;
33 | background-color: #fff;
34 | min-height: 0;
35 | -webkit-box-shadow: 0px 0px 12px 1px rgba(87, 87, 87, 0.2);
36 | box-shadow: 0px 0px 12px 1px rgba(87, 87, 87, 0.2);
37 | }
38 | }
39 | @media print {
40 | #notebook-container {
41 | width: 100%;
42 | }
43 | }
--------------------------------------------------------------------------------
/workflow/report/template_QC_report.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 | Quality Control Report
20 |
21 |
22 | Number of reads that went through the quality control process.
23 |
24 | {div[Reads]}
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | Step |
34 | Output |
35 |
36 |
37 |
38 | raw |
39 | the input reads |
40 |
41 | deduplicated |
42 | after (optional) deduplication step |
43 |
44 | filtered |
45 | trimmed, quality filtered |
46 |
47 | qc |
48 | final reads, contaminants removed |
49 |
50 |
51 |
52 |
53 | Total number of reads/bases after QC
54 |
55 |
56 |
57 |
58 | {div[Total_Reads]}
59 |
60 |
61 |
62 | {div[Total_Bases]}
63 |
64 |
65 |
66 |
67 |
68 | Base quality values along reads
69 |
70 | {div[quality_QC]}
71 |
72 | Read length
73 |
74 |
75 | {div[Length]}
76 |
77 | Insert size
78 | The insert size is the length of the two paired reads plus the gap between them. Ideally, the paired-end reads do not overlap.
79 |
80 | {div[Insert]}
81 |
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/workflow/report/template_assembly_report.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 | Assembly Summary
20 |
21 |
22 | Total assembly length
23 |
24 | {div[Total]}
25 |
26 |
27 | Fragmentation
28 |
29 |
30 | N50/N90 is a measure of how fragmented an assembly is:
31 | 50%/90% of the assembly consists of contigs of length N50/N90 or longer.
32 | Equivalently, you need all contigs of length N50/N90 or longer to reach 50%/90% of the total assembly length.
33 |
34 |
35 |
36 |
37 |
38 |
39 | {div[N50]}
40 |
41 |
42 |
43 | {div[N90]}
44 |
45 |
46 |
47 |
48 | Genes / Reads
49 |
50 |
51 |
52 |
53 | {div[N_Predicted_Genes]}
54 |
55 |
56 |
57 | {div[Percent_Assembled_Reads]}
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
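
Side note (illustration only, not part of the template above): the N50/N90 definition given in the Fragmentation section can be computed in a few lines of Python; the contig lengths below are made up.

def nx(contig_lengths, fraction):
    """Length N such that contigs of length >= N make up at least
    `fraction` of the total assembly length (N50 for fraction=0.5)."""
    lengths = sorted(contig_lengths, reverse=True)
    target = fraction * sum(lengths)
    cumulative = 0
    for length in lengths:
        cumulative += length
        if cumulative >= target:
            return length

contigs = [5000, 3000, 2000, 1000, 500, 500]   # total length 12000
print(nx(contigs, 0.5))  # N50 = 3000 (5000 + 3000 >= 6000)
print(nx(contigs, 0.9))  # N90 = 1000 (5000 + 3000 + 2000 + 1000 >= 10800)
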
--------------------------------------------------------------------------------
/workflow/report/template_bin_report.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
20 |
21 | Bin Report for Binner {binner}
22 | Genome completeness and contamination, as well as taxonomy, were estimated using CheckM2.
23 | {div[QualityScore]}
24 | For the full information, see the file {div[input_file]}
25 |
26 | Number of genomes
27 | {div[table]}
28 |
29 | "Good quality" refers to the standard of Completeness > 90% and Contamination < 5%, also called high-quality or near-complete. However, tRNA/rRNA presence is not evaluated. It is less stringent than Quality score > 90.
30 |
31 | Quality for all bins
32 | {div[2D]}
33 |
34 |
35 | Quality for Species representatives
36 | {div[2Dsp]}
37 |
38 |
39 |
40 |
Quality score by Sample
41 |
42 |
43 |
44 | {div[bySample]}
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/workflow/rules/cdhit.smk:
--------------------------------------------------------------------------------
1 | def parse_cd_hit_file(clstr_file):
2 | """
3 |
4 | >Cluster 0
5 | 0 342nt, >S1_83_1... *
6 | 1 342nt, >S2_82_1... at +/100.00%
7 | >Cluster 1
8 | 0 339nt, >S1_61_1... *
9 | 1 339nt, >S2_59_1... at +/100.00%
10 |
11 |
12 | """
13 | import numpy as np
14 |
15 | def parse_line(line):
16 | _, length, name, identity = (
17 | line.strip().replace("...", "\t").replace(", ", "\t").split("\t")
18 | )
19 |
20 | length = int(length.replace("nt", ""))
21 | name = name[1:]
22 | if "*" in identity:
23 | identity = np.nan
24 | else:
25 | identity = float(identity[identity.rfind("/") + 1 : identity.rfind("%")])
26 |
27 | return name, length, identity
28 |
29 | Clusters = []
30 | with open(clstr_file) as f:
31 | for line in f:
32 | if line[0] == ">": # new cluster
33 | cluster = dict(elements=[], representative=None)
34 | Clusters.append(cluster)
35 | else:
36 | name, length, identity = parse_line(line)
37 | cluster["elements"].append((name, length, identity))
38 | if np.isnan(identity):
39 | cluster["representative"] = name
40 | return Clusters
41 |
42 |
43 | def write_cd_hit_clusters(Clusters, file_handle):
44 | for cluster in Clusters:
45 | for element in cluster["elements"]:
46 | file_handle.write(
47 | f"{element[0]}\t{element[1]}\t{element[2]}\t{cluster['representative']}\n"
48 | )
49 |
50 |
51 | localrules:
52 | parse_clstr_files,
53 | rename_gene_clusters,
54 |
55 |
56 | rule cluster_genes:
57 | input:
58 | fna_dir="Genecatalog/all_genes/predicted_genes.fna",
59 | output:
60 | temp("Genecatalog/representatives_of_clusters.fasta"),
61 | temp("Genecatalog/gene_catalog_oldnames.clstr"),
62 | conda:
63 | "%s/cd-hit.yaml" % CONDAENV
64 | log:
65 | "logs/Genecatalog/cluster_genes.log",
66 | threads: config.get("threads", 1)
67 | resources:
68 | mem_mb=config["mem"] * 1000,
69 | params:
70 | coverage=config["genecatalog"]["coverage"],
71 | identity=config["genecatalog"]["minid"],
72 | extra=config["genecatalog"]["extra"],
73 | prefix=lambda wc, output: os.path.splitext(output[1])[0],
74 | shell:
75 | """
76 | cd-hit-est -i {input} -T {threads} \
77 | -M {resources.mem_mb} -o {params.prefix} \
78 | -c {params.identity} -n 9 -d 0 {params.extra} \
79 | -aS {params.coverage} -aL {params.coverage} &> {log}
80 |
81 | mv {params.prefix} {output[0]} 2>> {log}
82 | """
83 |
84 |
85 | rule parse_clstr_files:
86 | input:
87 | clustered_dir="Genecatalog/gene_catalog_oldnames.clstr",
88 | output:
89 | temp("Genecatalog/orf2gene_oldnames.tsv"),
90 | run:
91 | with open(output[0], "w") as fout:
92 | fout.write(f"ORF\tLength\tIdentity\tRepresentative\n")
93 | Clusters = parse_cd_hit_file(input[0])
94 | write_cd_hit_clusters(Clusters, fout)
95 |
96 |
97 | rule generate_orf_info:
98 | input:
99 | cluster_attribution="Genecatalog/orf2gene_oldnames.tsv",
100 | output:
101 | cluster_attribution="Genecatalog/clustering/orf_info.parquet",
102 | rep2genenr="Genecatalog/clustering/representative2genenr.tsv",
103 | threads: 1
104 | run:
105 | import pandas as pd
106 | import numpy as np
107 |
108 | from utils import gene_scripts
109 |
110 | # cd hit format ORF\tLength\tIdentity\tRepresentative\n
111 | orf2gene = pd.read_csv(input.cluster_attribution, sep="\t")
112 |
113 | # rename gene repr to Gene0000XX
114 |
115 | # split orf names in sample, contig_nr, and orf_nr
116 | orf_info = gene_scripts.split_orf_to_index(orf2gene.ORF)
117 |
118 | # rename representative
119 |
120 | representative_names = orf2gene.Representative.unique()
121 |
122 | map_names = pd.Series(
123 | index=representative_names,
124 | data=np.arange(1, len(representative_names) + 1, dtype=np.uint),
125 | )
126 |
127 |
128 | orf_info["GeneNr"] = orf2gene.Representative.map(map_names)
129 |
130 |
131 | orf_info.to_parquet(output.cluster_attribution)
132 |
133 |
134 | # Save name of representatives
135 | map_names.index.name = "Representative"
136 | map_names.name = "GeneNr"
137 | map_names.to_csv(output.rep2genenr, sep="\t")
138 |
--------------------------------------------------------------------------------
/workflow/rules/derep.smk:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | rule run_skani:
5 | input:
6 | paths="Binning/{binner}/filtered_bins_paths.txt",
7 | output:
8 | temp("Intermediate/dereplication/{binner}_distance_matrix.txt"),
9 | log:
10 | "logs/binning/{binner}/dereplication/skani_calculation.log",
11 | resources:
12 | mem_mb=config["mem"] * 1000,
13 | time_min=60 * config["runtime"]["default"],
14 | params:
15 | #preset= "medium", # fast, medium or slow
16 | min_af=config["genome_dereplication"]["overlap"] * 100,
17 | extra="",
18 | threads: config["threads"]
19 | conda:
20 | "../envs/skani.yaml"
21 | shell:
22 | "skani triangle "
23 | " {params.extra} "
24 | " -l {input.paths} "
25 | " -o {output} "
26 | " -t {threads} "
27 | " --sparse --ci "
28 | " --min-af {params.min_af} "
29 | " &> {log} "
30 |
31 |
32 | rule skani_2_parquet:
33 | input:
34 | rules.run_skani.output,
35 | output:
36 | "Binning/{binner}/genome_similarities.parquet",
37 | resources:
38 | mem_mb=config["mem"] * 1000,
39 | time_min=60 * config["runtime"]["simplejob"],
40 | log:
41 | "logs/binning/{binner}/dereplication/skani_2_parquet.log",
42 | threads: 1
43 | run:
44 | try:
45 | skani_column_dtypes = {
46 | "Ref_file": "category",
47 | "Query_file": "category",
48 | "ANI": float,
49 | "Align_fraction_ref": float,
50 | "Align_fraction_query": float,
51 | "ANI_5_percentile": float,
52 | "ANI_95_percentile": float,
53 | } # Ref_name Query_name
54 |
55 | import pandas as pd
56 |
57 |
58 |
59 |
60 |
61 | from utils.io import simplify_path
62 |
63 | df = pd.read_table(
64 | input[0],
65 | usecols=list(skani_column_dtypes.keys()),
66 | dtype=skani_column_dtypes,
67 | )
68 |
69 | df["Ref"] = df.Ref_file.cat.rename_categories(simplify_path)
70 | df["Query"] = df.Query_file.cat.rename_categories(simplify_path)
71 |
72 | df.to_parquet(output[0])
73 |
74 | except Exception as e:
75 | import traceback
76 |
77 | with open(log[0], "w") as logfile:
78 | traceback.print_exc(file=logfile)
79 |
80 | raise e
81 |
82 |
83 | rule cluster_species:
84 | input:
85 | dist="Binning/{binner}/genome_similarities.parquet",
86 | bin_info="Binning/{binner}/filtered_bin_info.tsv",
87 | params:
88 | linkage_method="average",
89 | pre_cluster_threshold=0.925,
90 | threshold=config["genome_dereplication"]["ANI"],
91 | conda:
92 | "../envs/species_clustering.yaml"
93 | log:
94 | "logs/binning/{binner}/dereplication/species_clustering.log",
95 | output:
96 | bin_info="Binning/{binner}/bin_info.tsv",
97 | bins2species="Binning/{binner}/bins2species.tsv",
98 | script:
99 | "../scripts/cluster_species.py"
100 |
101 |
102 | rule build_bin_report:
103 | input:
104 | bin_info="Binning/{binner}/bin_info.tsv",
105 | bins2species="Binning/{binner}/bins2species.tsv",
106 | output:
107 | report="reports/bin_report_{binner}.html",
108 | conda:
109 | "../envs/report.yaml"
110 | log:
111 | "logs/binning/report_{binner}.log",
112 | script:
113 | "../report/bin_report.py"
114 |
--------------------------------------------------------------------------------
/workflow/rules/dram.smk:
--------------------------------------------------------------------------------
1 | DBDIR = config["database_dir"]
2 |
3 |
4 | def get_dram_config(wildcards):
5 | old_dram_path = f"{DBDIR}/Dram"
6 | if Path(old_dram_path).exists():
7 | logger.error(
8 | f"Detected an old database for DRAM in {old_dram_path}. You can delete it."
9 | )
10 |
11 | return config.get("dram_config_file", f"{DBDIR}/DRAM/DRAM.config")
12 |
13 |
14 | localrules:
15 | dram_download,
16 | concat_annotations,
17 |
18 |
19 | rule dram_download:
20 | output:
21 | dbdir=directory(f"{DBDIR}/DRAM/db/"),
22 | config=f"{DBDIR}/DRAM/DRAM.config",
23 | threads: config["threads"]
24 | resources:
25 | mem_mb=config["mem"] * 1000,
26 | time_min=60 * config["runtime"]["default"],
27 | log:
28 | "logs/dram/download_dram.log",
29 | benchmark:
30 | "logs/benchmarks/dram/download_dram.tsv"
31 | conda:
32 | "../envs/dram.yaml"
33 | shell:
34 | " DRAM-setup.py prepare_databases "
35 | " --output_dir {output.dbdir} "
36 | " --threads {threads} "
37 | " --verbose "
38 | " --skip_uniref "
39 | " &> {log} "
40 | " ; "
41 | " DRAM-setup.py export_config --output_file {output.config}"
42 |
43 |
44 | rule DRAM_annotate:
45 | input:
46 | fasta="genomes/genomes/{genome}.fasta",
47 | #checkm= "genomes/checkm/completeness.tsv",
48 | #gtdb_dir= "genomes/taxonomy/gtdb/classify",
49 | config=get_dram_config,
50 | output:
51 | outdir=directory("genomes/annotations/dram/intermediate_files/{genome}"),
52 | threads: config["simplejob_threads"]
53 | resources:
54 | mem_mb=config["simplejob_mem"] * 1000,
55 | time_min=60 * config["runtime"]["default"],
56 | conda:
57 | "../envs/dram.yaml"
58 | params:
59 | extra=config.get("dram_extra", ""),
60 | min_contig_size=config.get("minimum_contig_length", "1000"),
61 | log:
62 | "logs/dram/run_dram/{genome}.log",
63 | benchmark:
64 | "logs/benchmarks/dram/run_dram/{genome}.tsv"
65 | shell:
66 | " DRAM.py annotate "
67 | " --config_loc {input.config} "
68 | " --input_fasta {input.fasta}"
69 | " --output_dir {output.outdir} "
70 | " --threads {threads} "
71 | " --min_contig_size {params.min_contig_size} "
72 | " {params.extra} "
73 | " --verbose &> {log}"
74 | #" --gtdb_taxonomy {input.gtdb_dir}/{params.gtdb_file} "
75 | #" --checkm_quality {input.checkm} "
76 |
77 |
78 | def get_all_dram(wildcards):
79 | all_genomes = get_all_genomes(wildcards)
80 |
81 | return expand(rules.DRAM_annotate.output.outdir, genome=all_genomes)
82 |
83 |
84 | DRAM_ANNOTATON_FILES = ["annotations.tsv"]
85 |
86 |
87 | rule concat_annotations:
88 | input:
89 | get_all_dram,
90 | output:
91 | expand("genomes/annotations/dram/{annotation}", annotation=DRAM_ANNOTATON_FILES),
92 | resources:
93 | time_min=60 * config["runtime"]["default"],
94 | run:
95 | from utils import io
96 |
97 | for i, annotation_file in enumerate(DRAM_ANNOTATON_FILES):
98 | input_files = [
99 | os.path.join(dram_folder, annotation_file) for dram_folder in input
100 | ]
101 |
102 | io.pandas_concat(
103 | input_files, output[i], sep="\t", index_col=0, axis=0, disk_based=True
104 | )
105 |
106 |
107 | rule DRAM_destill:
108 | input:
109 | rules.concat_annotations.output,
110 | config=get_dram_config,
111 | output:
112 | outdir=directory("genomes/annotations/dram/distil"),
113 | threads: 1
114 | resources:
115 | mem_mb=config["simplejob_mem"] * 1000,
116 | time_min=60 * config["runtime"]["simplejob"],
117 | conda:
118 | "../envs/dram.yaml"
119 | log:
120 | "logs/dram/distil.log",
121 | shell:
122 | " DRAM.py distill "
123 | " --config_loc {input.config} "
124 | " --input_file {input[0]}"
125 | " --output_dir {output} "
126 | " &> {log}"
127 |
128 |
129 | rule get_all_modules:
130 | input:
131 | annotations="genomes/annotations/dram/annotations.tsv",
132 | config=get_dram_config,
133 | output:
134 | "genomes/annotations/dram/kegg_modules.tsv",
135 | threads: 1
136 | resources:
137 | mem_mb=config["simplejob_mem"] * 1000,
138 | time_min=60 * config["runtime"]["default"],
139 | conda:
140 | "../envs/dram.yaml"
141 | log:
142 | "logs/dram/get_all_modules.log",
143 | script:
144 | "../scripts/DRAM_get_all_modules.py"
145 |
146 |
147 | rule dram:
148 | input:
149 | "genomes/annotations/dram/distil",
150 | "genomes/annotations/dram/kegg_modules.tsv",
151 |
--------------------------------------------------------------------------------
/workflow/rules/gtdbtk.smk:
--------------------------------------------------------------------------------
1 | gtdb_dir = "genomes/taxonomy/gtdb"
2 |
3 |
4 | rule identify:
5 | input:
6 | flag=rules.extract_gtdb.output,
7 | genes_flag="genomes/annotations/genes/predicted",
8 | output:
9 | directory(f"{gtdb_dir}/identify"),
10 | threads: config["threads"]
11 | conda:
12 | "../envs/gtdbtk.yaml"
13 | log:
14 | "logs/taxonomy/gtdbtk/identify.txt",
15 | f"{gtdb_dir}/gtdbtk.log",
16 | params:
17 | outdir=gtdb_dir,
18 | extension="faa",
19 | gene_dir=lambda wc, input: os.path.abspath(os.path.dirname(input.genes_flag)),
20 | shell:
21 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
22 | "gtdbtk identify "
23 | "--genes --genome_dir {params.gene_dir} "
24 | " --out_dir {params.outdir} "
25 | "--extension {params.extension} "
26 | "--cpus {threads} &> {log[0]}"
27 |
28 |
29 | checkpoint align:
30 | input:
31 | f"{gtdb_dir}/identify",
32 | output:
33 | directory(f"{gtdb_dir}/align"),
34 | threads: config["threads"]
35 | resources:
36 | mem_mb=config["large_mem"] * 1000,
37 | conda:
38 | "../envs/gtdbtk.yaml"
39 | log:
40 | "logs/taxonomy/gtdbtk/align.txt",
41 | f"{gtdb_dir}/gtdbtk.log",
42 | params:
43 | outdir=gtdb_dir,
44 | shell:
45 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
46 | "gtdbtk align --identify_dir {params.outdir} --out_dir {params.outdir} "
47 | "--cpus {threads} &> {log[0]}"
48 |
49 |
50 | rule classify:
51 | input:
52 | rules.align.output,
53 | genome_dir=genome_dir,
54 | output:
55 | directory(f"{gtdb_dir}/classify"),
56 | threads: config["threads"] #pplacer needs much memory for not many threads
57 | resources:
58 | mem_mb=config["large_mem"] * 1000,
59 | time_min=60 * config["runtime"]["long"],
60 | conda:
61 | "../envs/gtdbtk.yaml"
62 | log:
63 | "logs/taxonomy/gtdbtk/classify.txt",
64 | f"{gtdb_dir}/gtdbtk.log",
65 | params:
66 | outdir=gtdb_dir,
67 | extension="fasta",
68 | mashdir=Path(GTDBTK_DATA_PATH) / "mash_db",
69 | shell:
70 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
71 | "gtdbtk classify --genome_dir {input.genome_dir} --align_dir {params.outdir} "
72 | " --mash_db {params.mashdir} "
73 | "--out_dir {params.outdir} "
74 | " --tmpdir {resources.tmpdir} "
75 | "--extension {params.extension} "
76 | "--cpus {threads} &> {log[0]}"
77 |
78 |
79 | rule combine_taxonomy:
80 | input:
81 | folder=f"{gtdb_dir}/classify",
82 | output:
83 | combined=f"{gtdb_dir}/gtdbtk.combined.summary.tsv",
84 | taxonomy="genomes/taxonomy/gtdb_taxonomy.tsv",
85 | log:
86 | "logs/taxonomy/gtdbtk/combine.txt",
87 | script:
88 | "../scripts/combine_taxonomy.py"
89 |
90 |
91 | rule build_tree:
92 | input:
93 | f"{gtdb_dir}/align/{{msa}}.user_msa.fasta.gz",
94 | output:
95 | temp("genomes/taxonomy/gtdb/{msa}.unrooted.tree"),
96 | log:
97 | "logs/genomes/tree/{msa}.log",
98 | "logs/genomes/tree/{msa}.err",
99 | threads: max(config["threads"], 3)
100 | params:
101 | outdir=lambda wc, output: Path(output[0]).parent,
102 | conda:
103 | "../envs/gtdbtk.yaml"
104 | shell:
105 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
106 | "gtdbtk infer --msa_file {input} "
107 | " --out_dir {params.outdir} "
108 | " --prefix {wildcards.msa} "
109 | " --cpus {threads} "
110 | "--tmpdir {resources.tmpdir} > {log[0]} 2> {log[1]}"
111 |
112 |
113 | localrules:
114 | root_tree,
115 |
116 |
117 | rule root_tree:
118 | input:
119 | tree=rules.build_tree.output[0],
120 | wildcard_constraints:
121 | msa="((?!unrooted).)*",
122 | output:
123 | tree="genomes/tree/{msa}.nwk",
124 | conda:
125 | "../envs/tree.yaml"
126 | threads: 1
127 | resources:
128 | mem_mb=config["simplejob_mem"] * 1000,
129 | time_min=60 * config["runtime"]["simplejob"],
130 | log:
131 | "logs/genomes/tree/root_tree_{msa}.log",
132 | script:
133 | "../scripts/root_tree.py"
134 |
135 |
136 | def all_gtdb_trees_input(wildcards):
137 | dir = checkpoints.align.get().output[0]
138 |
139 | domains = glob_wildcards(f"{dir}/gtdbtk.{{domain}}.user_msa.fasta.gz").domain
140 |
141 | return expand("genomes/tree/gtdbtk.{domain}.nwk", domain=domains)
142 |
143 |
144 | rule all_gtdb_trees:
145 | input:
146 | all_gtdb_trees_input,
147 | output:
148 | touch("genomes/tree/finished_gtdb_trees"),
149 |
--------------------------------------------------------------------------------
/workflow/rules/patch.smk:
--------------------------------------------------------------------------------
1 | localrules:
2 | copy_assembly,
3 |
4 |
5 | # Rules that are temporarily useful for updating to a new version of atlas
6 |
7 |
8 | ruleorder: copy_assembly > finalize_contigs
9 |
10 |
11 | rule copy_assembly:
12 | input:
13 | "{sample}/{sample}_contigs.fasta",
14 | output:
15 | "Assembly/fasta/{sample}.fasta",
16 | shell:
17 | "cp {input} {output}"
18 |
--------------------------------------------------------------------------------
/workflow/rules/predict_genes_of_genomes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os, sys
4 | import logging, traceback
5 |
6 | logging.basicConfig(
7 | filename=snakemake.log[0],
8 | level=logging.INFO,
9 | format="%(asctime)s %(message)s",
10 | datefmt="%Y-%m-%d %H:%M:%S",
11 | )
12 |
13 | logging.captureWarnings(True)
14 |
15 |
16 | def handle_exception(exc_type, exc_value, exc_traceback):
17 | if issubclass(exc_type, KeyboardInterrupt):
18 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
19 | return
20 |
21 | logging.error(
22 | "".join(
23 | [
24 | "Uncaught exception: ",
25 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
26 | ]
27 | )
28 | )
29 |
30 |
31 | # Install exception handler
32 | sys.excepthook = handle_exception
33 |
34 | #### Beginning of scripts
35 |
36 | # python 3.5 without f strings
37 |
38 | import os, shutil, sys
39 | import uuid
40 | import itertools
41 | from glob import glob
42 | from snakemake.shell import shell
43 | from snakemake.io import glob_wildcards
44 | from multiprocessing import Pool
45 |
46 |
47 | def predict_genes(genome, fasta, out_dir, log):
48 | fna = "{}/{}.fna".format(out_dir, genome)
49 | faa = "{}/{}.faa".format(out_dir, genome)
50 | gff = "{}/{}.gff".format(out_dir, genome)
51 |
52 | shell('printf "{genome}:\n" > {log}'.format(genome=genome, log=log))
53 | shell(
54 | "prodigal -i {fasta} -o {gff} -d {fna} -a {faa} -p sinlge -c -m -f gff 2>> {log} ".format(
55 | fasta=fasta, log=log, gff=gff, fna=fna, faa=faa
56 | )
57 | )
58 | shell('printf "\n" >> {log}'.format(log=log))
59 |
60 |
61 | def predict_genes_genomes(input_dir, out_dir, log, threads):
62 | genomes_fastas = glob(os.path.join(input_dir, "*.fasta"))
63 |
64 | os.makedirs(out_dir, exist_ok=True)
65 |
66 | temp_log_dir = os.path.join(os.path.dirname(log), "tmp_" + uuid.uuid4().hex)
67 | os.makedirs(temp_log_dir, exist_ok=False)
68 |
69 | genome_names = []
70 | log_names = []
71 | for fasta in genomes_fastas:
72 | genome_name = os.path.splitext(os.path.split(fasta)[-1])[0]
73 | genome_names.append(genome_name)
74 | log_names.append(os.path.join(temp_log_dir, genome_name + ".prodigal.tmp"))
75 |
76 | pool = Pool(threads)
77 | pool.starmap(
78 | predict_genes,
79 | zip(genome_names, genomes_fastas, itertools.repeat(out_dir), log_names),
80 | )
81 |
82 | # cat in python
83 | with open(log, "ab") as f_out:
84 | for logfile in log_names:
85 | with open(logfile, "rb") as f_in:
86 | shutil.copyfileobj(f_in, f_out)
87 |
88 | shell("rm -r {temp_log_dir}".format(temp_log_dir=temp_log_dir))
89 |
90 |
91 | if __name__ == "__main__":
92 | predict_genes_genomes(
93 | snakemake.input.dir,
94 | snakemake.output[0],
95 | snakemake.log[0],
96 | int(snakemake.threads),
97 | )
98 |
--------------------------------------------------------------------------------
/workflow/rules/scg_blank_diamond.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # The MIT License (MIT)
4 | # Copyright (c) 2016 Alexander J Probst
5 |
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
7 |
8 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
9 |
10 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11 |
12 | # https://github.com/AJProbst/sngl_cp_gn
13 |
14 | #1: $search_engine name
15 | #2: $proteins
16 | #3: $DIR\/db/bac.all.faa
17 | #4: $DIR\/db/bac.scg.faa
18 | #5: $DIR\/db/bac.scg.lookup
19 | #6: $threads
20 |
21 | d = ARGV[0]
22 |
23 | input_file = ARGV[1]
24 | output_dir = File.dirname(input_file)
25 |
26 | datab = ARGV[2]
27 | db_all = File.dirname(input_file) + "/all_prot"
28 | puts "database name of all proteins is #{datab}"
29 |
30 | db_name = ARGV[3]
31 | puts "database name of SCGs is #{db_name}"
32 |
33 | db_lookup = ARGV[4]
34 | puts "database lookup is #{db_lookup}"
35 |
36 | threads = ARGV[5]
37 |
38 | #build databases
39 | full_db = system "#{d} makedb --in #{datab} -d #{db_all}.dmnd"
40 | abort "makeblastdb did not work for #{datab}, please check your input file" unless full_db
41 |
42 | # find SCG candidates
43 | puts "finding SCG candidates..."
44 | input_blast_database = system "#{d} makedb --in #{input_file} -d #{input_file}.dmnd"
45 | input_blast_out = File.join(output_dir,File.basename(input_file) + ".findSCG.b6")
46 | abort "makeblastdb did not work for #{input_file}, please check your input file" unless input_blast_database
47 | input_blast_ok = system "#{d} blastp --query #{db_name} --db #{input_file}.dmnd --max-target-seqs 0 --outfmt 6 qseqid sseqid pident length qlen slen evalue bitscore --out #{input_blast_out} --evalue 0.01 --threads #{threads}"
48 | system "rm #{input_file}.dmnd"
49 | abort "blast did not work, please check your input file." unless input_blast_ok
50 |
51 | input_blast_out_whitelist = File.join(output_dir,File.basename(input_file) + ".findSCG.b6.whitelist")
52 | system "awk '{print$2}' #{input_blast_out} | sort -u > #{input_blast_out_whitelist}"
53 | scg_candidates = File.join(output_dir,File.basename(input_file) + ".scg.candidates.faa")
54 | system "pullseq -i #{input_file} -n #{input_blast_out_whitelist} > #{scg_candidates}"
55 | system "rm #{input_blast_out_whitelist}"
56 |
57 | # verify SCGs by blasting against all proteins of all genomes
58 | puts "verifying selected SCGs..."
59 | db_blast_out = File.join(output_dir,File.basename(input_file) + ".all.b6")
60 | db_blast_ok = system "#{d} blastp --query #{scg_candidates} --db #{db_all} --evalue 0.00001 --threads #{threads} --out #{db_blast_out} --outfmt 6 qseqid sseqid pident length qlen slen evalue bitscore --max-target-seqs 1"
61 | abort "verifying blast did not work" unless db_blast_ok
62 | system "rm #{db_all}.dmnd"
63 | puts "starting annotations of single copy cogs..."
64 |
65 | # Read db_lookup
66 | lookup_h = {}
67 | File.open(db_lookup).each do |line|
68 | sbj, annotation = line.chomp.split
69 | lookup_h[sbj]=annotation
70 | end
71 |
72 | # now compare and print
73 | File.open(File.join(output_dir,File.basename(input_file)+".scg"), "w") do |file|
74 | File.open(db_blast_out).each do |line|
75 | next if line =~ /^#/
76 | line.chomp!
77 | temp = line.split(/\t/)
78 | query, sbjct = temp[0], temp[1]
79 | aln_len, sbjct_len = temp[3].to_f, temp[5].to_f # convert to numbers for the length comparison below
80 | if lookup_h[sbjct] && aln_len > (sbjct_len*0.5)
81 | file.puts "#{query.split[0]}\t#{lookup_h[sbjct]}"
82 | end
83 | end
84 | end
85 |
86 | puts "successfully finished"
87 |
--------------------------------------------------------------------------------
/workflow/rules/screen.smk:
--------------------------------------------------------------------------------
1 |
2 | rule generate_sketch:
3 | input:
4 | unpack(get_input_fastq),
5 | output:
6 | "Intermediate/screen/sketches/{sample}.sketch.gz",
7 | log:
8 | "logs/screen/make_sketch/{sample}.log",
9 | conda:
10 | "../envs/required_packages.yaml"
11 | threads: 1
12 | resources:
13 | mem_mb=config["simplejob_mem"] * 1000,
14 | java_mem=int(config["simplejob_mem"] * JAVA_MEM_FRACTION),
15 | shell:
16 | "bbsketch.sh "
17 | "in={input[0]}"
18 | " samplerate=0.5"
19 | " minkeycount=2 "
20 | " out={output} "
21 | " blacklist=nt ssu=f name0={wildcards.sample} depth=t overwrite=t "
22 | " -Xmx{resources.java_mem}g "
23 | " &> {log}"
24 | # only the first read file is used for the sketch
25 |
26 |
27 | rule compare_sketch:
28 | input:
29 | expand(rules.generate_sketch.output, sample=SAMPLES),
30 | output:
31 | "QC/screen/sketch_comparison.tsv.gz",
32 | priority: 100
33 | log:
34 | "logs/screen/compare_sketch.log",
35 | conda:
36 | "../envs/required_packages.yaml"
37 | threads: 1
38 | resources:
39 | mem_mb=config["mem"] * 1000,
40 | java_mem=int(config["mem"] * JAVA_MEM_FRACTION),
41 | shell:
42 | "comparesketch.sh alltoall "
43 | " format=3 out={output} "
44 | " records=5000 "
45 | " {input} "
46 | " -Xmx{resources.java_mem}g "
47 | " &> {log}"
48 |
49 |
50 | # sendsketch.sh sample2.sketch printdepth2=t level=2 printqfname=f printvolume=t color=f out
51 |
--------------------------------------------------------------------------------
/workflow/rules/semibin.smk:
--------------------------------------------------------------------------------
1 |
2 | rule semibin_generate_data_multi:
3 | input:
4 | fasta=rules.combine_contigs.output,
5 | bams=get_bams_of_bingroup,
6 | output:
7 | directory("Intermediate/cobinning/{bingroup}/semibin/data_multi"),
8 | # expand(
9 | # "Cobinning/SemiBin/samples/{sample}/{files}",
10 | # sample=SAMPLES,
11 | # files=["data.csv", "data_split.csv"],
12 | # ),
13 | conda:
14 | "../envs/semibin.yaml"
15 | threads: config["threads"]
16 | resources:
17 | mem_mb=config["mem"] * 1000,
18 | time_min=60 * config["runtime"]["default"],
19 | log:
20 | "logs/semibin/{bingroup}/generate_data_multi.log",
21 | benchmark:
22 | "logs/benchmarks/semibin/{bingroup}/generate_data_multi.tsv"
23 | params:
24 | # output_dir="Cobinning/SemiBin",
25 | separator=config["cobinning_separator"],
26 | shell:
27 | "SemiBin generate_sequence_features_multi"
28 | " --input-fasta {input.fasta} "
29 | " --input-bam {input.bams} "
30 | " --output {output} "
31 | " --threads {threads} "
32 | " --separator {params.separator} "
33 | " 2> {log}"
34 |
35 |
36 | rule semibin_train:
37 | input:
38 | flag=get_assembly,
39 | fasta_sample=rules.filter_contigs.output[0],
40 | bams=get_bams_of_bingroup,
41 | data_folder=rules.semibin_generate_data_multi.output[0],
42 | output:
43 | "Intermediate/cobinning/{bingroup}/semibin/models/{sample}/model.h5",
44 | conda:
45 | "../envs/semibin.yaml"
46 | threads: config["threads"]
47 | resources:
48 | mem_mb=config["mem"] * 1000,
49 | time_min=60 * config["runtime"]["default"],
50 | log:
51 | "logs/semibin/{bingroup}/train/{sample}.log",
52 | benchmark:
53 | "logs/benchmarks/semibin/{bingroup}/train/{sample}.tsv"
54 | params:
55 | output_dir=lambda wc, output: os.path.dirname(output[0]),
56 | data=lambda wc, input: Path(input.data_folder)
57 | / "samples"
58 | / wc.sample
59 | / "data.csv",
60 | data_split=lambda wc, input: Path(input.data_folder)
61 | / "samples"
62 | / wc.sample
63 | / "data_split.csv",
64 | extra=config["semibin_train_extra"],
65 | shell:
66 | "SemiBin train_self "
67 | " --output {params.output_dir} "
68 | " --threads {threads} "
69 | " --data {params.data} "
70 | " --data-split {params.data_split} "
71 | " {params.extra} "
72 | " 2> {log}"
73 |
74 |
75 | def semibin_input(wildcards):
76 | bingroup_of_sample = sampleTable.loc[wildcards.sample, "BinGroup"]
77 | samples_of_bingroup = sampleTable.query(
78 | f'BinGroup=="{bingroup_of_sample}"'
79 | ).index.tolist()
80 |
81 | assert len(samples_of_bingroup) > 1
82 |
83 | mapping = dict(
84 | fasta=rules.filter_contigs.output[0].format(**wildcards),
85 | bams=expand(
86 | "Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam",
87 | sample=samples_of_bingroup,
88 | bingroup=bingroup_of_sample,
89 | ),
90 | data_folder=rules.semibin_generate_data_multi.output[0].format(
91 | bingroup=bingroup_of_sample, **wildcards
92 | ),
93 | model=rules.semibin_train.output[0].format(
94 | bingroup=bingroup_of_sample, **wildcards
95 | ),
96 | )
97 |
98 | return mapping
99 |
100 |
101 | rule run_semibin:
102 | input:
103 | unpack(semibin_input),
104 | output:
105 | # the output path contains no information about the bingroup
106 | directory(
107 | "Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/"
108 | ),
109 | conda:
110 | "../envs/semibin.yaml"
111 | threads: config["threads"]
112 | resources:
113 | mem_mb=config["mem"] * 1000,
114 | time_min=60 * config["runtime"]["default"],
115 | log:
116 | "logs/semibin/bin/{sample}.log",
117 | benchmark:
118 | "logs/benchmarks/semibin/bin/{sample}.tsv"
119 | params:
120 | output_dir=lambda wc, output: os.path.dirname(output[0]),
121 | data=lambda wc, input: Path(input.data_folder)
122 | / "samples"
123 | / wc.sample
124 | / "data.csv",
125 | min_bin_kbs=int(config["cobining_min_bin_size"] / 1000),
126 | extra=config["semibin_options"],
127 | shell:
128 | "SemiBin bin "
129 | " --input-fasta {input.fasta} "
130 | " --output {params.output_dir} "
131 | " --threads {threads} "
132 | " --data {params.data} "
133 | " --model {input.model} "
134 | " --minfasta-kbs {params.min_bin_kbs}"
135 | " {params.extra} "
136 | " 2> {log}"
137 |
138 |
139 | localrules:
140 | parse_semibin_output,
141 |
142 |
143 | ruleorder: parse_semibin_output > get_unique_cluster_attribution
144 |
145 |
146 | rule parse_semibin_output:
147 | input:
148 | rules.run_semibin.output[0],
149 | output:
150 | "{sample}/binning/SemiBin/cluster_attribution.tsv",
151 | conda:
152 | "../envs/semibin.yaml"
153 | log:
154 | "logs/semibin/parse_output/{sample}.log",
155 | params:
156 | extension=".fa",
157 | script:
158 | "../scripts/parse_semibin.py"
159 |
160 |
161 | rule semibin:
162 | input:
163 | expand("{sample}/binning/SemiBin/cluster_attribution.tsv", sample=SAMPLES),
164 |
--------------------------------------------------------------------------------
/workflow/rules/sra.smk:
--------------------------------------------------------------------------------
1 | wildcard_constraints:
2 | sra_run="[SED]RR[0-9]+",
3 |
4 |
5 | localrules:
6 | prefetch,
7 |
8 |
9 | SRA_read_fractions = ["_1", "_2"] if PAIRED_END else [""]
10 | SRA_SUBDIR_RUN = "SRA/Runs"
11 |
12 |
13 | rule prefetch:
14 | output:
15 | sra=temp(touch(SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}_downloaded")),
16 | # not giving the sra file as output allows resuming an interrupted download
17 | params:
18 | outdir=SRA_SUBDIR_RUN, # prefetch creates file in subfolder with run name automatically
19 | log:
20 | "logs/SRAdownload/prefetch/{sra_run}.log",
21 | benchmark:
22 | "logs/benchmarks/SRAdownload/prefetch/{sra_run}.tsv"
23 | threads: 1
24 | resources:
25 | mem_mb=1000,
26 | time_min=60 * int(config["runtime"]["simplejob"]),
27 | internet_connection=1,
28 | conda:
29 | "%s/sra.yaml" % CONDAENV
30 | shell:
31 | " mkdir -p {params.outdir} 2> {log} "
32 | " ; "
33 | " prefetch "
34 | " --output-directory {params.outdir} "
35 | " -X 999999999 "
36 | " --progress "
37 | " --log-level info "
38 | " {wildcards.sra_run} &>> {log} "
39 | " ; "
40 | " vdb-validate {params.outdir}/{wildcards.sra_run}/{wildcards.sra_run}.sra &>> {log} "
41 |
42 |
43 | rule extract_run:
44 | input:
45 | flag=rules.prefetch.output,
46 | output:
47 | temp(
48 | expand(
49 | SRA_SUBDIR_RUN + "/{{sra_run}}/{{sra_run}}{fraction}.fastq.gz",
50 | fraction=SRA_read_fractions,
51 | )
52 | ),
53 | params:
54 | outdir=os.path.abspath(SRA_SUBDIR_RUN + "/{sra_run}"),
55 | sra_file=SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}.sra",
56 | log:
57 | "logs/SRAdownload/extract/{sra_run}.log",
58 | benchmark:
59 | "logs/benchmarks/SRAdownload/fasterqdump/{sra_run}.tsv"
60 | threads: config["simplejob_threads"]
61 | resources:
62 | time_min=60 * int(config["runtime"]["simplejob"]),
63 | mem_mb=1000, #default 100Mb
64 | conda:
65 | "%s/sra.yaml" % CONDAENV
66 | shell:
67 | " vdb-validate {params.sra_file} &>> {log} "
68 | " ; "
69 | " parallel-fastq-dump "
70 | " --threads {threads} "
71 | " --gzip --split-files "
72 | " --outdir {params.outdir} "
73 | " --tmpdir {resources.tmpdir} "
74 | " --skip-technical --split-3 "
75 | " -s {params.sra_file} &>> {log} "
76 | " ; "
77 | " rm -f {params.sra_file} 2>> {log} "
78 |
79 |
80 | RunTable = None
81 |
82 |
83 | def get_runids_for_biosample(wildcards):
84 | global RunTable
85 | if RunTable is None:
86 | from atlas.init.parse_sra import load_and_validate_runinfo_table
87 |
88 | RunTable = load_and_validate_runinfo_table("RunInfo.tsv")
89 |
90 | run_ids = RunTable.query(f"BioSample == '{wildcards.sample}'").index.tolist()
91 |
92 | return run_ids
93 |
94 |
95 | def get_runs_for_biosample(wildcards):
96 | run_ids = get_runids_for_biosample(wildcards)
97 |
98 | ReadFiles = {}
99 | for fraction in SRA_read_fractions:
100 | if fraction == "":
101 | key = "se"
102 | else:
103 | key = fraction
104 |
105 | ReadFiles[key] = expand(
106 | SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}{fraction}.fastq.gz",
107 | fraction=fraction,
108 | sra_run=run_ids,
109 | )
110 |
111 | return ReadFiles
112 |
113 |
114 | rule merge_runs_to_sample:
115 | input:
116 | unpack(get_runs_for_biosample),
117 | output:
118 | expand(
119 | "SRA/Samples/{{sample}}/{{sample}}{fraction}.fastq.gz",
120 | fraction=SRA_read_fractions,
121 | ),
122 | threads: 1
123 | run:
124 | from utils import io
125 |
126 | for i, fraction in enumerate(SRA_read_fractions):
127 | if fraction == "":
128 | fraction = "se"
129 | io.cat_files(input[fraction], output[i])
130 |
131 |
132 | rule download_sra:
133 | input:
134 | expand(
135 | "SRA/Samples/{sample}/{sample}{fraction}.fastq.gz",
136 | fraction=SRA_read_fractions,
137 | sample=SAMPLES,
138 | ),
139 |
--------------------------------------------------------------------------------
/workflow/rules/strains.smk:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | rule instrain_profile:
5 | input:
6 | bam="genomes/alignments/bams/{sample}.bam",
7 | genomes="genomes/all_contigs.fasta",
8 | # genes=lambda wc: get_all_genes(wc, extension=".fna"),
9 | scaffold_to_genome="genomes/clustering/contig2genome.tsv",
10 | output:
11 | directory("strains/intermediate_files/{sample}"),
12 | threads: config["threads"]
13 | params:
14 | extra=config.get("instrain_profile_extra", ""),
15 | log:
16 | "logs/strains/profile/{sample}.log",
17 | conda:
18 | "../envs/instrain.yaml"
19 | benchmark:
20 | "logs/benchmarks/strains/profile/{sample}.tsv"
21 | resources:
22 | mem_mb=config["mem"] * 1000,
23 | time_min=60 * config["runtime"]["long"],
24 | shell:
25 | #" cat {input.genes} > {resources.tmpdir}/all_genome_genes.fna 2> {log} "
26 | #" ; "
27 | "inStrain profile "
28 | " {input.bam} {input.genomes} "
29 | " -o {output} "
30 | " -p {threads} "
31 |
32 | " -s {input.scaffold_to_genome} "
33 | " --database_mode "
34 | " {params.extra} &>> {log}"
35 | #" -g {resources.tmpdir}/all_genome_genes.fna "
36 |
37 |
38 | rule instrain_compare:
39 | input:
40 | profiles=expand("strains/intermediate_files/{sample}", sample=SAMPLES),
41 | scaffold_to_genome="genomes/clustering/contig2genome.tsv",
42 | output:
43 | directory("strains/comparison"),
44 | threads: config["threads"]
45 | params:
46 | extra=config.get("instrain_compare_extra", ""),
47 | log:
48 | "logs/strains/compare.log",
49 | conda:
50 | "../envs/instrain.yaml"
51 | benchmark:
52 | "logs/benchmarks/strains/compare.tsv"
53 | resources:
54 | mem_mb=config["mem"] * 1000,
55 | time_min=60 * config["runtime"]["long"],
56 | shell:
57 | "inStrain compare "
58 | " --input {input.profiles} "
59 | " -o {output} "
60 | " -p {threads} "
61 | " -s {input.scaffold_to_genome} "
62 | " --database_mode "
63 | " {params.extra} &> {log}"
64 |
65 |
66 | # usage: inStrain compare -i [INPUT [INPUT ...]] [-o OUTPUT] [-p PROCESSES] [-d]
67 | # [-h] [--version] [-s [STB [STB ...]]] [-c MIN_COV]
68 | # [-f MIN_FREQ] [-fdr FDR] [--database_mode]
69 | # [--breadth BREADTH] [-sc SCAFFOLDS] [--genome GENOME]
70 | # [--store_coverage_overlap]
71 | # [--store_mismatch_locations]
72 | # [--include_self_comparisons] [--skip_plot_generation]
73 | # [--group_length GROUP_LENGTH] [--force_compress]
74 | # [-ani ANI_THRESHOLD] [-cov COVERAGE_TRESHOLD]
75 | # [--clusterAlg {ward,single,complete,average,weighted,median,centroid}]
76 |
--------------------------------------------------------------------------------
/workflow/scripts/DRAM_get_all_modules.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 |
4 | import sys, os
5 | import logging, traceback
6 |
7 | logging.basicConfig(
8 | filename=snakemake.log[0],
9 | level=logging.INFO,
10 | format="%(asctime)s %(message)s",
11 | datefmt="%Y-%m-%d %H:%M:%S",
12 | )
13 |
14 |
15 | def handle_exception(exc_type, exc_value, exc_traceback):
16 | if issubclass(exc_type, KeyboardInterrupt):
17 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
18 | return
19 |
20 | logging.error(
21 | "".join(
22 | [
23 | "Uncaught exception: ",
24 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
25 | ]
26 | )
27 | )
28 |
29 |
30 | # Install exception handler
31 | sys.excepthook = handle_exception
32 |
33 |
34 | import pandas as pd
35 |
36 | annotation_file = snakemake.input.annotations
37 | module_output_table = snakemake.output[0]
38 |
39 | from mag_annotator.database_handler import DatabaseHandler
40 | from mag_annotator.summarize_genomes import build_module_net, make_module_coverage_frame
41 |
42 | annotations = pd.read_csv(annotation_file, sep="\t", index_col=0)
43 |
44 |
45 | # get db_locs and read in dbs
46 | database_handler = DatabaseHandler(logger=logging, config_loc=snakemake.input.config)
47 |
48 |
49 | if "module_step_form" not in database_handler.config["dram_sheets"]:
50 | raise ValueError(
51 | "Module step form location must be set in order to summarize genomes"
52 | )
53 |
54 | module_steps_form = pd.read_csv(
55 | database_handler.config["dram_sheets"]["module_step_form"], sep="\t"
56 | )
57 |
58 | all_module_nets = {
59 | module: build_module_net(module_df)
60 | for module, module_df in module_steps_form.groupby("module")
61 | }
62 |
63 | module_coverage_frame = make_module_coverage_frame(
64 | annotations, all_module_nets, groupby_column="fasta"
65 | )
66 |
67 | module_coverage_frame.to_csv(module_output_table, sep="\t")
68 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_busco.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 | logging.captureWarnings(True)
12 |
13 |
14 | def handle_exception(exc_type, exc_value, exc_traceback):
15 | if issubclass(exc_type, KeyboardInterrupt):
16 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
17 | return
18 |
19 | logging.error(
20 | "".join(
21 | [
22 | "Uncaught exception: ",
23 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
24 | ]
25 | )
26 | )
27 |
28 |
29 | # Install exception handler
30 | sys.excepthook = handle_exception
31 |
32 | #### Beginning of scripts
33 |
34 | import pandas as pd
35 | from utils.parsers import read_busco_output
36 |
37 |
38 | def main(samples, completeness_files, bin_table):
39 | sample_data = {}
40 | div = {}
41 |
42 | df = pd.DataFrame()
43 |
44 | for i, sample in enumerate(samples):
45 | sample_data = read_busco_output(completeness_files[i])
46 | sample_data["Sample"] = sample
47 |
48 | df = df.append(sample_data)
49 |
50 | # remove missing
51 |
52 | failed_genomes = df.index[df.Dataset.str.lower().str.contains("run failed")]
53 |
54 | if len(failed_genomes) > 0:
55 | logging.warning(
56 | "Following genomes didn't pass BUSCO. I ignore them, because "
57 | "I think theas means they are too bad to be quantified:\n"
58 | f"{failed_genomes}"
59 | )
60 |
61 | df.loc[failed_genomes, ["Completeness", "Contamination", "Quality_score"]] = 0
62 |
63 | df.to_csv(bin_table, sep="\t")
64 |
65 |
66 | if __name__ == "__main__":
67 | main(
68 | samples=snakemake.params.samples,
69 | completeness_files=snakemake.input.completeness_files,
70 | bin_table=snakemake.output.bin_table,
71 | )
72 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_checkm.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 | logging.captureWarnings(True)
12 |
13 |
14 | def handle_exception(exc_type, exc_value, exc_traceback):
15 | if issubclass(exc_type, KeyboardInterrupt):
16 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
17 | return
18 |
19 | logging.error(
20 | "".join(
21 | [
22 | "Uncaught exception: ",
23 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
24 | ]
25 | )
26 | )
27 |
28 |
29 | # Install exception handler
30 | sys.excepthook = handle_exception
31 |
32 | #### Beginning of scripts
33 |
34 | import pandas as pd
35 | from utils.parsers import read_checkm_output
36 |
37 |
38 | def main(samples, completeness_files, taxonomy_files, bin_table):
39 | sample_data = {}
40 | div = {}
41 |
42 | df = pd.DataFrame()
43 |
44 | for i, sample in enumerate(samples):
45 | sample_data = read_checkm_output(
46 | taxonomy_table=taxonomy_files[i], completness_table=completeness_files[i]
47 | )
48 | sample_data["Sample"] = sample
49 |
50 | df = df.append(sample_data)
51 |
52 | df.to_csv(bin_table, sep="\t")
53 |
54 |
55 | if __name__ == "__main__":
56 | main(
57 | samples=snakemake.params.samples,
58 | taxonomy_files=snakemake.input.taxonomy_files,
59 | completeness_files=snakemake.input.completeness_files,
60 | bin_table=snakemake.output.bin_table,
61 | )
62 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_checkm2.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 | logging.captureWarnings(True)
12 |
13 |
14 | def handle_exception(exc_type, exc_value, exc_traceback):
15 | if issubclass(exc_type, KeyboardInterrupt):
16 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
17 | return
18 |
19 | logging.error(
20 | "".join(
21 | [
22 | "Uncaught exception: ",
23 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
24 | ]
25 | )
26 | )
27 |
28 |
29 | # Install exception handler
30 | sys.excepthook = handle_exception
31 |
32 | #### Beginning of scripts
33 |
34 | import pandas as pd
35 | from utils.parsers import read_checkm2_output
36 |
37 |
38 | def main(samples, completeness_files, bin_table):
39 | sample_data = {}
40 | div = {}
41 |
42 | df_list = []
43 |
44 | for i, sample in enumerate(samples):
45 | sample_data = read_checkm2_output(completness_table=completeness_files[i])
46 | sample_data["Sample"] = sample
47 |
48 | df_list.append(sample_data)
49 |
50 | df = pd.concat(df_list, axis=0)
51 |
52 | df.to_csv(bin_table, sep="\t")
53 |
54 |
55 | if __name__ == "__main__":
56 | main(
57 | samples=snakemake.params.samples,
58 | completeness_files=snakemake.input.completeness_files,
59 | bin_table=snakemake.output.bin_table,
60 | )
61 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_contig_stats.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 |
12 | def handle_exception(exc_type, exc_value, exc_traceback):
13 | if issubclass(exc_type, KeyboardInterrupt):
14 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
15 | return
16 |
17 | logging.error(
18 | "".join(
19 | [
20 | "Uncaught exception: ",
21 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
22 | ]
23 | )
24 | )
25 |
26 |
27 | # Install exception handler
28 | sys.excepthook = handle_exception
29 |
30 |
31 | import pandas as pd
32 | from utils.parsers_bbmap import parse_pileup_log_file
33 |
34 |
35 | def parse_map_stats(sample_data, out_tsv):
36 | sample_stats = {}
37 | for sample in sample_data.keys():
38 | df = pd.read_csv(sample_data[sample]["contig_stats"], sep="\t")
39 |
40 | assert df.shape[0] == 1, "Assumed only one row in file {}; found {}".format(
41 | sample_data[sample]["contig_stats"], df.iloc[0]
42 | )
43 |
44 | # n genes
45 | genes_df = pd.read_csv(sample_data[sample]["gene_table"], index_col=0, sep="\t")
46 | df["N_Predicted_Genes"] = genes_df.shape[0]
47 |
48 | # mapping stats
49 | mapping_stats = parse_pileup_log_file(sample_data[sample]["mapping_log"])
50 | df["Assembled_Reads"] = mapping_stats["Mapped reads"]
51 | df["Percent_Assembled_Reads"] = mapping_stats["Percent mapped"]
52 |
53 | logging.info(f"Stats for sample {sample}\n{df}")
54 |
55 | sample_stats[sample] = df
56 |
57 | stats_df = pd.concat(sample_stats, axis=0)
58 | stats_df.index = stats_df.index.get_level_values(0)
59 | # remove contig stats and keep only scaffold stats
60 | stats_df = stats_df.loc[:, ~stats_df.columns.str.startswith("scaf_")]
61 | stats_df.columns = stats_df.columns.str.replace("ctg_", "")
62 | # save
63 | stats_df.to_csv(out_tsv, sep="\t")
64 | return stats_df
65 |
66 |
67 | def main(samples, contig_stats, gene_tables, mapping_logs, combined_stats):
68 | sample_data = {}
69 | for sample in samples:
70 | sample_data[sample] = {}
71 | for c_stat in contig_stats:
72 | # underscore version was for simplified local testing
73 | # if "%s_" % sample in c_stat:
74 | if "%s/" % sample in c_stat:
75 | sample_data[sample]["contig_stats"] = c_stat
76 | for g_table in gene_tables:
77 | # if "%s_" % sample in g_table:
78 | if "%s/" % sample in g_table:
79 | sample_data[sample]["gene_table"] = g_table
80 | for mapping_log in mapping_logs:
81 | # if "%s_" % sample in mapping_log:
82 | if "%s/" % sample in mapping_log:
83 | sample_data[sample]["mapping_log"] = mapping_log
84 |
85 | parse_map_stats(sample_data, combined_stats)
86 |
87 |
88 | if __name__ == "__main__":
89 | main(
90 | samples=snakemake.params.samples,
91 | contig_stats=snakemake.input.contig_stats,
92 | gene_tables=snakemake.input.gene_tables,
93 | mapping_logs=snakemake.input.mapping_logs,
94 | combined_stats=snakemake.output.combined_contig_stats,
95 | )
96 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_coverage_MAGs.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 |
12 | def handle_exception(exc_type, exc_value, exc_traceback):
13 | if issubclass(exc_type, KeyboardInterrupt):
14 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
15 | return
16 |
17 | logging.error(
18 | "".join(
19 | [
20 | "Uncaught exception: ",
21 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
22 | ]
23 | )
24 | )
25 |
26 |
27 | # Install exception handler
28 | sys.excepthook = handle_exception
29 |
30 |
31 | import pandas as pd
32 | import os, gc
33 | from utils.parsers_bbmap import read_coverage_binned, combine_coverages
34 |
35 |
36 | contig2genome = pd.read_csv(
37 | snakemake.input.contig2genome, header=None, index_col=0, sep="\t"
38 | ).iloc[:, 0]
39 |
40 |
41 | # sum counts
42 | logging.info("Loading counts and coverage per contig")
43 |
44 | combined_cov, Counts_contigs = combine_coverages(
45 | snakemake.input.coverage_files, snakemake.params.samples
46 | )
47 |
48 | combined_cov = combined_cov.T
49 |
50 | combined_cov.insert(
51 | 0, "Genome", value=pd.Categorical(contig2genome.loc[combined_cov.index].values)
52 | )
53 |
54 | logging.info(f"Saving coverage to {snakemake.output.coverage_contigs}")
55 |
56 | combined_cov.reset_index().to_parquet(snakemake.output.coverage_contigs)
57 |
58 | logging.info("Sum counts per genome")
59 |
60 | Counts_genome = Counts_contigs.groupby(contig2genome, axis=1).sum().T
61 | Counts_genome.index.name = "Sample"
62 |
63 | logging.info(f"Saving counts to {snakemake.output.counts}")
64 |
65 | Counts_genome.reset_index().to_parquet(snakemake.output.counts)
66 | del Counts_genome, combined_cov, Counts_contigs
67 | gc.collect()
68 |
69 | # Binned coverage
70 | logging.info("Loading binned coverage")
71 | binCov = {}
72 | for i, cov_file in enumerate(snakemake.input.binned_coverage_files):
73 | sample = snakemake.params.samples[i]
74 |
75 | binCov[sample] = read_coverage_binned(cov_file)
76 |
77 | binCov = pd.DataFrame.from_dict(binCov)
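   | # one column per sample; the index comes from read_coverage_binned and is assumed to have
   | # the contig name as its first level (it is mapped to genomes below)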
78 |
79 | logging.info("Add genome information to it")
80 | binCov.insert(
81 | 0,
82 | "Genome",
83 | value=pd.Categorical(contig2genome.loc[binCov.index.get_level_values(0)].values),
84 | )
85 |
86 | gc.collect()
87 | logging.info(f"Saving combined binCov to {snakemake.output.binned_cov}")
88 | binCov.reset_index().to_parquet(snakemake.output.binned_cov)
89 |
90 | # Median coverage
91 | logging.info("Calculate median coverage")
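   | # per genome and sample: the median over all of that genome's coverage bins,
   | # transposed so rows are samples and columns are genomes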
92 | Median_abund = binCov.groupby("Genome").median().T
93 | del binCov
94 | gc.collect()
95 | logging.info(f"Saving mediuan coverage {snakemake.output.median_abund}")
96 | Median_abund.reset_index().to_parquet(snakemake.output.median_abund)
97 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_dram_gene_annotations.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 |
12 | def handle_exception(exc_type, exc_value, exc_traceback):
13 | if issubclass(exc_type, KeyboardInterrupt):
14 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
15 | return
16 |
17 | logging.error(
18 | "".join(
19 | [
20 | "Uncaught exception: ",
21 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
22 | ]
23 | )
24 | )
25 |
26 |
27 | # Install exception handler
28 | sys.excepthook = handle_exception
29 |
30 |
31 | from pathlib import Path
32 | import numpy as np
33 | import pandas as pd
34 | from collections import defaultdict
35 |
36 | db_columns = {
37 | "kegg": ["ko_id", "kegg_hit"],
38 | "peptidase": [
39 | "peptidase_id",
40 | "peptidase_family",
41 | "peptidase_hit",
42 | "peptidase_RBH",
43 | "peptidase_identity",
44 | "peptidase_bitScore",
45 | "peptidase_eVal",
46 | ],
47 | "pfam": ["pfam_hits"],
48 | "cazy": ["cazy_ids", "cazy_hits", "cazy_subfam_ec", "cazy_best_hit"],
49 | # "heme": ["heme_regulatory_motif_count"],
50 | }
51 |
52 | Tables = defaultdict(list)
53 |
54 | for file in snakemake.input:
55 | df = pd.read_csv(file, index_col=0, sep="\t")
56 |
57 | # drop un-annotated genes
58 | df = df.query("rank!='E'")
59 |
60 |     # convert the index from e.g. 'subset1_Gene111'
61 |     # to the integer gene number 111
62 | df.index = (
63 | df.index.str.split("_", n=1, expand=True)
64 | .get_level_values(1)
65 | .str[len("Gene") :]
66 | .astype(np.int64)
67 | )
68 | df.index.name = "GeneNr"
69 |
70 | # select columns, drop na rows and append to list
71 | for db in db_columns:
72 | cols = db_columns[db]
73 |
74 | if not df.columns.intersection(cols).empty:
75 | Tables[db].append(df[cols].dropna(axis=0, how="all"))
76 |
77 | del df
78 |
79 | out_dir = Path(snakemake.output[0])
80 | out_dir.mkdir()
81 |
82 | for db in Tables:
83 | combined = pd.concat(Tables[db], axis=0)
84 |
85 | combined.sort_index(inplace=True)
86 |
87 | combined.reset_index().to_parquet(out_dir / (db + ".parquet"))
88 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_gene_coverages.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os, sys
3 | import logging, traceback
4 |
5 | logging.basicConfig(
6 | filename=snakemake.log[0],
7 | level=logging.INFO,
8 | format="%(asctime)s %(message)s",
9 | datefmt="%Y-%m-%d %H:%M:%S",
10 | )
11 |
12 |
13 | def handle_exception(exc_type, exc_value, exc_traceback):
14 | if issubclass(exc_type, KeyboardInterrupt):
15 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
16 | return
17 |
18 | logging.error(
19 | "".join(
20 | [
21 | "Uncaught exception: ",
22 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
23 | ]
24 | )
25 | )
26 |
27 |
28 | # Install exception handler
29 | sys.excepthook = handle_exception
30 |
31 | #### Beginning of script
32 | import numpy as np
33 | import pandas as pd
34 | import gc, os
35 |
36 |
37 | import h5py
38 |
39 | import psutil
42 |
43 |
44 | def measure_memory(write_log_entry=True):
45 |     mem_usage = psutil.Process().memory_info().rss / (1024 * 1024)
46 |
47 |     if write_log_entry:
48 |         logging.info(f"The process is currently using {mem_usage: 7.0f} MB of RAM")
49 |
50 |     return mem_usage
51 |
52 |
53 | logging.info("Start")
54 | measure_memory()
55 |
56 | N_samples = len(snakemake.input.covstats)
57 |
58 | logging.info("Read gene info")
59 |
60 | gene_info = pd.read_table(snakemake.input.info)
61 |
62 | # Gene name is the part of the '#Name' column before the first space
63 | gene_info.index = gene_info["#Name"].str.split(" ", n=1, expand=True)[0]
64 | gene_info.index.name = "GeneName"
65 | gene_info.drop("#Name", axis=1, inplace=True)
66 |
67 | # sort by gene name so rows align with the per-sample coverage tables
68 | gene_info.sort_index(inplace=True)
69 | N_genes = gene_info.shape[0]
74 |
75 | gene_info[
76 | ["Samples_nz_coverage", "Samples_nz_counts", "Sum_coverage", "Max_coverage"]
77 | ] = 0
78 |
79 |
80 | # gene_list= gene_info.index
81 |
82 |
83 | logging.info("Open hdf files for writing")
84 |
85 | gene_matrix_shape = (N_samples, N_genes)
86 |
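   | # coverage and counts are stored as dense (samples x genes) gzip-compressed HDF5 datasets,
   | # filled one sample (row) at a time to keep memory usage bounded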
87 | with h5py.File(snakemake.output.cov, "w") as hdf_cov_file, h5py.File(
88 | snakemake.output.counts, "w"
89 | ) as hdf_counts_file:
90 | combined_cov = hdf_cov_file.create_dataset(
91 | "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip"
92 | )
93 | combined_counts = hdf_counts_file.create_dataset(
94 | "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip"
95 | )
96 |
97 |     # add sample names attribute
98 | sample_names = np.array(list(snakemake.params.samples)).astype("S")
99 | combined_cov.attrs["sample_names"] = sample_names
100 | combined_counts.attrs["sample_names"] = sample_names
101 |
102 | gc.collect()
103 |
104 | Summary = {}
105 |
106 | logging.info("Start reading files")
107 | initial_mem_uage = measure_memory()
108 |
109 | for i, sample in enumerate(snakemake.params.samples):
110 | logging.info(f"Read coverage file for sample {i+1} / {N_samples}")
111 | sample_cov_file = snakemake.input.covstats[i]
112 |
113 | data = pd.read_parquet(
114 | sample_cov_file, columns=["GeneName", "Reads", "Median_fold"]
115 | ).set_index("GeneName")
116 |
117 | assert (
118 | data.shape[0] == N_genes
119 | ), f"I only have {data.shape[0]} /{N_genes} in the file {sample_cov_file}"
120 |
121 |         # genes must already be sorted by index so values align across samples
122 | assert (
123 | data.index.is_monotonic_increasing
124 | ), f"data is not sorted by index in {sample_cov_file}"
125 |
126 | # downcast data
127 | # median is int
128 | Median_fold = pd.to_numeric(data.Median_fold, downcast="integer")
129 | Reads = pd.to_numeric(data.Reads, downcast="integer")
130 |
131 |         # delete intermediate data and release memory
132 | del data
133 |
134 | # get summary statistics per sample
135 | logging.debug("Extract Summary statistics")
136 |
137 | Summary[sample] = {
138 | "Sum_coverage": Median_fold.sum(),
139 | "Total_counts": Reads.sum(),
140 | "Genes_nz_counts": (Reads > 0).sum(),
141 | "Genes_nz_coverage": (Median_fold > 0).sum(),
142 | }
143 |
144 | # get gene wise stats
145 | gene_info["Samples_nz_counts"] += (Reads > 0) * 1
146 | gene_info["Samples_nz_coverage"] += (Median_fold > 0) * 1
147 | gene_info["Sum_coverage"] += Median_fold
148 |
149 | gene_info["Max_coverage"] = np.fmax(gene_info["Max_coverage"], Median_fold)
150 |
151 | combined_cov[i, :] = Median_fold.values
152 | combined_counts[i, :] = Reads.values
153 |
154 | del Median_fold, Reads
155 | gc.collect()
156 |
157 | current_mem_uage = measure_memory()
158 |
159 |
160 | logging.info("All samples processed")
161 | gc.collect()
162 |
163 | logging.info("Save sample Summary")
164 | pd.DataFrame(Summary).T.to_csv(snakemake.output.sample_info, sep="\t")
165 |
166 |
167 | logging.info("Save gene Summary")
168 |
169 | # downcast
170 | for col in gene_info.columns:
171 | if col == "GC":
172 | gene_info[col] = pd.to_numeric(gene_info[col], downcast="float")
173 | else:
174 | gene_info[col] = pd.to_numeric(gene_info[col], downcast="integer")
175 |
176 | gene_info.reset_index().to_parquet(snakemake.output.gene_info)
177 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_taxonomy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os, sys
3 | import logging, traceback
4 |
5 | logging.basicConfig(
6 | filename=snakemake.log[0],
7 | level=logging.INFO,
8 | format="%(asctime)s %(message)s",
9 | datefmt="%Y-%m-%d %H:%M:%S",
10 | )
11 |
12 |
13 | def handle_exception(exc_type, exc_value, exc_traceback):
14 | if issubclass(exc_type, KeyboardInterrupt):
15 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
16 | return
17 |
18 | logging.error(
19 | "".join(
20 | [
21 | "Uncaught exception: ",
22 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
23 | ]
24 | )
25 | )
26 |
27 |
28 | # Install exception handler
29 | sys.excepthook = handle_exception
30 |
31 | #### Beginning of script
32 |
33 | import pandas as pd
34 | import numpy as np
35 | from utils.taxonomy import tax2table
36 |
37 | from glob import glob
38 |
39 | gtdb_classify_folder = snakemake.input.folder
40 |
41 | taxonomy_files = glob(f"{gtdb_classify_folder}/gtdbtk.*.summary.tsv")
42 |
43 | N_taxonomy_files = len(taxonomy_files)
44 | logging.info(f"Found {N_taxonomy_files} gtdb taxonomy files.")
45 |
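   | # GTDB-Tk typically writes one summary file per domain (bacteria and archaea),
   | # so one or two summary files are expected here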
46 | if (0 == N_taxonomy_files) or (N_taxonomy_files > 2):
47 | raise Exception(
48 | f"Found {N_taxonomy_files} number of taxonomy files 'gtdbtk.*.summary.tsv' in {gtdb_classify_folder} expect 1 or 2."
49 | )
50 |
51 |
52 | DT = pd.concat([pd.read_table(file, index_col=0) for file in taxonomy_files], axis=0)
53 |
54 | DT.to_csv(snakemake.output.combined)
55 |
56 | Tax = tax2table(DT.classification, remove_prefix=True)
57 | Tax.to_csv(snakemake.output.taxonomy, sep="\t")
58 |
--------------------------------------------------------------------------------
/workflow/scripts/convert_jgi2vamb_coverage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | import re
5 |
6 |
7 | def main(jgi_file):
8 | # parsing input
9 | header = {}
10 | col2keep = ["contigName", "contigLen", "totalAvgDepth"]
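   |     # besides the three columns above, keep only the per-sample ".bam" depth columns;
   |     # any accompanying "*.bam-var" variance columns from jgi_summarize_bam_contig_depths are dropped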
11 | with open(jgi_file) as inF:
12 | for i, line in enumerate(inF):
13 | line = line.rstrip().split("\t")
14 | if i == 0:
15 | header = {x: ii for ii, x in enumerate(line)}
16 | col2keep += [x for x in line if x.endswith(".bam")]
17 | print("\t".join(col2keep))
18 | continue
19 | elif line[0] == "":
20 | continue
21 | # contig ID
22 | contig = line[header["contigName"]]
23 | # collect per-sample info
24 | out = []
25 | for col in col2keep:
26 | out.append(line[header[col]])
27 | print("\t".join(out))
28 |
29 |
30 | if __name__ == "__main__":
31 | if "snakemake" in globals():
32 | with open(snakemake.log[0], "w") as log:
33 | sys.stderr = log
34 |
35 | with open(snakemake.output[0], "w") as outf:
36 | sys.stdout = outf
37 |
38 | main(snakemake.input[0])
39 |
40 | else:
41 | import argparse
42 | import logging
43 |
44 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
45 |
46 | class CustomFormatter(
47 | argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
48 | ):
49 | pass
50 |
51 | desc = (
52 | "Converting jgi_summarize_bam_contig_depths output to format used by VAMB"
53 | )
54 | epi = """DESCRIPTION:
55 | Output format: contigName