├── .gitignore ├── Pima ├── VERSION ├── __init__.py ├── accessory_scripts │ ├── MarkdownReport.py │ ├── __init__.py │ ├── building_pycircos_figures.py │ ├── pChunks.R │ └── sam2psl.py ├── data │ ├── ames_single_copy_genes.fna │ ├── amr.fasta │ ├── gene_drug.tsv │ ├── inc.fasta │ ├── pima.css │ └── reference_sequences │ │ └── Bacillus_anthracis │ │ ├── amr_appendices │ │ ├── beta-lactams.md │ │ ├── beta-lactams.png │ │ ├── macrolides.md │ │ ├── macrolides.png │ │ ├── quinolones.md │ │ ├── quinolones.png │ │ ├── tetracyclines.md │ │ └── tetracyclines.png │ │ ├── ba_virulence_genes.bed │ │ └── confirmed_amr_mutations.bed ├── modules │ ├── __init__.py │ ├── annotations.py │ ├── assembly.py │ ├── check_contamination.py │ ├── compare_to_ref.py │ ├── download_references.py │ ├── evaluate_assembly.py │ ├── fastq.py │ ├── illumina_polishing.py │ ├── multiplexed.py │ ├── ont_polishing.py │ ├── outdir.py │ ├── plasmids.py │ ├── report.py │ └── visualizations.py ├── nextflow_parallelization │ ├── main.nf │ ├── modules │ │ ├── copy_results.nf │ │ └── pima_singleplex.nf │ └── nextflow.config.template ├── pima.py ├── pima_colors.py ├── pima_data.py └── utils │ ├── __init__.py │ ├── cli.py │ ├── mapping.py │ ├── settings.py │ └── utils.py ├── README.md ├── conda_recipe ├── build.sh ├── environment.yml ├── environment_open_versions.yml ├── meta.yaml ├── meta_open_versions.yaml └── post-link.sh ├── dockerbuild └── Dockerfile ├── pima └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | #TEMP DEV 2 | Pima_refactor_notes.md 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # database files 10 | Pima/data/plasmids_and_vectors* 11 | Pima/data/kraken2* 12 | Pima/data/reference_sequences/Bacillus_anthracis/genome.fasta* 13 | 14 | # Nextflow 15 | Pima/nextflow_parallelization/nextflow.config 16 | 17 | # C extensions 18 | *.so 19 | 20 | # raven files? 
21 | raven.cereal 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # VSCODE 44 | .vscode/ 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 
112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # pdm 122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 123 | #pdm.lock 124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 125 | # in version control. 126 | # https://pdm.fming.dev/#use-with-ide 127 | .pdm.toml 128 | 129 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 130 | __pypackages__/ 131 | 132 | # Celery stuff 133 | celerybeat-schedule 134 | celerybeat.pid 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # Environments 140 | .env 141 | .venv 142 | env/ 143 | venv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
177 | #.idea/ 178 | -------------------------------------------------------------------------------- /Pima/VERSION: -------------------------------------------------------------------------------- 1 | 2.1.1 -------------------------------------------------------------------------------- /Pima/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/__init__.py -------------------------------------------------------------------------------- /Pima/accessory_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/accessory_scripts/__init__.py -------------------------------------------------------------------------------- /Pima/accessory_scripts/building_pycircos_figures.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from collections import defaultdict 5 | 6 | from pycircos import Garc, Gcircle 7 | import matplotlib.pyplot as plt 8 | import matplotlib.colors as colors 9 | import matplotlib.patches as mpatches 10 | 11 | 12 | 13 | class BuildCircosPlots: 14 | """Build circos plot for genome alignment, coverage data, and gene annotations 15 | 16 | Expects inputs that are generated by the main 'pima.py' pipeline. Creates a png image \ 17 | for each genetic element within the reference genome provided to pima. 
18 | 19 | Typical usage example: 20 | circos_elem = BuildCircosPlots(ge_name=ge_name, 21 | ge_size=int(ge_size), 22 | aln_file=aln_file, 23 | cov_file=cov_file, 24 | illumina_cov_file=illumina_cov_file, 25 | gene_file=gene_file, 26 | outdir=) 27 | circos_fig = circos_elem.main() 28 | circos_fig.save(file_name=f"{Path}/{ge_name}", format="png", dpi=300) 29 | 30 | """ 31 | def __init__(self, ge_name: str, ge_size: int, aln_file: str | None, outdir:str, cov_file: str | None,illumina_cov_file: str | None, gene_file: str | None, legend = True) -> None: 32 | """Initializing data to build the circos plots 33 | 34 | Args: 35 | ge_name (str): Name of the genetic element being plotted (chromosome, plasmid, etc..) 36 | ge_size (int): Length of the genetic element (bp) 37 | aln_file (str, optional): Path to the genome:genome alignment file, generated by dnadiff & filtered 38 | outdir (str): Path to save the results 39 | cov_file (str, optional): Path to the ONT coverage data, filtered for columns "sequenceID\tpos\tcoverage (cut -f1,2,4 from mpileup). Defaults to None | str. 40 | illumina_cov_file (str, optional): Path to the illumina coverage data, filtered for columns "sequenceID\tpos\tcoverage (cut -f1,2,4 from mpileup. Defaults to None | str. 41 | gene_file (str, optional): Path to a bed file indicating which genes to draw on the reference genome map. Defaults to None | str. 
42 | legend (bool, optional): Generate a legend (T / F) 43 | Returns: 44 | None 45 | """ 46 | 47 | self.ge_name = ge_name 48 | self.ge_size = ge_size 49 | self.aln_file = aln_file 50 | self.outdir = outdir 51 | self.cov_file = cov_file 52 | self.illumina_cov_file = illumina_cov_file 53 | self.gene_file = gene_file 54 | self.legend = legend 55 | 56 | 57 | 58 | #instantiate dictionaries that contain data for each of the plotting steps 59 | self.aln_dict = defaultdict(dict) 60 | self.n_colors = defaultdict(dict) 61 | self.cov_dict = defaultdict(dict) 62 | self.illumina_cov_dict = defaultdict(dict) 63 | self.ge_circle = Gcircle() 64 | 65 | def build_reference_genome(self) -> None: 66 | """Generates the reference genome backbone 67 | 68 | Returns: 69 | None 70 | """ 71 | elem_arc = Garc(arc_id = self.ge_name, 72 | size = self.ge_size, 73 | raxis_range = [600, 700], 74 | facecolor="#FFFFFF", 75 | linewidth=1, 76 | edgecolor="#000000", 77 | label_visible=False, 78 | interspace=0 79 | ) 80 | self.ge_circle.add_garc(elem_arc) 81 | self.ge_circle.set_garcs() 82 | 83 | def add_tickmarks(self) -> None: 84 | """Places logical tick marks indicating genome element size 85 | 86 | Returns: 87 | None 88 | """ 89 | # use different spacing for the chromosome vs plasmids 90 | if self.ge_size > 2000000: 91 | tick_major_positions = list(range(0,self.ge_size,1000000)) 92 | tick_minor_positions = list(range(0,self.ge_size,200000)) 93 | tick_labels = [f"{int(i/1000000)} Mb" for i in tick_major_positions] 94 | else: 95 | tick_major_positions = list(range(0, self.ge_size,20000)) 96 | tick_minor_positions = list(range(0,self.ge_size,5000)) 97 | tick_labels = [f"{int(i/1000)} kb" for i in tick_major_positions] 98 | 99 | #major tick marks with labels 100 | self.ge_circle.tickplot(garc_id=self.ge_name, 101 | raxis_range=(550,600), 102 | tickpositions=tick_major_positions, 103 | tickdirection="inner", 104 | ticklabels=tick_labels, 105 | ticklabelmargin=10 106 | ) 107 | # minor tick marks 108 | 
self.ge_circle.tickplot(garc_id=self.ge_name, 109 | raxis_range=(550,600), 110 | tickpositions=tick_minor_positions, 111 | tickdirection="inner", 112 | tickwidth=0.5 113 | ) 114 | 115 | def build_alignment_plot(self) -> None: 116 | """Build garcs indicating which portions of the query genome align to the reference genome backbone 117 | 118 | Can show up to 20 colors representing query genome contigs before repeating. 119 | """ 120 | if self.aln_file is None: 121 | return 122 | 123 | self.aln_dict["start"] = [] 124 | self.aln_dict["len"] = [] 125 | self.aln_dict["contig"] = [] 126 | self.aln_dict["color"] = [] 127 | 128 | with open(self.aln_file,"r") as fin: 129 | for line in fin: 130 | ref_name, ref_start, ref_end, qname = line.rstrip().split("\t") 131 | aln_len = int(ref_end) - int(ref_start) 132 | 133 | self.aln_dict["start"].append(int(ref_start)) 134 | self.aln_dict["len"].append(int(aln_len)) 135 | self.aln_dict["contig"].append(qname) 136 | 137 | ## Generate different colors for different chromosomes 138 | qual_colors_20 = plt.colormaps['tab20'] 139 | uniq_contigs = [*set(self.aln_dict['contig'])] 140 | self.n_colors['n_contigs'] = len(uniq_contigs) 141 | self.n_colors['contig_names'] = uniq_contigs 142 | self.n_colors['colors'] = [] 143 | for i, contig_id in enumerate(uniq_contigs): 144 | self.n_colors['colors'].append(colors.rgb2hex(qual_colors_20(i))) 145 | 146 | ## Add colors to the alignment dictionary 147 | for contig_name in self.aln_dict['contig']: 148 | i = self.n_colors['contig_names'].index(contig_name) 149 | self.aln_dict['color'].append(self.n_colors['colors'][i]) 150 | 151 | ## build the barplots that map the genome aligment onto the chromosome plot 152 | if not len(self.aln_dict['start']) == 0: 153 | self.ge_circle.barplot(self.ge_name, 154 | data = [1]*len(self.aln_dict["start"]), 155 | positions=self.aln_dict["start"], 156 | width=self.aln_dict["len"], 157 | raxis_range=[600,700], 158 | facecolor=self.aln_dict["color"], 159 | linewidth=0) 160 
| 161 | def build_coverage_plot(self) -> None: 162 | """Generates a garc fillplot of the ONT coverage data""" 163 | if self.cov_file is None: 164 | return 165 | 166 | self.cov_dict["pos"] = [] 167 | self.cov_dict["cov"] = [] 168 | with open(self.cov_file, 'r') as fin: 169 | for line in fin: 170 | element, pos, cov, = line.rstrip().rsplit("\t") 171 | self.cov_dict["pos"].append(int(pos)) 172 | self.cov_dict["cov"].append(int(cov)) 173 | 174 | if not len(self.cov_dict['pos']) == 0: 175 | self.ge_circle.fillplot(self.ge_name, 176 | data = self.cov_dict["cov"], 177 | positions=self.cov_dict["pos"], 178 | raxis_range=[701,850], 179 | base_value=0, 180 | rlim=(0,max(self.cov_dict["cov"])), 181 | facecolor="#808080") 182 | 183 | def build_illumina_coverage_plot(self) -> None: 184 | """Generates a garc fillplot of the Illumina coverage data""" 185 | if self.illumina_cov_file is None: 186 | return 187 | 188 | self.illumina_cov_dict["pos"] = [] 189 | self.illumina_cov_dict["cov"] = [] 190 | with open(self.illumina_cov_file, 'r') as fin: 191 | for line in fin: 192 | element, pos, cov, = line.rstrip().rsplit("\t") 193 | self.illumina_cov_dict["pos"].append(int(pos)) 194 | self.illumina_cov_dict["cov"].append(int(cov)) 195 | 196 | if not len(self.illumina_cov_dict['pos']) == 0: 197 | self.ge_circle.fillplot(self.ge_name, 198 | data = self.illumina_cov_dict["cov"], 199 | positions=self.illumina_cov_dict["pos"], 200 | raxis_range=[851,1000], 201 | base_value=0, 202 | rlim=(0,max(self.illumina_cov_dict["cov"])), 203 | facecolor="#808080") 204 | 205 | def add_cov_labels(self) -> None: 206 | """Generates the middle text indicating genetic element name (from the reference) and coverage statistics""" 207 | self.ge_circle.ax.text(0.5,0.53,f"{self.ge_name}", fontsize=20, 208 | transform=self.ge_circle.ax.transAxes, 209 | ha='center') 210 | 211 | if (self.cov_file is not None and len(self.cov_dict['cov']) != 0): 212 | min_cov = min(self.cov_dict['cov']) 213 | max_cov = 
max(self.cov_dict['cov']) 214 | avg_cov = round(sum(self.cov_dict['cov']) / len(self.cov_dict['cov']), 1) 215 | self.ge_circle.ax.text(0.5,0.45, 216 | f"ONT Average Coverage: {avg_cov}\nONT Minimum Coverage: {min_cov}\nONT Maximum Coverage: {max_cov}", 217 | fontsize=10, 218 | transform=self.ge_circle.ax.transAxes, 219 | ha='center') 220 | if (self.illumina_cov_file is not None and len(self.illumina_cov_dict['cov']) != 0): 221 | illumina_min_cov = min(self.illumina_cov_dict['cov']) 222 | illumina_max_cov = max(self.illumina_cov_dict['cov']) 223 | illumina_avg_cov = round(sum(self.illumina_cov_dict['cov']) / len(self.illumina_cov_dict['cov']), 1) 224 | self.ge_circle.ax.text(0.5,0.37, 225 | f"Illumina Average Coverage: {illumina_avg_cov}\n" + 226 | f"Illumina Minimum Coverage: {illumina_min_cov}\n" + 227 | f"Illumina Maximum Coverage: {illumina_max_cov}", 228 | fontsize=10, 229 | transform=self.ge_circle.ax.transAxes, 230 | ha='center') 231 | 232 | def add_gene_loc(self) -> None: 233 | """Draws gene annotations onto the reference backbone given a bed file""" 234 | if self.gene_file is None: 235 | return 236 | 237 | gene_dict = defaultdict(dict) 238 | label_pos = [] 239 | label_id = [] 240 | gene_dict['pos'] = [] 241 | gene_dict['width'] = [] 242 | with open(self.gene_file) as fin: 243 | for line in fin: 244 | elem_name, start, stop, gene_name = line.rstrip().split("\t") 245 | if elem_name == self.ge_name: 246 | start = int(start) 247 | width = int(stop)-start-1 248 | gene_dict['pos'].append(start) 249 | gene_dict['width'].append(width) 250 | label_pos.append(int(round(start + (width / 2),0))) 251 | label_id.append(f"$\\it\u007b{gene_name}\u007d$") 252 | 253 | if len(gene_dict['pos']) > 0: 254 | self.ge_circle.barplot(garc_id = self.ge_name, 255 | data=[1]*len(gene_dict['pos']), 256 | positions=gene_dict['pos'], 257 | width=gene_dict['width'], 258 | raxis_range=(600,700), 259 | facecolor="#FF000040", 260 | edgecolor="#000000", 261 | linewidth=1) 262 | 263 | 
self.ge_circle.tickplot(garc_id = self.ge_name, 264 | tickpositions=label_pos, 265 | ticklabels=label_id, 266 | tickdirection="inner", 267 | tickcolor="#FF0000", 268 | ticklabelcolor="#FF0000", 269 | ticklabelmargin=20) 270 | 271 | def build_legend(self) -> None: 272 | """Generates the legend showing: genome:genome alignments, contig IDs, and the coverage data""" 273 | leg = [] 274 | leg.append(mpatches.Patch(facecolor = "#FFFFFF", edgecolor= "#000000", label = "No alignment to reference")) 275 | #leg.append(mpatches.Patch(facecolor = "#1f77b4", label = ">98%% sequence alignment to $\it{Ba}$ Ames Ancestor")) 276 | #leg.append(mpatches.Patch(facecolor = "#808080", label = 'Inner: ONT Coverage across $\it{Ba}$ Ames Ancestor')) 277 | #leg.append(mpatches.Patch(facecolor = "#808080", label = "Outer: Illumina Coverage across $\it{Ba}$ Ames Ancestor")) 278 | 279 | if not len(self.n_colors) == 0: 280 | for i, contig in enumerate(self.n_colors['contig_names']): 281 | leg_patch = mpatches.Patch(color = self.n_colors['colors'][i], label = contig) 282 | leg.append(leg_patch) 283 | 284 | #both illumina and ONT coverage data 285 | if len(self.cov_dict['cov']) != 0 and len(self.illumina_cov_dict['cov']) != 0: 286 | leg.append(mpatches.Patch(facecolor = "#808080", label = 'Inner: ONT Coverage across reference')) 287 | leg.append(mpatches.Patch(facecolor = "#808080", label = "Outer: Illumina Coverage across reference")) 288 | 289 | # only ONT 290 | elif len(self.cov_dict['cov']) != 0: 291 | leg.append(mpatches.Patch(facecolor = "#808080", label = 'ONT Coverage across reference')) 292 | 293 | # only Illumina 294 | elif len(self.cov_dict['cov']) == 0 and len(self.illumina_cov_dict['cov']) != 0: 295 | leg.append(mpatches.Patch(facecolor = "#808080", label = 'Illumina Coverage across reference')) 296 | 297 | #self.ge_circle.ax.legend(handles=leg, prop={'size': 9}, bbox_to_anchor=(0.18,0.11)) #loc=3 298 | self.ge_circle.figure.legend(handles=leg, prop={'size': 9}, loc=3) 299 | 300 | 
def main(self) -> Gcircle: 301 | """Executes each method in the BuildCircosPlots class depending on data inputs 302 | 303 | Returns: 304 | Gcircle: A class object containing all the data to be visualized 305 | """ 306 | self.build_reference_genome() 307 | self.add_tickmarks() 308 | self.build_alignment_plot() 309 | self.build_coverage_plot() 310 | self.build_illumina_coverage_plot() 311 | self.add_cov_labels() 312 | self.add_gene_loc() 313 | if self.legend: 314 | self.build_legend() 315 | return self.ge_circle 316 | 317 | 318 | if __name__ == "__main__": 319 | import argparse 320 | import sys 321 | import re 322 | import subprocess 323 | 324 | parser = argparse.ArgumentParser() 325 | # add input for reference genome only 326 | parser.add_argument("-i", "--input_dir", help="Output directory generated by a previous pima (v1.4 or higher) run") 327 | parser.add_argument("-r", "--reference_genome", help="Input reference genome to draw specific gene locations on") 328 | parser.add_argument("-o", "--output_dir", help="Where to save the results") 329 | parser.add_argument("-g", "--gene_file", required=False, 330 | help="bed file format with the location of the genes to draw in the reference coordinates format") 331 | parser.add_argument("--image_format", help = "The output image type, support is determined by the matplotlib backend: 'png', 'pdf', 'ps', 'eps', and 'svg' should all work", 332 | required = False, default='png') 333 | 334 | args = parser.parse_args() 335 | 336 | if args.input_dir and args.reference_genome: 337 | print("Please specify either -i OR -r. 
These are mutually exclusive flags") 338 | sys.exit(0) 339 | 340 | 341 | outdir = args.output_dir 342 | if not os.path.exists(outdir): 343 | os.makedirs(outdir) 344 | 345 | 346 | def validate_files(var_fp): 347 | if not os.path.isfile(var_fp): 348 | var_fp = None 349 | return var_fp 350 | else: 351 | return var_fp 352 | 353 | ## Building circos plots to represent specific gene functions on a reference genome 354 | if args.reference_genome: 355 | reference_genome = args.reference_genome 356 | reference_sizes = os.path.join(outdir, "reference.sizes") 357 | 358 | # build the reference.sizes file 359 | command = ' '.join(['faidx -i chromsizes', reference_genome, ' | sort -k 1,1 -k 2,2n']) 360 | reference_sizes = [x for x in re.split(r'\n', subprocess.check_output(command, shell = True).decode('utf-8')) if x] 361 | 362 | # add the gene file 363 | gene_file = args.gene_file 364 | 365 | ## Manually constructing the Gcircle object because I want to modify some of the steps 366 | for element in reference_sizes: 367 | ge_name, ge_size = element.rsplit() 368 | 369 | circos_elem = BuildCircosPlots(ge_name=ge_name, 370 | aln_file=None, 371 | cov_file = None, 372 | illumina_cov_file = None, 373 | ge_size=int(ge_size), 374 | gene_file=gene_file, 375 | legend=False, 376 | outdir=None) 377 | circos_fig = circos_elem.main() 378 | 379 | # a hack - but we want the reference genome to be filled for these figures instead of blank 380 | circos_elem.ge_circle.barplot(ge_name, 381 | data = [1], 382 | positions=[int(1)], 383 | width=[int(ge_size)-1], 384 | raxis_range=[600,700], 385 | facecolor="#1f77b4", 386 | linewidth=0) 387 | circos_fig.save(file_name=f"{outdir}/{ge_name}", format=args.image_format , dpi=300) 388 | 389 | ## Building custom pima plots 390 | else: 391 | with open(f"{args.input_dir}/insertions/reference.sizes", "r") as fin: 392 | for line in fin: 393 | ge_name, ge_size = line.rstrip().rsplit() 394 | aln_file = f"{args.input_dir}/circos/{ge_name}/alignment.txt" 395 | 
cov_file = f"{args.input_dir}/circos/{ge_name}/coverage.mpileup" 396 | illumina_cov_file = f"{args.input_dir}/circos/{ge_name}/illumina_coverage.mpileup" 397 | gene_file = args.gene_file 398 | 399 | 400 | aln_file = validate_files(aln_file) 401 | cov_file = validate_files(cov_file) 402 | illumina_cov_file = validate_files(illumina_cov_file) 403 | 404 | circos_elem = BuildCircosPlots(ge_name=ge_name, 405 | ge_size=int(ge_size), 406 | aln_file=aln_file, 407 | cov_file=cov_file, 408 | illumina_cov_file=illumina_cov_file, 409 | gene_file=gene_file, 410 | outdir=None) 411 | 412 | circos_fig = circos_elem.main() 413 | 414 | circos_fig.save(file_name=f"{outdir}/{ge_name}", format="png" , dpi=300) 415 | 416 | """ 417 | for dir in os.scandir("../pima_downsample_ont/"): 418 | if dir.is_dir(): 419 | dirname = os.path.basename(dir) 420 | with open(f"{dir.path}/insertions/reference.sizes", "r") as fin: 421 | for line in fin: 422 | ge_name, ge_size = line.rstrip().rsplit() 423 | aln_file = f"{dir.path}/circos/{ge_name}/alignment.txt" 424 | cov_file = f"{dir.path}/circos/{ge_name}/coverage.mpileup" 425 | illumina_cov_file = f"{dir.path}/circos/{ge_name}/illumina_coverage.mpileup" 426 | gene_file = "/scicomp/home-pure/tsz0/Projects/devPima/MergedPima/data/ba_virulence_genes.bed" 427 | #outdir = f"{dir.path}/pima_out/20230308_Minion_TM_01/Sterne-CLR1-2/circos/{ge_name}" 428 | circos_elem = BuildCircosPlots(ge_name=ge_name, 429 | ge_size=int(ge_size), 430 | aln_file=aln_file, 431 | cov_file=cov_file, 432 | illumina_cov_file=illumina_cov_file, 433 | gene_file=gene_file, 434 | outdir=None) 435 | circos_fig = circos_elem.main() 436 | circos_fig.save(file_name=f"{dirname}_{ge_name}", format="png" , dpi=300) 437 | """ -------------------------------------------------------------------------------- /Pima/data/pima.css: -------------------------------------------------------------------------------- 1 | html { 2 | line-height: 1.5; 3 | font-family: Georgia, serif; 4 | font-size: 20px; 5 | 
color: #1a1a1a; 6 | background-color: #fdfdfd; 7 | } 8 | body { 9 | margin: 0 auto; 10 | max-width: 50em; 11 | padding-left: 20px; 12 | padding-right: 20px; 13 | padding-top: 20px; 14 | padding-bottom: 20px; 15 | hyphens: auto; 16 | overflow-wrap: break-word; 17 | font-kerning: normal; 18 | } 19 | @media print { 20 | body { 21 | background-color: transparent; 22 | color: black; 23 | font-size: 10pt; 24 | } 25 | p, h2, h3 { 26 | orphans: 3; 27 | widows: 3; 28 | } 29 | h2, h3, h4 { 30 | page-break-after: avoid; 31 | } 32 | } 33 | p { 34 | margin: 1em 0; 35 | } 36 | a { 37 | color: #3333FF; 38 | } 39 | a:visited { 40 | color: #1a1a1a; 41 | } 42 | img { 43 | margin: auto; 44 | display: block; 45 | max-width: 70%; 46 | } 47 | h1, h2, h3, h4, h5, h6 { 48 | margin-top: 1.4em; 49 | } 50 | h5, h6 { 51 | font-size: .8em; 52 | font-style: italic; 53 | } 54 | h6 { 55 | font-weight: normal; 56 | } 57 | ol, ul { 58 | padding-left: 1.7em; 59 | margin-top: 1em; 60 | } 61 | li > ol, li > ul { 62 | margin-top: 0; 63 | } 64 | blockquote { 65 | margin: 1em 0 1em 1.7em; 66 | padding-left: 1em; 67 | border-left: 2px solid #e6e6e6; 68 | color: #606060; 69 | } 70 | code { 71 | font-family: Menlo, Monaco, 'Lucida Console', Consolas, monospace; 72 | font-size: 85%; 73 | margin: 0; 74 | } 75 | pre { 76 | margin: 1em 0; 77 | overflow: auto; 78 | } 79 | pre code { 80 | padding: 0; 81 | overflow: visible; 82 | overflow-wrap: normal; 83 | } 84 | .sourceCode { 85 | background-color: transparent; 86 | overflow: visible; 87 | } 88 | hr { 89 | background-color: #1a1a1a; 90 | border: none; 91 | height: 1px; 92 | margin: 1em 0; 93 | } 94 | table { 95 | border-collapse: collapse; 96 | font-variant-numeric: lining-nums tabular-nums; 97 | font-size: 80%; 98 | } 99 | table caption { 100 | margin-bottom: 0.75em; 101 | } 102 | tbody { 103 | margin-top: 0.5em; 104 | border-top: 1px solid #1a1a1a; 105 | border-bottom: 1px solid #1a1a1a; 106 | text-align: center; 107 | } 108 | th { 109 | border-top: 1px solid 
#1a1a1a; 110 | padding: 0.25em 0.5em 0.25em 0.5em; 111 | } 112 | td { 113 | padding: 0.125em 0.5em 0.25em 0.5em; 114 | } 115 | 116 | tr:nth-child(even) {background: #CCC} 117 | tr:nth-child(odd) {background: #FFF} 118 | 119 | header { 120 | margin-bottom: 4em; 121 | text-align: center; 122 | } 123 | #TOC li { 124 | list-style: none; 125 | } 126 | #TOC ul { 127 | padding-left: 1.3em; 128 | } 129 | #TOC > ul { 130 | padding-left: 0; 131 | } 132 | #TOC a:not(:hover) { 133 | text-decoration: none; 134 | } 135 | code{white-space: pre-wrap;} 136 | span.smallcaps{font-variant: small-caps;} 137 | span.underline{text-decoration: underline;} 138 | div.column{display: inline-block; vertical-align: top; width: 50%;} 139 | div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} 140 | ul.task-list{list-style: none;} 141 | .display.math{display: block; text-align: center; margin: 0.5rem auto;} 142 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/beta-lactams.md: -------------------------------------------------------------------------------- 1 | ### Appendix LETTER: Location of β-Lactams AMR associated genes within Bacillus anthracis str. Ames Ancestor Chromosome 2 | 3 | **Biothreat pathogen:** Bacillus anthracis str. Ames Ancestor NC_007530.2 4 | **Antibiotic Class:** Beta-lactams 5 | **Antibiotics:** Penicillin 6 | **AMR genes description:** 7 | - rsiP: Sigma-70 family RNA polymerase sigma factor 8 | 9 | **Gene location:** 10 | - 2,323,269 – 2,324,096 11 | 12 | ![Chromosomal location](beta-lactams.png) 13 | 14 | | Gene | AMR description | Manuscript | 15 | | :------ | :------ | :------ | 16 | | rsiP | Mutations leading to truncation of RsiP have been described as a basis for PEN resistance. 
| https://pubmed.ncbi.nlm.nih.gov/30574557/ https://pubmed.ncbi.nlm.nih.gov/19717606/ https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3923885/ | 17 | | sigP | Mutations in the sigP gene has been associated with PEN resistance. |  https://pubmed.ncbi.nlm.nih.gov/30574557/ | 18 |
19 | 20 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/beta-lactams.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/beta-lactams.png -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/macrolides.md: -------------------------------------------------------------------------------- 1 | ### Appendix LETTER: Location of macrolides (rplV) AMR associated genes within Bacillus anthracis Ames Ancestor Chromosome 2 | 3 | **Biothreat pathogen:** Bacillus anthracis str. Ames Ancestor NC_007530.2 4 | **Antibiotic Class:** Macrolides 5 | **Antibiotics:** Clarithromycin 6 | **AMR genes description:** 7 | - rplV: 50S ribosomal protein L22 8 | 9 | **Gene location:** 10 | - 124,092-124,433 11 | 12 | ![Chromosomal location](macrolides.png) 13 | 14 | | Gene | AMR description | Manuscript | 15 | | :------ | :------ | :------ | 16 | |rplV | It has been reported that a 27-nucleotide repeat sequence insertion in the rplV gene induced a specific resistance to macrolide antibiotics. | https://pubmed.ncbi.nlm.nih.gov/29899844/ | 17 |
18 | 19 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/macrolides.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/macrolides.png -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/quinolones.md: -------------------------------------------------------------------------------- 1 | ### Appendix LETTER: Location of quinolones AMR associated genes within Bacillus anthracis str. Ames Ancestor Chromosome 2 | 3 | **Biothreat pathogen:** Bacillus anthracis str. Ames Ancestor NC_007530.2 4 | **Antibiotic Class:** Quinolones 5 | **Antibiotics:** Ciprofloxacin, ofloxacin, levofloxacin, and moxifloxacin 6 | **AMR genes description:** 7 | - gyrA: DNA gyrase subunit A 8 | - gyrB: DNA topoisomerase (ATP-hydrolyzing) subunit B 9 | - parC: DNA topoisomerase IV subunit A 10 | - parE: DNA topoisomerase IV subunit B 11 | - tetR: TetR family transcriptional regulator 12 | 13 | **Gene location:** 14 | - gyrA: 6,595 -9,066 15 | - gyrB: 4,584 - 6,506 16 | - parC: 3,362,705 - 3,365,128 17 | - parE: 3,365,130 - 3,367,094 18 | - GBAA_RS04545 or tetR: 842,403 – 842,981 19 | 20 | ![Chromosomal location](quinolones.png) 21 | 22 | | Gene | AMR description | Manuscript | 23 | | :------ | :------ | :------ | 24 | | gyrA | Mutations in the gyrA gene have been associated with resistance to certain classes of antibiotics, particularly fluoroquinolones. | https://pubmed.ncbi.nlm.nih.gov/12821500/ | 25 | | gyrB | Mutations in the gyrB gene can also lead to resistance against fluoroquinolone antibiotics. However, compared to the gyrA gene, mutations in gyrB are less commonly associated with antibiotic resistance. 
| https://pubmed.ncbi.nlm.nih.gov/15190035/ | 26 | | parC | Mutations in the parC gene can lead to resistance against fluoroquinolone antibiotics, similar to the gyrA and gyrB genes.Mutations in the parC gene can lead to resistance against fluoroquinolone antibiotics, similar to the gyrA and gyrB genes. | https://pubmed.ncbi.nlm.nih.gov/32273351/ | 27 | | parE | Similar to the gyrA, gyrB, and parC genes, mutations in the parE gene can also contribute to antibiotic resistance, particularly against fluoroquinolone antibiotics. | https://www.osti.gov/servlets/purl/1117920 | 28 | | GBAA_RS04545 or tetR | TetR-type transcriptional regulators have been described as a novel "mutation hot spot" that leads to the increased expression of multidrug efflux systems for CIP resistance. | https://pubmed.ncbi.nlm.nih.gov/20385868/ https://pubmed.ncbi.nlm.nih.gov/32273351/ | 29 |
30 | 31 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/quinolones.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/quinolones.png -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/tetracyclines.md: -------------------------------------------------------------------------------- 1 | ### Appendix LETTER: Location of tetracyclines AMR associated genes within Bacillus anthracis str. Ames Ancestor Chromosome 2 | 3 | **Biothreat pathogen:** Bacillus anthracis str. Ames Ancestor NC_007530.2 4 | **Antibiotic Class:** Tetracyclines 5 | **Antibiotics:** Tetracycline 6 | **AMR genes description:** 7 | - tetA: Tetracycline resistance MFS efflux pump 8 | - tetMWOS: TetM/W/O/S family tetracycline resistance ribosomal protection protein 9 | - rpsJ: 30S ribosomal protein S10 10 | 11 | **Gene location:** 12 | - tetA: 843,192 – 844,394 13 | - tetMWOS: 2,805,522 - 2,807,465 14 | - rpsJ: 120,962 – 121,270 15 | 16 | ![Chromosomal location](tetracyclines.png) 17 | 18 | | Gene | AMR description | Manuscript | 19 | | :------ | :------ | :------ | 20 | | tetA | Mutations in the tetA gene, which encodes a tetracycline efflux protein, have been associated with tetracycline resistance. | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2847397/ https://pubmed.ncbi.nlm.nih.gov/14702405/ | 21 | | tetMWOS | Mechanisms of tetracycline resistance involving ribosomal protection proteins have established a correlation between mutations in tetracycline binding sites and changes in MIC data. 
|  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3576927/ | 22 | | rpsJ | Mutations in the rpsJ, encoding changes or deletions in residues 53–60 in the 30S ribosomal subunit protein S10, have been linked to tetracycline or tigecycline resistance in in vitro studies with Gram-positive bacteria |  https://pubmed.ncbi.nlm.nih.gov/26989065/ | 23 |
24 | 25 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/tetracyclines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/tetracyclines.png -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/ba_virulence_genes.bed: -------------------------------------------------------------------------------- 1 | pX01 143779 146073 pagA 2 | pX01 149357 151786 lef 3 | pX01 122608 125010 cya 4 | pX02 48928 50376 acpB 5 | pX02 52147 52290 capE 6 | pX02 52305 53891 capD 7 | pX02 53888 55123 capA 8 | pX02 55135 55584 capC 9 | pX02 55599 56993 capB 10 | pX02 68439 69890 acpA -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/confirmed_amr_mutations.bed: -------------------------------------------------------------------------------- 1 | #contig start stop name type drug priority note 2 | chromosome 2322815 2322816 sigP snp beta-lactam confirmed_location A to G mutation at this location in sigP has been shown to confer penicillin resistance in a lab setting. 3 | chromosome 2323276 2323277 rsiP indel beta-lactam confirmed_location A single deletion in a homopolyer region that is part of the end of sigP and beginning of rsiP (sigma / antisigma factors) has been shown to confer penicillin resistance in a lab setting. 4 | chromosome 2323283 2323284 rsiP indel beta-lactam confirmed_location The insertion of G has been shown to confer penicillin resistance in a lab setting. 5 | chromosome 2323306 2323307 rsiP indel beta-lactam confirmed_location Deletion of A has been shown to confer penicillin resistance in a lab setting. 
6 | chromosome 2323738 2323739 rsiP snp beta-lactam confirmed_location SNP in this location has been shown to confer penicillin resistance in a lab setting. 7 | chromosome 2323738 2323739 rsiP indel beta-lactam confirmed_location The insertion of G has been shown to confer penicillin resistance in a lab setting. 8 | chromosome 2323919 2323925 rsiP indel beta-lactam confirmed_location A 7bp deletion in this location has been shown to confer penicillin resistance in a lab setting. 9 | chromosome 2324040 2324078 rsiP indel beta-lactam confirmed_location Single A deletion has been shown to confer penicillin resistance in a lab setting. 10 | chromosome 124375 124396 rplV indel macrolide confirmed_location A 24bp insertion in rplV has been shown to confer macrolide resistance in a lab setting. 11 | chromosome 124351 124363 rplV indel macrolide confirmed_location A 12bp insertion in rplV has been shown to confer macrolide resistance in a lab setting. 12 | chromosome 5053 5054 gyrB snp quinolones confirmed_location A to G mutation in gyrB has been shown to confer quinolone resistance in a lab setting. 13 | chromosome 5892 5893 gyrB snp quinolones confirmed_location A to G mutation in gyrB has been shown to confer quinolone resistance in a lab setting. 14 | chromosome 6006 6007 gyrB snp quinolones confirmed_location A G to A mutation in gyrB has been shown to confer quinolone resistance in a lab setting. 15 | chromosome 6847 6848 gyrA snp quinolones confirmed_location C to T mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 16 | chromosome 6849 6850 gyrA snp quinolones confirmed_location G to C mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 17 | chromosome 6850 6851 gyrA snp quinolones confirmed_location C to G mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 
18 | chromosome 6858 6859 gyrA snp quinolones confirmed_location G to A mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 19 | chromosome 6859 6860 gyrA snp quinolones confirmed_location A to C mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 20 | chromosome 6860 6861 gyrA snp quinolones confirmed_location A to G mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 21 | chromosome 3362946 3362947 parC snp quinolones confirmed_location C to either T or A mutations in parC have been shown to confer quinolone resistance in lab settings 22 | chromosome 3362947 3362948 parC snp quinolones confirmed_location C to T mutation in parC have been shown to confer quinolone resistance in lab settings 23 | chromosome 3362950 3362951 parC snp quinolones confirmed_location T to C mutation in parC have been shown to confer quinolone resistance in lab settings 24 | chromosome 3362957 3362958 parC snp quinolones confirmed_location G to A mutation in parC have been shown to confer quinolone resistance in lab settings 25 | chromosome 3362992 3362993 parC snp quinolones confirmed_location Any mutation in parC at this location has been shown to confer quinolone resistance in lab setting 26 | chromosome 3364886 3364887 parC snp quinolones confirmed_location G to A mutation in parC has been shown to confer quinolone resistance in lab settings 27 | chromosome 3365453 3365454 parC snp quinolones confirmed_location G to T mutation in parC has been shown to confer quinolone resistance in lab settings 28 | chromosome 3366419 3366420 parE snp quinolones confirmed_location Any mutation in parE at this location has been shown to confer quinolone resistance in lab setting 29 | chromosome 748008 842715 tetR large-indel quinolones confirmed_location Multiple large deletions in this region have been shown to confer quinolone resistance under lab conditions. 
30 | chromosome 842290 842291 tetR indel quinolones confirmed_location Single T deletion has been shown to confer quinolone resistance in a lab setting 31 | chromosome 842290 842291 tetR indel quinolones confirmed_location A 5 bp deletion has been shown to confer quinolone resistance in a lab setting 32 | chromosome 842398 842399 tetR indel quinolones confirmed_location A 2 bp insertion has been shown to confer quinolone resistance in a lab setting 33 | chromosome 842517 842518 tetR snp quinolones confirmed_location C to T mutation has been shown to confer quinolone resistance in lab settings 34 | chromosome 842613 842614 tetR indel quinolones confirmed_location An 11 bp insertion has been shown to confer quinolone resistance in lab settings 35 | chromosome 842709 842710 tetR indel quinolones confirmed_location A 2 bp insertion has been shown to confer quinolone resistance in a lab setting 36 | chromosome 842714 842715 tetR indel quinolones confirmed_location A 1 bp deletion has been shown to confer quinolone resistance in a lab setting 37 | chromosome 108732 108733 rpoB snp rifamycim confirmed_location G to T mutation has been shown to confer rifamycim resistance in lab settings 38 | chromosome 2322745 2323278 sigP any beta-lactam potential_confer_amr While this mutation has not been observed before, other mutations in this region confer beta-lactam resistance under laboratory conditions 39 | chromosome 2323268 2324095 rsiP any beta-lactam potential_confer_amr While this mutation has not been observed before, other mutations in this region confer beta-lactam resistance under laboratory conditions 40 | chromosome 124091 124432 rplV any macrolide potential_confer_amr While this mutation has not been observed before, other mutations in this region confer macrolide resistance under laboratory conditions 41 | chromosome 4583 6505 gyrB any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone 
resistance under laboratory conditions 42 | chromosome 6594 9065 gyrA any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone resistance under laboratory conditions 43 | chromosome 3362704 3365127 parC any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone resistance under laboratory conditions 44 | chromosome 3365129 3367093 parE any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone resistance under laboratory conditions 45 | chromosome 842402 842980 tetR any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone resistance under laboratory conditions 46 | chromosome 108390 111923 rpoB any rifamycim potential_confer_amr While this mutation has not been observed before, other mutations in this region confer rifamycim resistance under laboratory conditions 47 | -------------------------------------------------------------------------------- /Pima/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .download_references import ( 2 | validate_download, 3 | validate_organism, 4 | ) 5 | from .outdir import validate_output_dir 6 | from .multiplexed import validate_multiplex_fastq, initialize_multiplex_analysis 7 | from .fastq import ( 8 | validate_ont_fastq, 9 | info_given_ont_fastq, 10 | validate_illumina_fastq, 11 | info_illumina_fastq, 12 | validate_genome_estimate, 13 | estimate_genome_size, 14 | ) 15 | from .check_contamination import validate_contamination_check, fastq_contamination 16 | 17 | from .assembly import ( 18 | validate_genome_fasta, 19 | validate_genome_assembly_size, 20 | validate_assembly_info, 21 | validate_assembler, 22 | flye_ont_fastq, 23 | raven_ont_fastq, 24 | spades_illumina_fastq, 
25 | check_assembly_coverages, 26 | ) 27 | 28 | from .ont_polishing import ( 29 | validate_medaka, 30 | medaka_ont_assembly, 31 | ) 32 | 33 | from .illumina_polishing import( 34 | validate_illumina_polish, 35 | pilon_assembly, 36 | polypolish_assembly, 37 | ) 38 | 39 | from .evaluate_assembly import( 40 | validate_evaluate_assembly, 41 | check_for_small_contigs_and_fragmentation, 42 | ) 43 | 44 | from .annotations import ( 45 | validate_features, 46 | validate_blast, 47 | blast_feature_sets, 48 | ) 49 | 50 | from .plasmids import ( 51 | validate_plasmids, 52 | call_plasmids, 53 | ) 54 | 55 | from .compare_to_ref import ( 56 | validate_reference_fasta, 57 | validate_mutations, 58 | validate_quast, 59 | call_insertions, 60 | quast_genome, 61 | call_amr_mutations, 62 | ) 63 | 64 | from .visualizations import ( 65 | validate_draw_amr_matrix, 66 | validate_draw_features, 67 | validate_draw_circos, 68 | draw_features, 69 | draw_amr_matrix, 70 | draw_circos, 71 | ) 72 | 73 | from .report import ( 74 | validate_make_report, 75 | make_report, 76 | ) 77 | 78 | __all__ = [ 79 | # Validation 80 | "validate_download", 81 | "validate_organism", 82 | "validate_output_dir", 83 | "validate_multiplex_fastq", 84 | "validate_genome_estimate", 85 | "validate_ont_fastq", 86 | "validate_illumina_fastq", 87 | "validate_contamination_check", 88 | "validate_genome_fasta", 89 | "validate_genome_assembly_size", 90 | "validate_assembly_info", #check for coverages 91 | "validate_assembler", 92 | "validate_medaka", 93 | "validate_illumina_polish", 94 | "validate_evaluate_assembly", #check for contig size/# issues 95 | "validate_plasmids", 96 | "validate_features", 97 | "validate_blast", 98 | "validate_reference_fasta", 99 | "validate_quast", 100 | "validate_mutations", 101 | "validate_draw_amr_matrix", 102 | "validate_draw_features", 103 | "validate_draw_circos", 104 | "validate_make_report", 105 | # Analysis 106 | "initialize_multiplex_analysis", 107 | "estimate_genome_size", 108 | 
"info_given_ont_fastq", 109 | "info_illumina_fastq", 110 | "fastq_contamination", 111 | "flye_ont_fastq", 112 | "raven_ont_fastq", 113 | "spades_illumina_fastq", 114 | "medaka_ont_assembly", 115 | "pilon_assembly", 116 | "polypolish_assembly", 117 | "check_assembly_coverages", 118 | "check_for_small_contigs_and_fragmentation", 119 | "call_plasmids", 120 | "blast_feature_sets", 121 | "call_insertions", 122 | "quast_genome", 123 | "call_amr_mutations", 124 | "draw_features", 125 | "draw_amr_matrix", 126 | "draw_circos", 127 | "make_report", 128 | ] 129 | -------------------------------------------------------------------------------- /Pima/modules/annotations.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | import os 4 | import subprocess 5 | 6 | import pandas as pd 7 | 8 | from Pima.pima_data import PimaData 9 | from Pima.utils.settings import Settings 10 | from Pima.utils.utils import ( 11 | print_and_log, 12 | print_and_run, 13 | validate_utility, 14 | validate_file_and_size, 15 | find_checkpoint, 16 | make_start_file, 17 | make_finish_file, 18 | std_files, 19 | error_out, 20 | ) 21 | 22 | 23 | def validate_features(pima_data: PimaData, settings: Settings): 24 | # skip conditions 25 | if pima_data.only_assemble: 26 | return 27 | 28 | if pima_data.feature_fastas is None: 29 | pima_data.feature_fastas = [] 30 | 31 | if not pima_data.no_amr: 32 | pima_data.feature_fastas.append(pima_data.amr_database) 33 | pima_data.feature_colors.append(settings.amr_default_color) 34 | if validate_file_and_size(pima_data, settings.amr_gene_drug_tsv): 35 | pima_data.amr_gene_drug = pd.read_csv( 36 | settings.amr_gene_drug_tsv, 37 | index_col=None, 38 | sep="\t", 39 | quoting=csv.QUOTE_NONE, 40 | header=None, 41 | ) 42 | pima_data.drug_categories = pima_data.amr_gene_drug.iloc[:, 1].unique() 43 | 44 | if not pima_data.no_inc: 45 | pima_data.feature_fastas.append(pima_data.inc_database) 46 | 
pima_data.feature_colors.append(settings.inc_default_color) 47 | 48 | if len(pima_data.feature_fastas) == 0: 49 | return 50 | 51 | if not pima_data.will_have_genome_fasta: 52 | return 53 | 54 | print_and_log( 55 | pima_data, 56 | "Validating feature sets", 57 | pima_data.main_process_verbosity, 58 | pima_data.main_process_color, 59 | ) 60 | 61 | for feature_fasta in pima_data.feature_fastas: 62 | if not validate_file_and_size(pima_data, feature_fasta): 63 | # See if the missing database can be downloaded 64 | if feature_fasta in settings.included_databases: 65 | if not pima_data.download: 66 | pima_data.errors.append( 67 | f"Can't find feature database {feature_fasta} or it is empty. Try --download?" 68 | ) 69 | else: 70 | pima_data.errors.append(f"Can't find feature database {feature_fasta}") 71 | 72 | 73 | def validate_blast(pima_data: PimaData): 74 | # skip conditions 75 | if pima_data.only_assemble: 76 | return 77 | 78 | if len(pima_data.feature_fastas) == 0: 79 | return 80 | 81 | if not pima_data.will_have_genome_fasta: 82 | return 83 | 84 | print_and_log( 85 | pima_data, 86 | "Validating blast utilities", 87 | pima_data.main_process_verbosity, 88 | pima_data.main_process_color, 89 | ) 90 | 91 | for utility in ["makeblastdb", "blastn", "bedtools"]: 92 | if validate_utility(pima_data, utility, f"{utility} isn't on the PATH."): 93 | command = utility + " -version" 94 | pima_data.versions[utility] = re.search( 95 | r"[0-9]+\.[0-9.]+", print_and_run(pima_data, command)[0] 96 | ).group(0) 97 | pima_data.analysis.append(["blast_feature_sets", pima_data]) 98 | 99 | 100 | def blast_feature_sets(pima_data: PimaData): 101 | """Find genes within both 'amr' and 'inc' databases within the assembly 102 | 103 | Generates a dictionary of dataframes, 1 dataframe for amr and 1 for inc 104 | """ 105 | 106 | print_and_log( 107 | pima_data, 108 | "BLASTing feature sets", 109 | pima_data.main_process_verbosity, 110 | pima_data.main_process_color, 111 | ) 112 | 113 | # Keep track of 
feature hits for reporting 114 | pima_data.features_dir = os.path.join(pima_data.output_dir, "features") 115 | 116 | # Check if results already exist 117 | if find_checkpoint(pima_data, pima_data.features_dir): 118 | print_and_log( 119 | pima_data, 120 | "BLASTing features had previously been run and finished successfully", 121 | pima_data.main_process_verbosity, 122 | pima_data.main_process_color, 123 | ) 124 | pima_data.did_blast_feature_sets = True 125 | found_feature_dirs = [ 126 | feature_dir.path 127 | for feature_dir in os.scandir(pima_data.features_dir) 128 | if feature_dir.is_dir() 129 | ] 130 | for feature_dir in found_feature_dirs: 131 | feature_name = os.path.basename(feature_dir) 132 | best_bed = os.path.join(feature_dir, "best.bed") 133 | parse_blast_features(pima_data, best_bed, feature_name) 134 | return 135 | 136 | os.makedirs(pima_data.features_dir) 137 | make_start_file(pima_data, pima_data.features_dir) 138 | 139 | # Make a blast database of the genome 140 | make_blast_database(pima_data, pima_data.genome_fasta) 141 | 142 | for feature_number in range(len(pima_data.feature_fastas)): 143 | feature_fasta = pima_data.feature_fastas[feature_number] 144 | feature_name = re.sub(r"\.f.*", "", os.path.basename(feature_fasta)) 145 | feature_dir = os.path.join(pima_data.features_dir, feature_name) 146 | blast_features(pima_data, feature_fasta, feature_dir, feature_name) 147 | pima_data.feature_dirs += [feature_dir] 148 | pima_data.feature_names += [feature_name] 149 | 150 | pima_data.did_blast_feature_sets = True 151 | make_finish_file(pima_data, pima_data.features_dir) 152 | 153 | 154 | def make_blast_database(pima_data: PimaData, database_fasta: str): 155 | 156 | if os.path.isfile(f"{database_fasta}.nin"): 157 | command = " ".join( 158 | [ 159 | 'blastdbcmd -info -db', 160 | database_fasta, 161 | ] 162 | ) 163 | result = subprocess.run(command, shell=True, capture_output=True, text=True) 164 | if result.returncode == 0: 165 | return 166 | 167 | 
print_and_log( 168 | pima_data, 169 | "Making a BLAST database for " + database_fasta, 170 | pima_data.sub_process_verbosity, 171 | pima_data.sub_process_color, 172 | ) 173 | std_prefix = re.sub(r"\.[^.]*$", "", database_fasta) 174 | stdout_file, stderr_file = std_files(std_prefix) 175 | command = " ".join( 176 | [ 177 | "makeblastdb -in", 178 | database_fasta, 179 | "-dbtype nucl -parse_seqids", 180 | "1>", stdout_file, 181 | "2>", stderr_file, 182 | ] 183 | ) 184 | print_and_run(pima_data, command) 185 | 186 | 187 | def blast_features( 188 | pima_data: PimaData, feature_fasta: str, feature_dir: str, feature_name: str 189 | ): 190 | # Make a directory for the new features 191 | os.makedirs(feature_dir) 192 | 193 | # BLASTn the feature set 194 | blast_output = os.path.join(feature_dir, "blast_output.tsv") 195 | print_and_log( 196 | pima_data, 197 | "BLASTing features against the assembly", 198 | pima_data.sub_process_verbosity, 199 | pima_data.sub_process_color, 200 | ) 201 | blastn_stdout, blastn_stderr = std_files(os.path.join(feature_dir, "blastn")) 202 | command = " ".join( 203 | [ 204 | "blastn -db", 205 | pima_data.genome_fasta, 206 | "-query", 207 | feature_fasta, 208 | "-perc_identity 95.0", 209 | '-outfmt "6', 210 | 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen"', 211 | "-evalue 1e-10 -out ", 212 | blast_output, 213 | "1>", blastn_stdout, 214 | "2>", blastn_stderr, 215 | ] 216 | ) 217 | print_and_run(pima_data, command) 218 | 219 | # Clean up the results into a handy BED file 220 | print_and_log( 221 | pima_data, 222 | "Converting feature hits to BED", 223 | pima_data.sub_process_verbosity, 224 | pima_data.sub_process_color, 225 | ) 226 | all_bed = os.path.join(feature_dir, "all.bed") 227 | command = " ".join( 228 | [ 229 | "cat", 230 | blast_output, 231 | "| awk -F '\\t' '($3 >= 95) && ($4 / $14 >= .90){OFS = \"\\t\";" 232 | + 'print $2,($9 < $10 ? $9 : $10),($9 < $10 ? $10 : $9),$1,$3/100,($9 < $10 ? 
"+" : "-")}\'', 233 | "| sort -k 1,1 -k 2,2n >", 234 | all_bed, 235 | ] 236 | ) 237 | print_and_run(pima_data, command) 238 | 239 | # Make clusters of hits 240 | print_and_log( 241 | pima_data, 242 | "Clustering feature hits", 243 | pima_data.sub_process_verbosity, 244 | pima_data.sub_process_color, 245 | ) 246 | merge_bed = os.path.join(feature_dir, "merge.bed") 247 | _, merge_stderr = std_files(os.path.join(feature_dir, "bedtools_merge")) 248 | command = " ".join( 249 | ["bedtools merge -d -30 -i", all_bed, "1>", merge_bed, "2>", merge_stderr] 250 | ) 251 | print_and_run(pima_data, command) 252 | 253 | # Pick the best hit for each cluster 254 | print_and_log( 255 | pima_data, 256 | "Finding the best hit for each feature cluster", 257 | pima_data.sub_process_verbosity, 258 | pima_data.sub_process_color, 259 | ) 260 | best_bed = os.path.join(feature_dir, "best.bed") 261 | command = " ".join( 262 | [ 263 | "bedtools intersect", 264 | "-a", 265 | all_bed, 266 | "-b", 267 | merge_bed, 268 | "-f .9 -F .9 -wao", 269 | "| awk '$7 != \".\"'", 270 | '| awk \'{OFS="\\t";locus=$7"\\t"$8"\\t"$9; if($5 > s[locus]){s[locus]=$5;id = sprintf("%.3f", $5); b[locus] = $1"\\t"$2"\\t"$3"\\t"$4"\\t"id"\\t"$6}}', 271 | "END{for(i in b){print b[i]}}'", 272 | "| sort -k 1,1 -k2,2n", 273 | ">" + best_bed, 274 | ] 275 | ) 276 | print_and_run(pima_data, command) 277 | parse_blast_features(pima_data, best_bed, feature_name) 278 | 279 | 280 | def parse_blast_features(pima_data: PimaData, best_bed: str, feature_name: str): 281 | # Keep the feature hits for later drawing. 
It may be empty, i.e., no feature hits 282 | try: 283 | best = pd.read_csv(filepath_or_buffer=best_bed, sep="\t", header=None) 284 | except FileNotFoundError: 285 | best = pd.DataFrame() 286 | except pd.errors.EmptyDataError: 287 | best = pd.DataFrame() 288 | except Exception as e: 289 | error_out( 290 | pima_data, f"Unexpected exception when processing BLAST features: {e}" 291 | ) 292 | 293 | pima_data.feature_hits[feature_name] = best 294 | -------------------------------------------------------------------------------- /Pima/modules/check_contamination.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import pandas as pd 5 | 6 | from Pima.pima_data import PimaData 7 | from Pima.utils.settings import Settings 8 | 9 | from Pima.utils.utils import ( 10 | print_and_log, 11 | print_and_run, 12 | validate_utility, 13 | validate_file_and_size_or_error, 14 | make_start_file, 15 | make_finish_file, 16 | std_files, 17 | find_checkpoint, 18 | ) 19 | 20 | def validate_contamination_check(pima_data: PimaData, settings: Settings): 21 | 22 | if not pima_data.contam_check: 23 | return 24 | 25 | print_and_log( 26 | pima_data, 27 | 'Validating contamination check', 28 | pima_data.main_process_verbosity, 29 | pima_data.main_process_color, 30 | ) 31 | 32 | if not pima_data.will_have_ont_fastq and pima_data.illumina_fastq is None: 33 | pima_data.errors.append('--contamination requires a set of FASTQ reads') 34 | 35 | if validate_utility(pima_data, 'kraken2', 'kraken2 is not on the PATH (required by --contamination-check).'): 36 | command = 'kraken2 --version' 37 | pima_data.versions['kraken2'] = re.search(r'[0-9]+\.[0-9.]+', print_and_run(pima_data, command)[0]).group(0) 38 | 39 | if os.path.isdir(settings.kraken_database_default): 40 | pima_data.kraken_database = settings.kraken_database_default 41 | elif os.path.isdir(settings.DockerPathKraken): 42 | pima_data.kraken_database = settings.DockerPathKraken 43 | 
else: 44 | pima_data.errors.append("No kraken2 database detected, try and run pima with --download. Exiting now.") 45 | 46 | pima_data.analysis.append(['fastq_contamination', pima_data, settings]) 47 | 48 | 49 | def fastq_contamination(pima_data: PimaData, settings: Settings): 50 | 51 | print_and_log( 52 | pima_data, 53 | 'Running Kraken2 to check for contamination', 54 | pima_data.main_process_verbosity, 55 | pima_data.main_process_color, 56 | ) 57 | 58 | pima_data.kraken_dir = os.path.join(pima_data.output_dir, 'contamination') 59 | 60 | if find_checkpoint(pima_data, pima_data.kraken_dir): 61 | print_and_log( 62 | pima_data, 63 | 'Using existing kraken2 report', 64 | pima_data.sub_process_verbosity, 65 | pima_data.sub_process_color, 66 | ) 67 | pima_data.did_kraken_fastq = True 68 | if os.path.isdir(os.path.join(pima_data.kraken_dir, "ont")): 69 | pima_data.kraken_fracs['ONT'] = read_kraken_report(os.path.join(pima_data.kraken_dir, "ont", "kraken.report")) 70 | 71 | if os.path.isdir(os.path.join(pima_data.kraken_dir, "illumina")): 72 | pima_data.kraken_fracs['Illumina'] = read_kraken_report(os.path.join(pima_data.kraken_dir, "illumina", "kraken.report")) 73 | return 74 | 75 | os.makedirs(pima_data.kraken_dir) 76 | make_start_file(pima_data, pima_data.kraken_dir) 77 | 78 | if not (pima_data.ont_fastq is None): 79 | print_and_log( 80 | pima_data, 81 | 'Running Kraken2 on ONT data', 82 | pima_data.sub_process_verbosity, 83 | pima_data.sub_process_color, 84 | ) 85 | ont_kraken_dir = os.path.join(pima_data.kraken_dir, 'ont') 86 | pima_data.kraken_fracs['ONT'] = kraken_fastq(pima_data, settings, pima_data.ont_fastq, ont_kraken_dir) 87 | 88 | if not (pima_data.illumina_fastq is None): 89 | print_and_log( 90 | pima_data, 91 | 'Running Kraken2 on Illumina data', 92 | pima_data.sub_process_verbosity, 93 | pima_data.sub_process_color, 94 | ) 95 | illumina_kraken_dir = os.path.join(pima_data.kraken_dir, 'illumina') 96 | pima_data.kraken_fracs['Illumina'] = 
kraken_fastq(pima_data, settings, pima_data.illumina_fastq, illumina_kraken_dir) 97 | 98 | pima_data.did_kraken_fastq = True 99 | make_finish_file(pima_data, pima_data.kraken_dir) 100 | 101 | 102 | def kraken_fastq(pima_data: PimaData, settings: Settings, fastq, fastq_dir: str): 103 | 104 | os.makedirs(fastq_dir) 105 | 106 | kraken_files = [os.path.join(fastq_dir, 'kraken.' + i) for i in ['report', 'out', 'class', 'unclass']] 107 | kraken_report, kraken_out, kraken_class, kraken_unclass = kraken_files 108 | kraken_stdout, kraken_stderr = std_files(os.path.join(fastq_dir, 'kraken')) 109 | 110 | fastq_arg = fastq 111 | if isinstance(fastq, list): 112 | fastq_arg = ' '.join(fastq) 113 | 114 | command = " ".join( 115 | [ 116 | 'kraken2', 117 | '--threads', str(pima_data.threads), 118 | '--report', kraken_report, 119 | '--out', kraken_out, 120 | '--class', kraken_class, 121 | '--unclass', kraken_unclass, 122 | '--db', pima_data.kraken_database, 123 | fastq_arg, 124 | '1>', kraken_stdout, '2>', kraken_stderr, 125 | ] 126 | ) 127 | print_and_run(pima_data, command) 128 | 129 | [validate_file_and_size_or_error(pima_data, i, i + ' missing after Kraken2', i + ' file is size 0 after Kraken2', ) 130 | for i in kraken_files] 131 | 132 | # Read in the Kraken fractions and pull out the useful parts 133 | kraken_fracs = read_kraken_report(kraken_report) 134 | pima_data.files_to_clean.append([kraken_class, kraken_unclass, kraken_out]) 135 | return(kraken_fracs) 136 | 137 | def read_kraken_report(kraken_report: str): 138 | kraken_fracs = pd.read_csv(kraken_report, delimiter = '\t', header = None) 139 | kraken_fracs.index = kraken_fracs.iloc[:, 4].values 140 | kraken_fracs = kraken_fracs.loc[kraken_fracs.iloc[:, 3].str.match('[UG]1?'), :] 141 | kraken_fracs = kraken_fracs.loc[(kraken_fracs.iloc[:, 0] >= 1) | (kraken_fracs.iloc[:, 3] == 'U'), :] 142 | kraken_fracs = kraken_fracs.iloc[:, [0, 1, 3, 5]] 143 | kraken_fracs.columns = ['Fraction', 'Reads', 'Level', 'Taxa'] 144 | 
kraken_fracs['Fraction'] = (kraken_fracs['Fraction'] / 100).round(4) 145 | kraken_fracs.sort_values(by = 'Fraction', inplace = True, ascending = False) 146 | kraken_fracs['Taxa'] = kraken_fracs['Taxa'].str.lstrip() 147 | return kraken_fracs -------------------------------------------------------------------------------- /Pima/modules/download_references.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import glob 5 | 6 | from Pima.pima_data import PimaData 7 | from Pima.utils.settings import Settings 8 | 9 | from Pima.utils.utils import ( 10 | print_and_log, 11 | print_and_run, 12 | validate_file_and_size, 13 | ) 14 | 15 | def validate_download(pima_data: PimaData, settings: Settings): 16 | 17 | if not pima_data.download: 18 | return 19 | 20 | pima_data.errors = [] 21 | 22 | pima_data.verbosity = 3 23 | 24 | download_databases(pima_data, settings) 25 | # We don't need this as part of the analysis pipeline, if user tries to donwload, run the download function and then end 26 | #pima_data.analysis.append(['download_databases', pima_data, settings]) 27 | 28 | print_and_log( 29 | pima_data, 30 | "Finished validating databases. 
Re-run analysis without '--download' argument", 31 | pima_data.main_process_verbosity, 32 | pima_data.main_process_color, 33 | ) 34 | sys.exit(0) 35 | 36 | 37 | def validate_organism(pima_data: PimaData): 38 | if pima_data.only_assemble: 39 | return 40 | 41 | if not pima_data.organism and not pima_data.list_organisms: 42 | return 43 | 44 | list_of_org = [ 45 | "Bacillus_anthracis", 46 | ] 47 | 48 | if pima_data.list_organisms: 49 | print_and_log( 50 | pima_data, 51 | f"List of available reference organisms:\n{' '.join(list_of_org)}", 52 | pima_data.main_process_verbosity, 53 | pima_data.main_process_color, 54 | ) 55 | sys.exit(0) 56 | 57 | print_and_log( 58 | pima_data, 59 | 'Validating organism', 60 | pima_data.main_process_verbosity, 61 | pima_data.main_process_color, 62 | ) 63 | 64 | if pima_data.organism and pima_data.reference_fasta: 65 | pima_data.errors.append("--organism and --reference-genome are mutually exclusive") 66 | 67 | if pima_data.organism and pima_data.mutation_region_bed: 68 | pima_data.errors.append("--organism and --mutation-regions are mutually exclusive") 69 | 70 | if not pima_data.organism in list_of_org: 71 | pima_data.errors.append( 72 | f"--organism {pima_data.organism} is not available, please specify a specifc --reference-genome and --mutations-regions" 73 | f" or run PiMA without a reference" 74 | ) 75 | return 76 | 77 | if not os.path.isdir(pima_data.reference_dir): 78 | os.mkdir(pima_data.reference_dir) 79 | 80 | pima_data.organism_dir = os.path.join(pima_data.reference_dir, pima_data.organism) 81 | if not os.path.isdir(pima_data.organism_dir): 82 | os.mkdir(pima_data.organism_dir) 83 | 84 | pima_data.reference_fasta = os.path.join(pima_data.organism_dir, "genome.fasta") 85 | pima_data.mutation_region_bed = os.path.join(pima_data.organism_dir, "confirmed_amr_mutations.bed") 86 | #pima_data.mutation_regions = os.path.join(pima_data.organism_dir, "mutation_regions.bed") 87 | pima_data.organism_amr_appendices = 
def download_organism(pima_data: PimaData, organism: str):
    """Fetch and normalize the reference genome for a supported organism.

    Currently only Bacillus anthracis (Ames Ancestor) is handled: the assembly
    is pulled from NCBI, decompressed, and its verbose FASTA deflines are
    renamed to the short contig names used elsewhere in the pipeline.
    """
    print_and_log(
        pima_data,
        f"Downloading references specific for {organism}",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    if organism != "Bacillus_anthracis":
        return

    genome_temp = os.path.join(pima_data.organism_dir, "genome_temp.fasta")
    genome = os.path.join(pima_data.organism_dir, "genome.fasta")

    # Download the gzipped assembly from NCBI and decompress it in place.
    fetch_cmd = " ".join(
        [
            "wget -O",
            f"{genome_temp}.gz",
            "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/445/GCA_000008445.1_ASM844v1/GCA_000008445.1_ASM844v1_genomic.fna.gz",
            "1> /dev/null 2> /dev/null;",
            "gunzip", f"{genome_temp}.gz",
        ]
    )
    print_and_run(pima_data, fetch_cmd)

    # Map the verbose NCBI deflines onto the short names the rest of the
    # pipeline expects (NOTE(review): ">pX01"/">pX02" use a zero, not the
    # letter O, and are referenced project-wide — do not "correct" them).
    header_map = {
        ">AE017334.2 Bacillus anthracis str. 'Ames Ancestor', complete genome": ">chromosome",
        ">AE017336.2 Bacillus anthracis str. 'Ames Ancestor' plasmid pXO1, complete sequence": ">pX01",
        ">AE017335.3 Bacillus anthracis str. 'Ames Ancestor' plasmid pXO2, complete sequence": ">pX02",
    }
    with open(genome_temp, "r") as raw, open(genome, "w") as cleaned:
        for verbose_line in raw:
            for verbose_name, short_name in header_map.items():
                if verbose_name in verbose_line:
                    verbose_line = verbose_line.replace(verbose_name, short_name)
            cleaned.write(verbose_line)

    os.remove(genome_temp)
def validate_evaluate_assembly(pima_data: PimaData):
    """Queue the assembly-evaluation step when an ONT assembly is expected."""
    # Nothing to evaluate when assembly is disabled or no ONT assembly is coming.
    if pima_data.no_assembly or not pima_data.will_have_ont_assembly:
        return

    pima_data.analysis.append(['check_for_small_contigs_and_fragmentation', pima_data])
PimaData): 21 | """ 22 | Will run on any assembly required, including denovo ONT, ONT polished, and/or Illumina polished assemblies 23 | Queue up for final step after all assembly manipulation steps are complete 24 | """ 25 | print_and_log( 26 | pima_data, 27 | "Evaluating assembly", 28 | pima_data.main_process_verbosity, 29 | pima_data.main_process_color, 30 | ) 31 | genome_sizes = pima_data.genome_fasta.replace(".fasta", ".sizes") 32 | command = " ".join( 33 | ["faidx -i chromsizes", pima_data.genome_fasta, ">", genome_sizes] 34 | ) 35 | print_and_run(pima_data, command) 36 | 37 | assembly_info = pd.read_csv(genome_sizes, sep="\t", header=None) 38 | assembly_info.columns = ["contig", "length"] 39 | pima_data.contig_sizes = assembly_info 40 | 41 | # Take a look at the number of contigs, their sizes, and circularity. Warn if things don't look good 42 | if assembly_info.shape[0] > 4: 43 | warning = f"Assembly produced {assembly_info.shape[0]} contigs, more than ususally expected; assembly may be fragmented." 44 | print_and_log( 45 | pima_data, warning, pima_data.warning_verbosity, pima_data.warning_color 46 | ) 47 | pima_data.assembly_notes = pd.concat([pima_data.assembly_notes, pd.Series(warning, dtype='object')]) 48 | small_contigs = assembly_info.loc[assembly_info["length"] <= 3000, :] 49 | 50 | if small_contigs.shape[0] > 0: 51 | warning = f"Assembly produced {small_contigs.shape[0]} small contigs ({', '.join(small_contigs['contig'])}); assembly may include spurious sequences." 
def validate_ont_fastq(pima_data: PimaData, settings: Settings):
    """Validate the single-sample ONT FASTQ input and queue its info step."""
    # No ONT input supplied: nothing to validate.
    if not pima_data.ont_fastq:
        return

    # A directory here almost certainly means the user forgot '--multiplexed'.
    if os.path.isdir(pima_data.ont_fastq):
        pima_data.errors.append(f"The provided '--ont-fastq' is a directory, did you mean to include the '--multiplexed' flag?\nOffending input: {pima_data.ont_fastq}")
        return

    print_and_log(
        pima_data,
        "Validating ONT FASTQ",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    file_ok = validate_file_and_size(
        pima_data, the_file=pima_data.ont_fastq, min_size=1000
    )
    if not file_ok:
        pima_data.errors.append(f"Input ONT FASTQ file {pima_data.ont_fastq} cannot be found")

    pima_data.ont_fastq = os.path.realpath(pima_data.ont_fastq)
    pima_data.will_have_ont_fastq = True
    pima_data.analysis.append(["info_given_ont_fastq", pima_data, settings])
def validate_genome_estimate(pima_data: PimaData, settings: Settings):
    """Estimate expected genome size from single-copy gene coverage.

    Runs only when assembling from ONT reads with --genome-size 'estimate';
    all other configurations return without side effects.
    """
    # Skip when assembly is disabled or a genome FASTA is already in hand.
    if pima_data.no_assembly or pima_data.genome_fasta is not None:
        return

    # On --resume with an existing ont_assembly checkpoint, let the assembly
    # module re-initiate the assembly info instead.
    if pima_data.resume and find_checkpoint(
        pima_data, os.path.join(pima_data.output_dir, "ont_assembly")
    ):
        return

    if not pima_data.will_have_ont_fastq:
        return

    if pima_data.genome_assembly_size != "estimate":
        return

    print_and_log(
        pima_data,
        "Estimating expected genome size using median single copy gene coverages",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )
    median_cov = estimate_genome_size(pima_data, settings)
    summary = (
        f"Estimated genome size: {round(pima_data.genome_assembly_raw_size)}\t"
        f"Median coverage: {round(median_cov,1)}"
    )
    print_and_log(
        pima_data,
        summary,
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )
def info_given_ont_fastq(pima_data: PimaData, settings: Settings):
    """Collect ONT FASTQ stats, estimate genome size, and downsample if warranted."""
    info_ont_fastq(pima_data, settings)
    validate_genome_estimate(pima_data, settings)

    # Warn when the read N50 falls at or below the recommended minimum.
    if pima_data.ont_n50 <= pima_data.ont_n50_min:
        warning = f"ONT N50 ({pima_data.ont_n50}) is less than the recommended minimum ({pima_data.ont_n50_min})."
        add_warning(pima_data, warning)
        pima_data.assembly_notes = pd.concat(
            [pima_data.assembly_notes, pd.Series(warning, dtype='object')]
        )

    # Downsampling needs a target genome size to compute coverage against.
    if not pima_data.genome_assembly_size:
        print_and_log(
            pima_data,
            "Cannot downsample since --genome-size was not provided",
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )
        return

    # Only downsample when current coverage comfortably exceeds the target.
    if pima_data.genome_assembly_raw_size is not None:
        coverage_now = pima_data.ont_raw_bases / pima_data.genome_assembly_raw_size
        if coverage_now >= pima_data.assembly_coverage + 1:
            downsample_ont_fastq(pima_data)
            return

    print_and_log(
        pima_data,
        "No downsampling of reads performed",
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
def info_illumina_fastq(pima_data: PimaData) :
    """Compute mean read length, read count, and total bases for Illumina FASTQs."""
    print_and_log(
        pima_data,
        'Getting Illumina FASTQ info',
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    pima_data.illumina_length_mean = 0
    pima_data.illumina_read_count = 0
    pima_data.illumina_bases = 0

    # Gzipped inputs are streamed through gunzip instead of cat.
    opener = 'cat'
    if re.search(r'\.(gz|gzip)$', pima_data.illumina_fastq[0]):
        opener = 'gunzip -c'

    for r_fastq in pima_data.illumina_fastq:
        # awk emits: mean_length <TAB> read_count <TAB> total_bases
        command = ' '.join(
            [
                opener,
                r_fastq,
                '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\'',
            ]
        )
        stats = [float(i) for i in re.split(r'\t', print_and_run(pima_data, command)[0])]
        pima_data.illumina_length_mean += stats[0]
        pima_data.illumina_read_count += int(stats[1])
        pima_data.illumina_bases += int(stats[2])

    # Average the per-file means, then format total bases for the report
    # (format_kmg presumably renders k/M/G units — see utils).
    pima_data.illumina_length_mean /= len(pima_data.illumina_fastq)
    pima_data.illumina_bases = format_kmg(pima_data.illumina_bases, decimals = 1)
def validate_pilon(pima_data: PimaData):
    """Check pilon prerequisites and queue the pilon polishing step."""
    # Applies only when assembling, a genome is expected, Illumina reads are
    # present, and pilon was chosen as the Illumina polisher.
    if pima_data.no_assembly:
        return
    if not (pima_data.will_have_genome_fasta and pima_data.illumina_fastq):
        return
    if pima_data.illumina_polisher != "pilon":
        return

    print_and_log(
        pima_data,
        'Validating Pilon and memory arguments',
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    # Record tool versions while confirming each utility is on the PATH.
    for tool in ('minimap2', 'pilon'):
        if validate_utility(pima_data, tool, f"{tool} is not on the PATH (required by --illumina-fastq)."):
            version_output = print_and_run(pima_data, f"{tool} --version")[0]
            pima_data.versions[tool] = re.search(r'[0-9]+\.[0-9.]+', version_output).group(0)

    pima_data.analysis.append(["pilon_assembly", pima_data])
def polypolish_assembly(pima_data: PimaData):
    """Polish the current genome assembly with Illumina reads using polypolish.

    Resumes from a prior successful run when a checkpoint exists; otherwise
    maps the Illumina reads with bwa (all alignments), filters the paired BAMs
    when two read files are present, runs polypolish (careful mode below 25X
    mean coverage), and repairs contig names into the final assembly.fasta.
    """
    print_and_log(
        pima_data,
        'Running polypolish on genome assembly',
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    pima_data.illumina_polish_dir = os.path.join(pima_data.output_dir, 'illumina_polish')

    ## Check if polypolish has been run before to completion
    if find_checkpoint(pima_data, pima_data.illumina_polish_dir):
        pima_data.genome_fasta = os.path.join(pima_data.illumina_polish_dir, 'assembly.fasta')
        pima_data.load_genome()

        print_and_log(
            pima_data,
            'Polypolish had previously been run and finished successfully',
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )
        pima_data.did_polypolish_ont_assembly = True
        pima_data.files_to_clean.extend(
            list(pathlib.Path(pima_data.illumina_polish_dir).glob("*.bam"))
        )
        pima_data.files_to_clean.append(os.path.join(pima_data.illumina_polish_dir, 'polypolish.fasta'))
        return

    os.makedirs(pima_data.illumina_polish_dir)
    make_start_file(pima_data, pima_data.illumina_polish_dir)

    # Map illumina reads onto the assembly.
    # We need to do this again because polypolish needs separate bam files
    # and all alignments from bwa.
    print_and_log(
        pima_data,
        'Mapping Illumina reads to assembly',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )

    # This gets turned into 2 files by the bwa function if there are R1/R2 files provided
    polypolish_bam = os.path.join(pima_data.illumina_polish_dir, 'mapping.bam')
    bams = bwa_mem_all_aln_illumina(pima_data, pima_data.genome_fasta, pima_data.illumina_fastq, polypolish_bam)

    polypolish_cmd = "polypolish polish"
    # If mean coverage is low then give some warning & use careful mode.
    if pima_data.mean_coverage['Illumina'] < 25:
        # Fixed message (stray ')' removed).
        warning = f"Mean illumina coverage ({pima_data.mean_coverage['Illumina']}X) is below 25X, using careful mode."
        # Bug fix: add_warning is the module-level helper (as used in
        # pilon_assembly), not a PimaData method; the original
        # `pima_data.add_warning(warning)` raised AttributeError.
        add_warning(pima_data, warning)
        pima_data.assembly_notes = pd.concat([pima_data.assembly_notes, pd.Series(warning, dtype = 'object')])
        polypolish_cmd = "polypolish polish --careful"

    # Actually run polypolish
    print_and_log(
        pima_data,
        'Running Polypolish',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )

    pfilt_stdout, pfilt_stderr = std_files(os.path.join(pima_data.illumina_polish_dir, 'poly_filter'))
    polypolish_prefix = os.path.join(pima_data.illumina_polish_dir, 'polypolish')
    polished_fasta = polypolish_prefix + '.fasta'

    if len(bams) == 2:
        # Paired reads: polypolish filter drops misplaced alignments first.
        filt_bams = [re.sub(r"\.bam", r"_filt.bam", x) for x in bams]
        command = " ".join(
            [
                "polypolish filter",
                "--in1", bams[0],
                "--in2", bams[1],
                "--out1", filt_bams[0],
                "--out2", filt_bams[1],
                "1>", pfilt_stdout, "2>", pfilt_stderr,
            ]
        )
        print_and_run(pima_data, command)
        validate_file_and_size_or_error(pima_data, filt_bams[0], 'Polypolish filtered bam', 'cannot be found after filtering', 'is empty')
        _, polish_stderr = std_files(os.path.join(pima_data.illumina_polish_dir, 'polypolish'))
        command = " ".join(
            [
                polypolish_cmd,
                pima_data.genome_fasta,
                " ".join(filt_bams),
                "1>", polished_fasta,
                "2>", polish_stderr,
            ]
        )
        print_and_run(pima_data, command)
        pima_data.files_to_clean.extend(bams + filt_bams)

    else:
        # We have only a single fastq file so we can't filter the alignments.
        _, polish_stderr = std_files(os.path.join(pima_data.illumina_polish_dir, 'polypolish'))
        command = " ".join(
            [
                polypolish_cmd,
                pima_data.genome_fasta,
                " ".join(bams),
                "1>", polished_fasta,
                "2>", polish_stderr,
            ]
        )
        print_and_run(pima_data, command)
        pima_data.files_to_clean.extend(bams)
    validate_file_and_size_or_error(pima_data, polished_fasta, 'Polypolished assembly', 'cannot be found after running polypolish', 'is empty')

    print_and_log(
        pima_data,
        'Repairing contig names after polypolish',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    # NOTE(review): the awk strips "_pilon" suffixes, mirroring pilon_assembly;
    # confirm polypolish actually emits that suffix on contig names.
    pima_data.genome_fasta = os.path.join(pima_data.illumina_polish_dir, 'assembly.fasta')
    command = " ".join(
        [
            'cat',
            polished_fasta,
            '| awk \'{if($0 ~ /^>/){gsub("_pilon", "", $0)}print}\'',
            '>', pima_data.genome_fasta,
        ]
    )
    print_and_run(pima_data, command)
    validate_file_and_size_or_error(pima_data, pima_data.genome_fasta, 'Genome assembly',
                                    'cannot be found after fixing names', 'is empty')
    pima_data.files_to_clean.append(polished_fasta)
    make_finish_file(pima_data, pima_data.illumina_polish_dir)
    # Track for the report (fixed typo: was did_polypolyish_ont_assembly,
    # which left the real flag unset on a fresh run).
    pima_data.did_polypolish_ont_assembly = True
os.path.join(pima_data.output_dir, 'illumina_polish') 233 | 234 | ## Check if pilon has been run before to completion 235 | if find_checkpoint(pima_data, pima_data.illumina_polish_dir): 236 | pima_data.genome_fasta = os.path.join(pima_data.illumina_polish_dir, 'assembly.fasta') 237 | pilon_bam = os.path.join(pima_data.illumina_polish_dir, 'mapping.bam') 238 | pima_data.files_to_clean.append(pilon_bam) 239 | pima_data.load_genome() 240 | 241 | print_and_log( 242 | pima_data, 243 | 'Pilon had previously been run and finished successfully', 244 | pima_data.main_process_verbosity, 245 | pima_data.main_process_color, 246 | ) 247 | pima_data.did_pilon_ont_assembly = True 248 | return 249 | 250 | os.makedirs(pima_data.illumina_polish_dir) 251 | make_start_file(pima_data, pima_data.illumina_polish_dir) 252 | 253 | # Map illumina reads onto the assembly 254 | print_and_log( 255 | pima_data, 256 | 'Mapping Illumina reads to assembly', 257 | pima_data.sub_process_verbosity, 258 | pima_data.sub_process_color, 259 | ) 260 | pilon_bam = os.path.join(pima_data.illumina_polish_dir, 'mapping.bam') 261 | pima_data.files_to_clean.append(pilon_bam) 262 | 263 | # See what mapping method to use - bwa aln or minimap 2 264 | if pima_data.illumina_length_mean <= 50: 265 | bwa_short_illumina_fastq_and_sort(pima_data, pima_data.genome_fasta, pima_data.illumina_fastq, pilon_bam) 266 | else: # We have longer short reads 267 | minimap_and_sort(pima_data, pima_data.genome_fasta, pilon_bam, pima_data.illumina_fastq, ont=False) 268 | 269 | 270 | # Figure out the depth here. If it's too low, give some sort of warning? 
271 | coverage_tsv = os.path.join(pima_data.illumina_polish_dir, 'coverage.tsv') 272 | command = " ".join( 273 | [ 274 | 'samtools depth -a', 275 | pilon_bam, 276 | '| awk \'{s += $3; c++}END{printf "%s\\t%i\\t%.0f\\n", i, c, (s / c)}\'', 277 | '>', coverage_tsv, 278 | ] 279 | ) 280 | print_and_run(pima_data, command) 281 | validate_file_and_size_or_error(pima_data, coverage_tsv, 'Coverage TSV', 'cannot be found after samtools', 'is empty') 282 | pilon_coverage = pd.read_csv(coverage_tsv, header = None, index_col = None, sep = '\t').iloc[0,2] 283 | 284 | # If mean coverage is low then give some warning 285 | if pilon_coverage < pima_data.pilon_coverage_min: 286 | warning = f"Illumina coverage for Pilon {pilon_coverage}X is below the recommended minimum ({pima_data.pilon_coverage_min}X), we recommend using polypolish instead." 287 | add_warning(pima_data, warning) 288 | 289 | # Actually run pilon 290 | print_and_log( 291 | pima_data, 292 | 'Running Pilon', 293 | pima_data.sub_process_verbosity, 294 | pima_data.sub_process_color, 295 | ) 296 | pilon_stdout, pilon_stderr = std_files(os.path.join(pima_data.illumina_polish_dir, 'pilon')) 297 | pilon_prefix = os.path.join(pima_data.illumina_polish_dir, 'pilon') 298 | polished_fasta = pilon_prefix + '.fasta' 299 | bam_option = '--frags' 300 | if len(pima_data.illumina_fastq) == 1: 301 | bam_option = '--unpaired' 302 | 303 | command = " ".join( 304 | [ 305 | 'pilon', 306 | '-Xms4g -Xmx4g', 307 | '--genome', pima_data.genome_fasta, 308 | bam_option, pilon_bam, 309 | '--output', pilon_prefix, 310 | '1>', pilon_stdout, '2>', pilon_stderr, 311 | ] 312 | ) 313 | print_and_run(pima_data, command) 314 | validate_file_and_size_or_error(pima_data, polished_fasta, 'Pilon FASTA', 'cannot be found after pilon', 'is empty') 315 | 316 | print_and_log( 317 | pima_data, 318 | 'Repairing contig names after Pilon', 319 | pima_data.sub_process_verbosity, 320 | pima_data.sub_process_color, 321 | ) 322 | pima_data.genome_fasta = 
class barcode_data:
    """Bookkeeping for one demultiplexed barcode: its FASTQ files and sizes."""

    def __init__(self, barcode_id: str, barcode_root_path: str | list, barcode_fastq_list: list, barcode_fastq_paths: list, barcode_size_bytes: int):
        # Identity and file inventory for this barcode.
        self.barcode_id = barcode_id
        self.barcode_root_path = barcode_root_path
        self.barcode_fastq_list = barcode_fastq_list
        self.barcode_fastq_paths = barcode_fastq_paths
        self.barcode_size_bytes = barcode_size_bytes
        # Filled in later once read data are measured.
        self.barcode_size_bp = None

    def update_barcode(self, barcode_root_path: str, barcode_fastq_list: list, barcode_fastq_paths: list, barcode_size_bytes: int):
        """Merge another directory's worth of FASTQs into this barcode."""
        self.barcode_root_path = [self.barcode_root_path, barcode_root_path]
        self.barcode_fastq_list = self.barcode_fastq_list + barcode_fastq_list
        self.barcode_fastq_paths = self.barcode_fastq_paths + barcode_fastq_paths
        self.barcode_size_bytes += barcode_size_bytes

    def create_concat_fastq(self, pima_data, fastq_path:str = None):
        """Point pima_data.ont_fastq at this barcode's reads, concatenating if needed."""
        gzipped = bool(re.search(r'\.(gz|gzip)$', self.barcode_fastq_list[0]))

        if len(self.barcode_fastq_list) == 1:
            if fastq_path:
                # For nextflow multiplexing: symlink under the requested name,
                # preserving a .gz suffix when the source is compressed.
                if gzipped:
                    fastq_path = fastq_path + ".gz"
                os.symlink(self.barcode_fastq_paths[0], fastq_path)
                pima_data.ont_fastq = fastq_path
            else:
                pima_data.ont_fastq = self.barcode_fastq_paths[0]
            return

        print_and_log(
            pima_data,
            "Concatenating barcode fastq files",
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )

        if gzipped:
            pima_data.ont_fastq = pima_data.ont_fastq + ".gz"

        concat_cmd = " ".join(
            [
                "cat",
                " ".join(self.barcode_fastq_paths),
                f"> {pima_data.ont_fastq}",
            ]
        )
        print_and_run(pima_data, concat_cmd)

    def report_multiplex_sample(self):
        """Return the log line announcing this barcode's PiMA run."""
        return f"Running PiMA on {self.barcode_id}"
def validate_multiplex_fastq(pima_data: PimaData):
    """Sanity-check CLI options for a multiplexed ONT run.

    Appends human-readable problems to pima_data.errors; on success marks
    will_have_ont_fastq and normalizes ont_fastq/output_dir to real paths.
    """
    if not pima_data.multiplexed:
        return

    if pima_data.resume and pima_data.nextflow:
        pima_data.errors.append("--resume does not currently work with nextflow multiplexing. If the assemblies were completed in the previous attempt, you can resume the multiplex run in serial by removing '--nextflow' or resume each sample independently without '--multiplexed', otherwise just use '--overwrite'")

    if not pima_data.ont_fastq:
        pima_data.errors.append("--multiplexed requires that a directory of FASTQ files or directories of FASTQ files be given")
        # BUG FIX: without an input path the os.path.isfile()/realpath() calls
        # below raised TypeError instead of reporting the error cleanly.
        return

    if pima_data.illumina_fastq:
        # fixed flag name in the message: the option is --multiplexed
        pima_data.errors.append("--multiplexed does not currently work with illumina data. Exiting")

    if pima_data.barcode_min_fraction >= 1:
        pima_data.errors.append(f"--barcode_min_fraction is greater than 1, did you mean to use {pima_data.barcode_min_fraction / 100}?")

    if pima_data.genome_assembly_size is not None and pima_data.genome_assembly_size != "estimate":
        print_and_log(
            pima_data,
            f"Using the same --genome_size {pima_data.genome_assembly_size} for every sample in the multiplex run. If you do not expect all samples to have the same genome size (+/-10%), please cancel (ctrl+c) and re-run using '--genome-size estimate' or leave it blank (prevents downsampling)'",
            pima_data.warning_verbosity,
            pima_data.warning_color,
        )

    if os.path.isfile(pima_data.ont_fastq) and pima_data.barcode_kit:
        # A single not-yet-demultiplexed FASTQ is not supported; currently we
        # just error out and point the user at dorado.
        message = ("You provided a single fastq file and indicated it is multiplexed. "
                   "PiMA currently doesn't demultiplex a fastq file since this file type is not common. "
                   "If you need to demultiplex, we recommend using dorado. "
                   "Please let us know if this is a feature you'd like to see added. "
                   "Exiting now"
                   )
        pima_data.errors.append(message)

    pima_data.will_have_ont_fastq = True
    pima_data.ont_fastq = os.path.realpath(pima_data.ont_fastq)
    pima_data.output_dir = os.path.realpath(pima_data.output_dir)
def identify_multiplexed_fastq_files(pima_data: PimaData):
    """Discover per-barcode FASTQ inputs and record them in pima_data.barcodes.

    Two layouts are supported: a flat directory of per-sample FASTQ files, or
    a basecaller-style tree of per-barcode directories (e.g. fastq_pass/barcode01).
    Barcodes holding less than barcode_min_fraction of the total bytes are dropped.
    """
    print_and_log(
        pima_data,
        "Starting Multiplex Analysis",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    discovered = {}
    total_bytes = 0

    entries = os.listdir(pima_data.ont_fastq)
    flat_layout = (
        not any(os.path.isdir(os.path.join(pima_data.ont_fastq, e)) for e in entries)
        and any(re.search(r"fastq", e) for e in entries)
    )

    if flat_layout:
        # One FASTQ file per sample, directly in the input directory
        for fastq in entries:
            if re.search(r"\.(gz|gzip)$", fastq):
                # strip both the .gz and the .fastq extensions
                sample = os.path.splitext(os.path.splitext(fastq)[0])[0]
            else:
                sample = os.path.splitext(os.path.basename(fastq))[0]
            nbytes = os.path.getsize(os.path.join(pima_data.ont_fastq, fastq))
            discovered[sample] = barcode_data(
                barcode_id=sample,
                barcode_root_path=pima_data.ont_fastq,
                barcode_fastq_list=[fastq],
                barcode_fastq_paths=[os.path.join(pima_data.ont_fastq, fastq)],
                barcode_size_bytes=nbytes,
            )
            total_bytes += nbytes
    else:
        # Walk the tree; pass/fail dirs for the same barcode name get merged
        for root, dirs, files in os.walk(pima_data.ont_fastq):
            if any(re.search(r"fail", d) for d in dirs):
                print_and_log(
                    pima_data,
                    "There are directories with the string 'fail' in the name. We will use both passing and failing reads for each sample. If you wish to use only passing reads, please cancel (ctrl+c) and re-run giving just the fastq_pass directory as input",
                    pima_data.warning_verbosity,
                    pima_data.warning_color,
                )

            if not files:
                continue
            # only the first file is inspected — assumes homogeneous dirs
            if not re.search(r"fastq", files[0]) or re.search(r"unclassified", root):
                continue

            sample = os.path.basename(root)
            nbytes = sum(os.path.getsize(os.path.join(root, f)) for f in files)
            paths = [os.path.join(root, f) for f in files]
            if sample in discovered:
                discovered[sample].update_barcode(root, files, paths, nbytes)
            else:
                discovered[sample] = barcode_data(
                    barcode_id=sample,
                    barcode_root_path=root,
                    barcode_fastq_list=files,
                    barcode_fastq_paths=paths,
                    barcode_size_bytes=nbytes,
                )
            total_bytes += nbytes

    # Drop barcodes holding too small a share of the data
    ignored_barcodes = {}
    for bc in list(discovered.values()):
        share = bc.barcode_size_bytes / total_bytes
        if share < pima_data.barcode_min_fraction:
            ignored_barcodes[bc.barcode_id] = share
            del discovered[bc.barcode_id]

    print_and_log(
        pima_data,
        f"Running PiMA on barcodes: {', '.join(discovered.keys())}",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    if ignored_barcodes:
        message = (
            "The following barcodes were found in the input directory but were NOT analyzed "
            f"because they contained less than {pima_data.barcode_min_fraction*100}% (default=0.025 [2.5%]) of the fastq data:\n"
            "If you need to change the min_fraction, please re-run pima with the following flag '--barcode_min_fraction '\n"
        )
        for sample, share in ignored_barcodes.items():
            message += f"{sample:<15} {share:>.1%}\n"
        print_and_log(
            pima_data,
            message,
            pima_data.warning_verbosity,
            pima_data.warning_color,
        )

    pima_data.barcodes = discovered
def initialize_multiplex_analysis(pima_data: PimaData, settings: Settings):
    """Dispatch one PiMA run per barcode, via Nextflow or serially in-process.

    Fills pima_data.barcodes via identify_multiplexed_fastq_files, then either
    writes a Nextflow sample sheet and hands off, or loops over the barcodes.
    """
    identify_multiplexed_fastq_files(pima_data)

    if pima_data.nextflow or isinstance(pima_data.nextflow, str):
        validate_nextflow(pima_data)

        # NOTE(review): passing modules/environments through to Nextflow on the
        # cluster is fragile, so nextflow.config is generated from a template
        # using the conda environment PiMA itself is running in.
        try:
            conda_env = os.environ['CONDA_PREFIX']
        except KeyError:
            conda_env = "None"

        # Use the activate script belonging to the parent conda install
        try:
            activate_sh = os.environ['CONDA_EXE'].replace(r"/conda", "/activate")
        except KeyError:
            activate_sh = "None"

        nextflow_dir = os.path.join(settings.pima_path, "nextflow_parallelization")
        nextflow_config_template = os.path.join(nextflow_dir, "nextflow.config.template")
        user_nextflow_config = os.path.join(nextflow_dir, "nextflow.config")
        find_replace = {
            "conda = None": f"conda = '{conda_env}'",
            "beforeScript = None": f"beforeScript = 'source {activate_sh}'",
        }
        with open(nextflow_config_template, "rt") as fin:
            with open(user_nextflow_config, "wt") as fout:
                for line in fin:
                    for key in find_replace:
                        if key in line:
                            line = line.replace(key, find_replace[key])
                    fout.write(line)

        if isinstance(pima_data.nextflow, str):
            # strip quoting so the args can be spliced into the command line
            nextflow_args = pima_data.nextflow.replace("'", "").replace('"', '')
        else:
            nextflow_args = ""

        print_and_log(
            pima_data,
            "Handing off multiplexing to Nextflow",
            pima_data.main_process_verbosity,
            pima_data.main_process_color,
        )

        stop_logging(pima_data, "Sample specific logs are found in their respective directories, closing multiplex log now.")

        nf_file = os.path.join(pima_data.output_dir, "nf_singplex_inputs.csv")

        # The stripped singleplex command is identical for every barcode;
        # build it once instead of once per loop iteration.
        updated_cmd = strip_pima_cmd(pima_data, pima_data.run_command)
        with open(nf_file, "w") as nf_handle:
            for barcode in pima_data.barcodes.keys():
                barcode_pima_data = copy.deepcopy(pima_data)
                barcode_pima_data.output_dir = os.path.join(pima_data.output_dir, barcode)
                barcode_pima_data.ont_fastq = os.path.join(pima_data.output_dir, f"{barcode}.fastq")
                barcode_pima_data.barcodes[barcode].create_concat_fastq(barcode_pima_data, barcode_pima_data.ont_fastq)
                nf_handle.write(
                    f"{barcode},"
                    f"{barcode_pima_data.output_dir},"
                    f"{barcode_pima_data.ont_fastq},"
                    f"{updated_cmd}\n"
                )

        # Keep the nextflow workdir inside the pima output dir so a
        # successful run can be cleaned up wholesale afterwards.
        nextflow_stdout, nextflow_stderr = std_files(f"{pima_data.output_dir}/nextflow")
        command = " ".join(
            [
                "nextflow run",
                os.path.join(settings.pima_path, "nextflow_parallelization/main.nf"),
                "--sample_sheet",
                nf_file,
                "--output",
                pima_data.output_dir,
                "-w",
                os.path.join(pima_data.output_dir, "work"),
                nextflow_args,
                "1>",
                nextflow_stdout,
                "2>",
                nextflow_stderr,
            ]
        )
        print_and_run(pima_data, command, change_exe_dir = pima_data.output_dir)
        cleanup_nextflow(pima_data)

    # not using nextflow, running pima in serial
    else:
        stop_logging(pima_data, "Sample specific logs are found in their respective directories, closing multiplex log now.")
        for barcode in pima_data.barcodes.keys():
            barcode_pima_data = copy.deepcopy(pima_data)
            barcode_pima_data.output_dir = os.path.join(barcode_pima_data.output_dir, barcode)
            barcode_pima_data.logging_file = os.path.join(barcode_pima_data.output_dir, "pima.log")
            barcode_pima_data.ont_fastq = os.path.join(barcode_pima_data.output_dir, f"{barcode}.fastq")
            barcode_pima_data.multiplexed = None
            log_message = [("main", f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', f"{barcode_pima_data.barcodes[barcode].report_multiplex_sample()}")]
            modules.validate_output_dir(barcode_pima_data, settings, log_message)
            barcode_pima_data.barcodes[barcode].create_concat_fastq(barcode_pima_data)
            # FIX: dropped an unused `copy.deepcopy(settings)` local here; the
            # shared settings object is what actually gets passed through.
            Pima.pima.run_workflow(barcode_pima_data, settings)
def strip_pima_cmd(pima_data, system_args: str) -> str:
    """Rebuild the pima command line for a single-barcode (singleplex) run.

    Drops --multiplexed/--nextflow and the per-run --output/--ont-fastq/
    --threads flag+value pairs (nextflow supplies those), and absolutizes the
    --reference-genome/--mutation-regions paths so the command still works
    from the nextflow work directory.
    """
    #TODO: How should Illumina data be handled for a multiplexed run???
    # - Just need to add a sample_sheet mode

    params_to_change = ['--output', '--ont-fastq', '--threads']
    params_to_remove = ['--multiplexed', '--nextflow']
    if isinstance(pima_data.nextflow, str):
        # BUG FIX: the user-supplied nextflow argument string is plain text,
        # not a regex; re.sub() misbehaved on metacharacters such as '(' or '.'.
        system_args = system_args.replace(pima_data.nextflow, "")
    params_to_fix_path = ['--reference-genome', '--mutation-regions']

    param_iter = iter(system_args.split(" "))
    new_cmd = []
    for param in param_iter:
        if param in params_to_change:
            # drop the flag and its value; default guards against a trailing
            # flag with no value (previously raised StopIteration)
            next(param_iter, None)
        elif param in params_to_fix_path:
            new_cmd.append(param)
            value = next(param_iter, None)
            if value is not None:
                new_cmd.append(os.path.realpath(value))
        elif param in params_to_remove:
            continue
        else:
            new_cmd.append(param)
    return " ".join(new_cmd)
def cleanup_nextflow(pima_data):
    """Delete nextflow bookkeeping (.nextflow*) and the staging work dir.

    Called after a successful hand-off so only the per-barcode results
    remain in the output directory.
    """
    nf_temp_files = glob.glob(os.path.join(pima_data.output_dir, ".nextflow*"))
    nf_work_dir = os.path.join(pima_data.output_dir, "work")
    for entry in nf_temp_files:
        try:
            shutil.rmtree(entry)
        except NotADirectoryError:
            # plain files such as .nextflow.log
            os.remove(entry)

    # BUG FIX: the work dir may legitimately be absent (nothing staged, or a
    # previous cleanup already ran); an unconditional rmtree raised
    # FileNotFoundError in that case.
    if os.path.isdir(nf_work_dir):
        shutil.rmtree(nf_work_dir)
def medaka_ont_assembly(pima_data: PimaData):
    """Polish the draft ONT assembly with medaka and normalize contig names."""

    print_and_log(
        pima_data,
        "Running Medaka on ONT assembly",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    medaka_dir = os.path.join(pima_data.output_dir, "medaka")

    # Resume support: a finished checkpoint means the polished assembly is
    # already on disk.
    if find_checkpoint(pima_data, medaka_dir):
        print_and_log(
            pima_data,
            "Medaka had previously been run and finished successfully",
            pima_data.main_process_verbosity,
            pima_data.main_process_color,
        )
        pima_data.genome_fasta = os.path.join(medaka_dir, "assembly.fasta")
        pima_data.did_medaka_ont_assembly = True
        return

    os.makedirs(medaka_dir)
    make_start_file(pima_data, medaka_dir)

    print_and_log(
        pima_data,
        f"Starting medaka using model: '{pima_data.ont_model}'",
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )

    # Medaka throttles anything more than 2 threads due to poor scaling
    worker_threads = min(pima_data.threads, 2)
    consensus_fasta = os.path.join(medaka_dir, "consensus.fasta")
    medaka_stdout, medaka_stderr = std_files(os.path.join(medaka_dir, "medaka"))
    consensus_cmd = " ".join(
        [
            "medaka_consensus",
            "-i", pima_data.ont_fastq,
            "-d", pima_data.genome_fasta,
            "-o", medaka_dir,
            "-t", str(worker_threads),
            # -b 50 is much more efficient than the default -b 100 on the
            # cluster (~10x faster, ~1/10th the RAM)
            "-b 50",
            "-m", pima_data.ont_model,
            "1>", medaka_stdout,
            "2>", medaka_stderr,
        ]
    )
    print_and_run(pima_data, consensus_cmd)
    validate_file_and_size_or_error(
        pima_data,
        consensus_fasta,
        "Medaka FASTA",
        "cannot be found after Medaka",
        "is empty",
    )

    draft_bam = os.path.join(medaka_dir, "calls_to_draft.bam")

    print_and_log(
        pima_data,
        "Repairing contig names after Medaka",
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    pima_data.genome_fasta = os.path.join(medaka_dir, "assembly.fasta")
    # strip medaka's ":..." header suffixes and shorten "_segment" to "_"
    rename_cmd = " ".join(
        [
            'awk \'{if($0 ~ /^>/){gsub(":.*", "", $0);gsub("_segment", "_", $0)}print}\'',
            consensus_fasta,
            ">",
            pima_data.genome_fasta,
        ]
    )
    print_and_run(pima_data, rename_cmd)
    validate_file_and_size_or_error(
        pima_data,
        pima_data.genome_fasta,
        "Genome assembly",
        "cannot be found after fixing names",
        "is empty",
    )

    pima_data.load_genome()
    make_finish_file(pima_data, medaka_dir)
    pima_data.did_medaka_ont_assembly = True

    # Large intermediates we no longer need once polishing is done
    pima_data.files_to_clean.extend([draft_bam, draft_bam + ".bai"])
    pima_data.files_to_clean.append(os.path.join(medaka_dir, "consensus_probs.hdf"))
def determine_ont_model(pima_data: PimaData):
    """Ask medaka to resolve the basecalling model from the reads themselves.

    No-op when the model is already known (e.g. resolved during the
    info_fastq assessment step).  On success the resolved model name is
    stored and medaka polishing is scheduled; on failure a warning is
    recorded and polishing is skipped.
    """
    if pima_data.ont_model != "auto":
        return

    resolve_cmd = " ".join(
        [
            "medaka tools resolve_model --auto_model consensus",
            pima_data.ont_fastq,
        ]
    )
    proc = subprocess.run(resolve_cmd, shell=True, capture_output=True, text=True)

    if proc.returncode == 0:
        pima_data.ont_model = proc.stdout.strip()
        pima_data.analysis.append(["medaka_ont_assembly", pima_data])
        print_and_log(
            pima_data,
            f"Identified basecalling model: {pima_data.ont_model}",
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )
    else:
        warn = "Medaka could not determine the basecalling model used to generate the fastq files and '--ont-model' was not provided, continuing PiMA without medaka polishing."
        add_warning(pima_data, warn)
        pima_data.assembly_notes = pd.concat([pima_data.assembly_notes, pd.Series(warn, dtype='object')])
def validate_output_dir(pima_data: PimaData, settings: Settings, log_messages: list = None):
    """Validate the requested output directory and create it if appropriate.

    Queues startup log lines (the log file may not exist yet), records
    option conflicts in pima_data.errors, and otherwise hands off to
    make_outdir.
    """
    # BUG FIX: the old default `log_messages: list = []` was a shared mutable
    # default, so repeated calls (one per barcode in a multiplexed run)
    # accumulated every previous call's messages.  Use the None sentinel.
    if log_messages is None:
        log_messages = []

    log_messages.append(("main", f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', f"PiMA version: {settings.pima_version}"))
    log_messages.append(("main", f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', "Validating output dir"))

    if not pima_data.output_dir:
        pima_data.errors += ["No output directory given (--output)"]
    elif pima_data.overwrite and pima_data.resume:
        pima_data.errors += ["--overwrite and --resume are mutually exclusive"]
    elif os.path.exists(pima_data.output_dir) and not (
        pima_data.overwrite or pima_data.resume
    ):
        pima_data.errors += [
            "Output directory "
            + pima_data.output_dir
            + " already exists. Add --overwrite OR --resume to ignore"
        ]
    else:
        pima_data.output_dir = os.path.realpath(pima_data.output_dir)
        make_outdir(pima_data, log_messages)
def report_logs(pima_data, log_messages):
    # Replay messages that were queued before the log file existed.  Each
    # entry is either a bare string (logged at main level) or a
    # (level, timestamp, text) tuple with level "main" or "warn".
    for message in log_messages:
        if isinstance(message, str):
            print_and_log(
                pima_data,
                message,
                pima_data.main_process_verbosity,
                pima_data.main_process_color,
            )
        elif message[0] == "main":
            print_and_log(
                pima_data,
                message[2],
                pima_data.main_process_verbosity,
                pima_data.main_process_color,
                message[1],
            )
        elif message[0] == "warn":
            print_and_log(
                pima_data,
                message[2],
                pima_data.warning_verbosity,
                pima_data.warning_color,
                message[1],
            )
def call_plasmids(pima_data: PimaData, settings: Settings):
    """Search small contigs against the plasmid database and summarize hits.

    Pipeline: drop contigs > 500 kb, map the plasmid database against the
    remaining contigs with minimap2, convert SAM -> PSL, then run pChunks.R
    to produce plasmids.tsv.  Results land in pima_data.plasmids (DataFrame
    or None) and pima_data.plasmid_tsv.
    """
    print_and_log(
        pima_data,
        'Calling plasmids',
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    # Fall back to the database location baked into the Docker image
    if not validate_file_and_size(pima_data, pima_data.plasmid_database) and validate_file_and_size(pima_data, settings.DockerPathPlasmid):
        pima_data.plasmid_database = settings.DockerPathPlasmid

    # Make a directory for plasmid stuff
    pima_data.plasmid_dir = os.path.join(pima_data.output_dir, 'plasmids')
    if find_checkpoint(pima_data, pima_data.plasmid_dir):
        return
    os.makedirs(pima_data.plasmid_dir)
    make_start_file(pima_data, pima_data.plasmid_dir)

    # Take very large things out of the assembly. They aren't plasmids and
    # take a long time to run
    print_and_log(
        pima_data,
        'Finding contigs < 500000 bp',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    smaller_contigs_fasta = os.path.join(pima_data.plasmid_dir, 'small_contigs.fasta')
    # NOTE(review): '-n1' is passed to parallel twice; harmless, but looks
    # like a typo worth confirming against the original intent.
    command = " ".join(
        [
            'faidx -i chromsizes',
            pima_data.genome_fasta,
            '| awk \'($2 <= 500000){print $1}\'',
            '| parallel -n1 -n1 faidx', pima_data.genome_fasta, '>', smaller_contigs_fasta,
        ]
    )
    print_and_run(pima_data, command)

    # See if there is anything in the small contigs file; if not, we done
    # TODO - Add something to the report about no small contigs
    small_contigs = pima_data.load_fasta(smaller_contigs_fasta)
    if len(small_contigs) == 0:
        print_and_log(
            pima_data,
            'No contigs smaller than 500kb found, skipping plasmid search',
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )
        pima_data.did_call_plasmids = True
        pima_data.plasmids = None
        make_finish_file(pima_data, pima_data.plasmid_dir)
        return

    # Query plasmid sequences against the assembly using minimap2
    print_and_log(
        pima_data,
        'Running minimap2 against the plasmid database',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    plasmid_sam = os.path.join(pima_data.plasmid_dir, 'plasmid_hits.sam')
    _, minimap_stderr = std_files(os.path.join(pima_data.plasmid_dir, 'minimap'))
    command = " ".join(
        [
            'minimap2',
            '-k 20 -p .2 -a',
            '-t', str(pima_data.threads),
            smaller_contigs_fasta,
            pima_data.plasmid_database,
            '1>', plasmid_sam,
            '2>', minimap_stderr,
        ]
    )
    print_and_run(pima_data, command)
    validate_file_and_size_or_error(pima_data, plasmid_sam, 'Plasmid v. contig SAM', 'cannot be found', 'is empty')

    pima_data.files_to_clean.append(plasmid_sam)

    # Turn the SAM file in to a PSL file using the modified sam2psl script
    print_and_log(
        pima_data,
        'Converting the SAM file to a PSL file',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    plasmid_psl = os.path.join(pima_data.plasmid_dir, 'plasmid_hits.psl')
    sam2psl_stdout, sam2psl_stderr = std_files(os.path.join(pima_data.plasmid_dir, 'sam2psl'))
    path2sam2psl = os.path.join(settings.pima_path, "Pima", "accessory_scripts", "sam2psl.py")
    command = " ".join(
        [
            'python3',
            path2sam2psl,
            '-i', plasmid_sam,
            '-o', plasmid_psl,
            '1>', sam2psl_stdout, '2>', sam2psl_stderr,
        ]
    )
    print_and_run(pima_data, command)
    # BUG FIX: previously re-validated plasmid_sam here; the file we just
    # produced (and whose label this check carries) is the PSL.
    validate_file_and_size_or_error(pima_data, plasmid_psl, 'Plasmid v. contig PSL', 'cannot be found', 'is empty')

    # Make a BLAST database of the plasmid sequences
    make_blast_database(pima_data, pima_data.plasmid_database)

    # Pass the data onto pChunks
    print_and_log(
        pima_data,
        'Running pChunks',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    pima_data.pchunks_dir = os.path.join(pima_data.plasmid_dir, 'pChunks')

    if find_checkpoint(pima_data, pima_data.pchunks_dir):
        pima_data.did_call_plasmids = True
        new_plasmid_tsv = os.path.join(pima_data.plasmid_dir, 'plasmids.tsv')
        pima_data.plasmid_tsv = new_plasmid_tsv
        try:
            pima_data.plasmids = pd.read_csv(filepath_or_buffer = pima_data.plasmid_tsv, sep = '\t', header = 0)
        except Exception:
            # no (or unreadable) table means no plasmids were called
            pima_data.plasmids = None
        return

    os.makedirs(pima_data.pchunks_dir)

    pima_data.plasmid_tsv = os.path.join(pima_data.pchunks_dir, 'plasmids.tsv')
    stdout_file, stderr_file = std_files(os.path.join(pima_data.pchunks_dir, "pchunks"))
    path2pChunks = os.path.join(settings.pima_path, "Pima", "accessory_scripts", "pChunks.R")
    command = " ".join(
        [
            'Rscript',
            path2pChunks, '--plasmid-psl', plasmid_psl,
            '--output', pima_data.pchunks_dir,
            '--no-amr', '--no-inc',
            '--plasmid-database', pima_data.plasmid_database,
            '--threads', str(pima_data.threads),
            '1>', stdout_file, '2>', stderr_file,
        ]
    )
    print_and_run(pima_data, command)
    pima_data.plasmid_tsv = os.readlink(os.path.join(pima_data.pchunks_dir, 'plasmids.tsv'))
    validate_file_and_size_or_error(pima_data, pima_data.plasmid_tsv, 'Plasmid output table', 'cannot be found', 'is empty')

    # The final file is in pChunks; copy it up to the plasmids dir
    new_plasmid_tsv = os.path.join(pima_data.plasmid_dir, 'plasmids.tsv')
    shutil.copy2(pima_data.plasmid_tsv, new_plasmid_tsv)
    pima_data.plasmid_tsv = new_plasmid_tsv

    try:
        pima_data.plasmids = pd.read_csv(filepath_or_buffer = pima_data.plasmid_tsv, sep = '\t', header = 0)
    except Exception:
        pima_data.plasmids = None

    pima_data.did_call_plasmids = True
    make_finish_file(pima_data, pima_data.plasmid_dir)
def make_report(pima_data: PimaData, settings: Settings):
    """Render the markdown report, splice in AMR appendices, and build the PDF."""

    print_and_log(
        pima_data,
        "Making report",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )
    pima_data.report_dir = os.path.join(pima_data.output_dir, 'report')

    # Always regenerate the report in case downstream steps have changed the
    # results.  BUG FIX: removing any existing directory (checkpointed or
    # not) also prevents os.mkdir raising FileExistsError when a previous
    # report attempt died partway through.
    if os.path.isdir(pima_data.report_dir):
        shutil.rmtree(pima_data.report_dir)
    os.mkdir(pima_data.report_dir)

    pima_data.report_prefix = os.path.join(pima_data.report_dir, 'report')
    pima_data.report_md = pima_data.report_prefix + '.md'

    pima_data.markdown_report = PimaReport(pima_data, settings)
    pima_data.markdown_report.make_report()

    ## Add appendices to the pima report
    if len(pima_data.markdown_report.appendices) > 0:
        # assign each new AMR class its own appendix ID (A-Z)
        for i, appendix in zip(string.ascii_uppercase, pima_data.markdown_report.appendices):
            png = re.sub(r"\.md", ".png", appendix)
            mod_appendix = os.path.join(pima_data.report_dir, os.path.basename(appendix))
            shutil.copyfile(png, os.path.join(pima_data.report_dir, os.path.basename(png)))
            with open(appendix, "rt") as fin:
                with open(mod_appendix, "wt") as fout:
                    for line in fin:
                        # the templates carry a LETTER placeholder for the ID
                        fout.write(line.replace("LETTER", i))

            # translate the appendix into the markdown report
            with open(pima_data.report_md, "a") as file:
                with open(mod_appendix, "r") as temp_file:
                    file.write(temp_file.read())

    pima_data.report_pdf = pima_data.report_prefix + '.pdf'
    validate_file_and_size_or_error(pima_data, pima_data.report_md, 'Report MD', 'cannot be found', 'is empty')

    tectonic_stdout, tectonic_stderr = std_files(os.path.join(pima_data.report_dir, 'markdown2pdf'))
    command = ' '.join(
        [
            'pandoc -f gfm',
            pima_data.report_md,
            '-o', pima_data.report_pdf,
            '--pdf-engine=weasyprint',
            '--css ' + settings.pima_css,
            '1>', tectonic_stdout,
            '2>', tectonic_stderr,
        ]
    )
    print_and_run(pima_data, command, change_exe_dir=pima_data.report_dir)
    validate_file_and_size_or_error(pima_data, pima_data.report_pdf, 'Report PDF', 'cannot be found', 'is empty')
-------------------------------------------------------------------------------- 1 | process COPY_RESULTS { 2 | input: 3 | path(sample_outdir) 4 | val(workflow_outdir) 5 | 6 | output: 7 | stdout 8 | 9 | script: 10 | """ 11 | #move the pima results folder to the original output location 12 | cp -Lr $sample_outdir $workflow_outdir/ 13 | """ 14 | } -------------------------------------------------------------------------------- /Pima/nextflow_parallelization/modules/pima_singleplex.nf: -------------------------------------------------------------------------------- 1 | process PIMA_SINGLEPLEX { 2 | tag "$sample" 3 | 4 | input: 5 | // [sample, output, ont_fastq, pima_cmd] 6 | tuple val(sample), val(output), path(ont_fastq), val(pima_cmd) 7 | 8 | output: 9 | path(sample), emit: output_directory 10 | path("$sample/report/report.pdf") 11 | 12 | script: 13 | """ 14 | python3 $pima_cmd --ont-fastq $ont_fastq --output $sample --threads ${task.cpus} 15 | """ 16 | } -------------------------------------------------------------------------------- /Pima/nextflow_parallelization/nextflow.config.template: -------------------------------------------------------------------------------- 1 | conda.enabled = true 2 | conda.useMamba = true 3 | 4 | params { 5 | config_profile_description = 'Rosalind HPC @ CDC' 6 | config_profile_contact = 'OAMD' 7 | config_profile_url = 'https://info.biotech.cdc.gov/info/' 8 | custom_config_version = 'master' 9 | 10 | // Default resource parameters. Expecting to be overwritten. 11 | max_memory = '128.GB' 12 | max_cpus = 16 13 | max_time = '240.h' 14 | } 15 | 16 | 17 | executor { 18 | name = 'sge' 19 | pollInterval = '10sec' 20 | submitRateLimit = '2sec' 21 | queueSize = 24 22 | } 23 | 24 | process { 25 | executor = 'sge' 26 | penv = 'smp' 27 | queue = 'all.q' 28 | beforeScript = None 29 | conda = None 30 | errorStrategy = { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } 31 | maxRetries = 3 32 | maxErrors = '-1' 33 | // Set h_vmem option for qsub submissions. +6 memory to h_vmem prevents memory allocation errors. 34 | clusterOptions = { "-l h_vmem=${(check_max((task.memory.toGiga())+6), 'memory').toString().replaceAll(/[\sB]/,'')}G" } 35 | cpus = { check_max( 6 * task.attempt, 'cpus' ) } 36 | memory = { check_max( 36.GB * task.attempt, 'memory' ) } 37 | time = { check_max( 8.h * task.attempt, 'time' ) } 38 | } 39 | 40 | def check_max(obj, type) { 41 | if (type == 'memory') { 42 | try { 43 | if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) 44 | return params.max_memory as nextflow.util.MemoryUnit 45 | else 46 | return obj 47 | } catch (all) { 48 | println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" 49 | return obj 50 | } 51 | } else if (type == 'time') { 52 | try { 53 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) 54 | return params.max_time as nextflow.util.Duration 55 | else 56 | return obj 57 | } catch (all) { 58 | println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" 59 | return obj 60 | } 61 | } else if (type == 'cpus') { 62 | try { 63 | return Math.min( obj, params.max_cpus as int ) 64 | } catch (all) { 65 | println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" 66 | return obj 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /Pima/pima.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import annotations # for "|" in docstrings python 3.7-3.9 3 | import os 4 | import sys 5 | import matplotlib as mpl 6 | 7 | from Pima.pima_data import PimaData 8 | from Pima.utils.utils import print_and_log, stop_logging, clean_up 9 | from Pima.utils.settings import Settings 10 | from Pima.utils.cli import parse_args 11 | from Pima import modules 12 | 13 | mpl.use("Agg") 14 | 15 | def run_prechecks(pima_data: PimaData, settings: Settings, pima_cmdtext: str): 16 | print_and_log( 17 | pima_data, 18 | "STARTING VALIDATION STEPS", 19 | pima_data.main_process_verbosity, 20 | pima_data.main_process_color, 21 | ) 22 | modules.validate_download(pima_data, settings) 23 | modules.validate_output_dir(pima_data, settings, pima_cmdtext) 24 | modules.validate_organism(pima_data) 25 | modules.validate_multiplex_fastq(pima_data) 26 | 27 | if len(pima_data.errors) > 0: 28 | print_and_log( 29 | pima_data, 30 | "Errors were found during validation.", 31 | pima_data.fail_verbosity, 32 | pima_data.error_color, 33 | ) 34 | for error in pima_data.errors: 35 | print_and_log( 36 | pima_data, 37 | error, 38 | pima_data.fail_verbosity, 39 | pima_data.error_color, 40 | ) 41 | print_and_log( 42 | pima_data, 43 | "Aborting.", 44 | pima_data.fail_verbosity, 45 | pima_data.error_color, 46 | ) 47 | sys.exit(1) 48 | 49 | def run_validation(pima_data: PimaData, settings: Settings): 50 | modules.validate_ont_fastq(pima_data, settings) 51 | modules.validate_illumina_fastq(pima_data) 52 | modules.validate_genome_fasta(pima_data) 53 | modules.validate_genome_assembly_size(pima_data) 54 | modules.validate_contamination_check(pima_data, settings) 55 | modules.validate_assembler(pima_data) 56 | 
modules.validate_assembly_info(pima_data) 57 | modules.validate_medaka(pima_data) 58 | modules.validate_illumina_polish(pima_data) 59 | modules.validate_evaluate_assembly(pima_data) 60 | modules.validate_plasmids(pima_data, settings) 61 | modules.validate_features(pima_data, settings) 62 | modules.validate_blast(pima_data) 63 | modules.validate_reference_fasta(pima_data) 64 | modules.validate_quast(pima_data) 65 | modules.validate_mutations(pima_data) 66 | modules.validate_draw_features(pima_data) 67 | modules.validate_draw_amr_matrix(pima_data) 68 | modules.validate_draw_circos(pima_data, settings) 69 | modules.validate_make_report(pima_data, settings) 70 | 71 | if len(pima_data.errors) > 0: 72 | print_and_log( 73 | pima_data, 74 | "Errors were found during validation.", 75 | pima_data.fail_verbosity, 76 | pima_data.error_color, 77 | ) 78 | for error in pima_data.errors: 79 | print_and_log( 80 | pima_data, 81 | error, 82 | pima_data.fail_verbosity, 83 | pima_data.error_color, 84 | ) 85 | print_and_log( 86 | pima_data, 87 | "Aborting.", 88 | pima_data.fail_verbosity, 89 | pima_data.error_color, 90 | ) 91 | sys.exit(1) 92 | 93 | #log all the versions 94 | version_log = "Utility versions:" 95 | for utility, version in pima_data.versions.items(): 96 | version_log = version_log + "\n" + "{:<25} {:<10}".format(utility, version) 97 | print_and_log( 98 | pima_data, 99 | version_log, 100 | pima_data.main_process_verbosity, 101 | pima_data.main_process_color, 102 | ) 103 | 104 | def define_workflow(pima_data: PimaData): 105 | """ 106 | Step through modules and run pima 107 | 108 | The run_validation steps parse the cmdline args, check all necessary tools / files are availabe for the 109 | requested steps in the pipeline, and queues up the modules by adding the steps to 110 | pima_data.analysis object 111 | """ 112 | print_and_log( 113 | pima_data, 114 | "STARTING PiMA Analysis", 115 | pima_data.main_process_verbosity, 116 | pima_data.main_process_color, 117 | ) 118 | while 
(True): 119 | step = pima_data.analysis[0] 120 | pima_data.analysis = pima_data.analysis[1:] 121 | 122 | ## See if we have arguments to pass to our function 123 | if type(step) is list: 124 | arguments = [] 125 | if len(step) > 1: 126 | arguments = step[1:] 127 | step = step[0] 128 | function = getattr(modules, step) 129 | function(*arguments) 130 | else: 131 | function = getattr(modules, step) 132 | function() 133 | 134 | if (len(pima_data.analysis) == 0): 135 | break 136 | 137 | def run_workflow(pima_data: PimaData, settings: Settings): 138 | run_validation(pima_data, settings) 139 | define_workflow(pima_data) 140 | clean_up(pima_data) 141 | stop_logging(pima_data, "PiMA completed successfully") 142 | 143 | def main(): 144 | """ """ 145 | 146 | settings = Settings() 147 | opts, unknown_args = parse_args(settings) 148 | 149 | # Start the analysis 150 | pima_data = PimaData(opts, unknown_args) 151 | 152 | # Capture commandline options used 153 | pima_data.run_command = ' '.join(sys.argv) 154 | run_prechecks(pima_data, settings, [f"PiMA command used: {' '.join(sys.argv)}"]) 155 | 156 | ##Initialize serial multiplex analysis 157 | if pima_data.multiplexed: 158 | modules.initialize_multiplex_analysis(pima_data, settings) 159 | 160 | else: 161 | run_workflow(pima_data, settings) 162 | 163 | 164 | if __name__ == "__main__": 165 | main() 166 | -------------------------------------------------------------------------------- /Pima/pima_colors.py: -------------------------------------------------------------------------------- 1 | class Colors: 2 | HEADER = '\033[95m' 3 | OKBLUE = '\033[94m' 4 | OKGREEN = '\033[92m' 5 | WARNING = '\033[93m' 6 | FAIL = '\033[91m' 7 | ENDC = '\033[0m' 8 | BOLD = '\033[1m' 9 | UNDERLINE = '\033[4m' -------------------------------------------------------------------------------- /Pima/pima_data.py: -------------------------------------------------------------------------------- 1 | import Bio.SeqIO 2 | import datetime 3 | 4 | from 
Pima.pima_colors import Colors 5 | import pandas as pd 6 | 7 | class PimaData: 8 | def __init__(self, opts=None, unknown_args=None): 9 | # The actual steps to carry out in the analysis held as a list 10 | self.analysis = [] 11 | 12 | # Verbosity levels and colors 13 | self.error_color = Colors.FAIL 14 | self.fail_verbosity = 1 15 | self.main_process_verbosity = 1 16 | self.warning_color = Colors.WARNING 17 | self.warning_verbosity = 1 18 | self.main_process_color = Colors.OKGREEN 19 | self.sub_process_verbosity = 2 20 | self.sub_process_color = Colors.OKBLUE 21 | self.command_verbosity = 3 22 | self.errors = [] 23 | self.warnings = [] 24 | 25 | # ONT FASTQ input 26 | self.ont_fastq = None 27 | self.ont_raw_fastq = self.ont_fastq 28 | self.ont_read_count = None 29 | self.ont_read_lengths = None 30 | self.will_have_ont_fastq = False 31 | self.ont_read_lengths = [] 32 | 33 | # Read metadata 34 | self.read_metadata = pd.Series(dtype=object) 35 | 36 | # Demultiplexing 37 | self.multiplexed = None 38 | self.nextflow = None 39 | self.barcodes = None 40 | self.barcode_min_fraction = None 41 | self.barcode_summary = None 42 | 43 | # Contamination 44 | self.contam_check = False 45 | self.kraken_fracs = pd.Series(dtype=object) 46 | self.did_contamination_check = False 47 | 48 | # Genome FASTA input 49 | self.genome_fasta = None 50 | self.will_have_genome_fasta = False 51 | 52 | # Illumina FASTQ input 53 | self.illumina_fastq = None 54 | self.pilon_coverage_min = 25 55 | self.did_spades_illumina_fastq = False 56 | self.did_pilon_ont_assembly = False 57 | self.did_polypolish_ont_assembly = False 58 | 59 | # Output options 60 | self.output_dir = None 61 | self.overwrite = False 62 | self.resume = False 63 | self.keep_intermediates = False 64 | 65 | # Assembly options 66 | self.assembler = "flye" 67 | self.flye_sup = False 68 | self.genome_assembly_size = None 69 | self.genome_assembly_raw_size = None 70 | self.assembly_coverage = None 71 | self.no_medaka = False 72 | 
self.ont_n50 = None 73 | self.ont_n50_min = 2500 74 | self.ont_coverage_min = 30 75 | self.only_assemble = False 76 | self.no_assembly = False 77 | self.did_flye_ont_fastq = False 78 | self.did_raven_ont_fastq = False 79 | self.will_have_ont_assembly = False 80 | self.mean_coverage = dict() 81 | self.did_circos_plots = False 82 | 83 | # ONT polishing 84 | self.ont_model = None 85 | self.did_medaka_ont_assembly = False 86 | 87 | # Illumina polishing 88 | self.illumina_polisher = "pilon" 89 | 90 | # Feature options 91 | self.no_amr = False 92 | self.no_inc = False 93 | self.feature_fastas = None 94 | self.feature_hits = pd.Series(dtype=object) 95 | self.feature_plots = pd.Series(dtype=object) 96 | self.feature_dirs = [] 97 | self.feature_names = [] 98 | self.feature_colors = [] 99 | self.did_blast_feature_sets = False 100 | 101 | # Download options 102 | self.download = False 103 | 104 | # Reference options 105 | self.reference_dir = None 106 | self.organism = None 107 | self.organism_dir = None 108 | self.list_organisms = False 109 | self.will_have_reference_fasta = False 110 | self.reference = None 111 | self.amr_mutations = pd.Series(dtype=object) 112 | self.mutation_regions = None 113 | self.amr_region_names = None 114 | self.virulence_genes_fp = None 115 | self.did_call_mutations = False 116 | self.amr_deletions = pd.DataFrame() 117 | self.did_call_large_indels = False 118 | self.reference_contig_order = None 119 | self.organism_amr_appendices = None 120 | # Files to remove when done 121 | self.files_to_clean = [] 122 | 123 | # Plasmid options 124 | self.plasmids = False 125 | self.did_call_plasmids = False 126 | 127 | # Notes for different sections of the analysis 128 | self.assembly_notes = pd.Series(dtype=object) 129 | self.alignment_notes = pd.Series(dtype=object) 130 | self.large_indel_notes = pd.Series(dtype=object) 131 | self.contig_alignment = pd.Series(dtype=object) 132 | self.versions = pd.Series(dtype=object) 133 | 134 | self.logging_handle = None 135 
| self.fake_run = False 136 | 137 | self.bundle = None 138 | self.report = pd.Series(dtype=object) 139 | 140 | if opts is None or unknown_args is None: 141 | return 142 | 143 | # Date-time information 144 | self.start_time = datetime.datetime.now().strftime("%Y-%m-%d") 145 | 146 | # Logging information 147 | self.logging_file = None 148 | self.logging_handle = None 149 | 150 | # ONT FASTQ input 151 | self.ont_fastq = opts.ont_fastq 152 | self.ont_raw_fastq = self.ont_fastq 153 | 154 | # Demultiplexing 155 | self.multiplexed = opts.multiplexed 156 | self.nextflow = opts.nextflow 157 | self.barcodes = None 158 | self.barcode_min_fraction = opts.barcode_min_fraction 159 | 160 | # Contamination 161 | self.contam_check = opts.contamination 162 | self.did_contamination_check = False 163 | 164 | # Illumina FASTQ input 165 | self.illumina_fastq = opts.illumina_fastq 166 | 167 | # Genome FASTA input 168 | self.genome_fasta = opts.genome 169 | 170 | # Output options 171 | self.output_dir = opts.output 172 | self.overwrite = opts.overwrite 173 | self.resume = opts.resume 174 | self.keep_intermediates = opts.keep_intermediates 175 | 176 | # Assembly options 177 | self.assembler = opts.assembler 178 | self.genome_assembly_size = opts.genome_size 179 | self.assembly_coverage = opts.assembly_coverage 180 | self.only_assemble = opts.only_assemble 181 | self.no_assembly = opts.no_assembly 182 | 183 | # ONT polishing 184 | self.ont_model = opts.ont_model 185 | self.no_medaka = opts.no_medaka 186 | 187 | # Illumina polishing 188 | self.illumina_polisher = opts.illumina_polisher 189 | 190 | # Illumina metrics 191 | self.illumina_length_mean = None 192 | self.illumina_coverage_min = 30 193 | self.did_pilon_ont_assembly = False 194 | self.did_polypolish_ont_assembly = False 195 | 196 | # The assembly itself 197 | self.genome = None 198 | self.contig_info = None 199 | 200 | # Vs. 
reference options 201 | self.reference_identity_min = 98.0 202 | self.reference_alignment_min = 97.0 203 | self.query_alignment_min = 97.0 204 | 205 | #placeholders for comparison to given reference 206 | self.reference_identity = 0 207 | self.reference_aligned_bases = 0 208 | self.query_aligned_bases = 0 209 | self.reference_aligned_fraction = 0 210 | self.query_aligned_fraction = 0 211 | 212 | # Plasmid and feature options 213 | self.plasmids = opts.plasmids 214 | self.plasmid_database = opts.plasmid_database 215 | self.did_call_plasmids = False 216 | self.no_drawing = opts.no_drawing 217 | self.amr_database = opts.amr_database 218 | self.no_amr = opts.no_amr 219 | self.inc_database = opts.inc_database 220 | self.no_inc = opts.no_inc 221 | self.feature_fastas = opts.feature 222 | self.feature_hits = pd.Series(dtype="float64") 223 | self.feature_plots = pd.Series(dtype="float64") 224 | self.feature_dirs = [] 225 | self.feature_names = [] 226 | self.feature_colors = [] 227 | self.download = opts.download 228 | 229 | # Reference options 230 | self.reference_dir = opts.reference_dir 231 | self.organism = opts.organism 232 | self.list_organisms = opts.list_organisms 233 | self.reference_fasta = opts.reference_genome 234 | self.mutation_region_bed = opts.mutation_regions 235 | self.self_circos = opts.self_circos 236 | self.threads = opts.threads 237 | 238 | # How much stuff to print 239 | self.verbosity = opts.verbosity 240 | 241 | # Files to remove when done 242 | self.files_to_clean = [] 243 | 244 | # Don't actully run any commands 245 | self.fake_run = opts.fake_run 246 | 247 | # Reporting 248 | self.no_report = False 249 | self.bundle = opts.bundle 250 | self.analysis_name = opts.name 251 | self.mutation_title = "Mutations" 252 | self.report[self.mutation_title] = pd.Series(dtype="float64") 253 | self.large_indels = pd.Series(dtype="float64") 254 | self.plasmid_title = "Plasmid annotation" 255 | self.report[self.plasmid_title] = pd.Series(dtype="float64") 256 | 
self.amr_matrix_title = "AMR matrix" 257 | self.did_draw_amr_matrix = False 258 | self.report[self.amr_matrix_title] = pd.Series(dtype="float64") 259 | self.methods_title = "Methods summary" 260 | self.report[self.methods_title] = pd.Series(dtype="float64") 261 | self.basecalling_methods = "Basecalling & processing" 262 | self.report[self.methods_title][self.basecalling_methods] = pd.Series( 263 | dtype="float64" 264 | ) 265 | self.assembly_methods = "Assembly & polishing" 266 | self.report[self.methods_title][self.assembly_methods] = pd.Series( 267 | dtype="float64" 268 | ) 269 | self.mutation_methods = "Mutation screening " 270 | self.report[self.methods_title][self.mutation_methods] = pd.Series( 271 | dtype="float64" 272 | ) 273 | self.plasmid_methods = "Plasmid annotation" 274 | self.report[self.methods_title][self.plasmid_methods] = pd.Series( 275 | dtype="float64" 276 | ) 277 | self.meta_title = "PIMA meta-information" 278 | 279 | # See if we got any unknown args. Not allowed. 280 | if len(unknown_args) != 0: 281 | self.errors = self.errors + [ 282 | "Unknown argument: " + unknown for unknown in unknown_args 283 | ] 284 | 285 | def load_reference(self): 286 | self.reference = self.load_fasta(self.reference_fasta) 287 | self.will_have_reference_fasta = True 288 | 289 | self.reference_size = 0 290 | for i in self.reference: 291 | self.reference_size += len(i.seq) 292 | 293 | @staticmethod 294 | def load_fasta(fasta: str): 295 | sequence = pd.Series(dtype=object) 296 | for contig in Bio.SeqIO.parse(fasta, "fasta"): 297 | sequence[contig.id] = contig 298 | return sequence 299 | 300 | def load_genome(self): 301 | self.genome = self.load_fasta(self.genome_fasta) 302 | self.genome_size = 0 303 | for i in self.genome: 304 | self.genome_size += len(i.seq) 305 | -------------------------------------------------------------------------------- /Pima/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/utils/__init__.py -------------------------------------------------------------------------------- /Pima/utils/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from argparse import ArgumentParser, HelpFormatter 4 | 5 | from Pima.pima_colors import Colors 6 | from Pima.utils.settings import Settings 7 | 8 | 9 | def parse_args(settings: Settings): 10 | # with open(os.path.join(settings.pima_path, "VERSION"), "r") as version_fp: 11 | # VERSION = version_fp.read().strip() 12 | 13 | parser = ArgumentParser( 14 | allow_abbrev=False, 15 | prog="pima.py", 16 | add_help=False, 17 | description=""" 18 | P.I.M.A. bacterial genome analysis pipeline 19 | """, 20 | formatter_class=lambda prog: HelpFormatter( 21 | prog, width=120, max_help_position=120 22 | ), 23 | ) 24 | 25 | parser._optionals.title = "Help and version" 26 | parser.add_argument( 27 | "-h", "--help", action="store_true", help="Print this help and exit." 
28 | ) 29 | parser.add_argument( 30 | "-v", 31 | "--version", 32 | action="version", 33 | help="Print the software version.", 34 | version=f"PIMA microbial genome analysis pipeline (version {settings.pima_version})", 35 | ) 36 | 37 | # Input arguments 38 | input_group = parser.add_argument_group("Input and basecalilng options") 39 | 40 | input_group.add_argument( 41 | "--ont-model", 42 | required=False, 43 | default="auto", 44 | metavar="", minimap_stderr, 41 | "| samtools sort", 42 | "-@", 43 | str(pima_data.threads), 44 | "-o", 45 | bam, 46 | "-T reads.tmp -", 47 | "1>/dev/null 2>/dev/null", 48 | ] 49 | ) 50 | 51 | print_and_run(pima_data, command) 52 | validate_file_and_size_or_error(pima_data, the_file=bam, error_prefix="The file", presence_suffix="doesn't exist", size_suffix="is below min expected size", min_size=1000) 53 | index_bam(pima_data, bam) 54 | 55 | def filter_bam(pima_data: PimaData, inbam: str, outbam: str = None, F: str = None, q: str = None): 56 | """Filter the bam file, if outbam not provided, we filter in-place""" 57 | if not outbam: 58 | outbam = inbam 59 | command = " ".join( 60 | [ 61 | "samtools view -h", 62 | "-F", 63 | F, 64 | "-q", 65 | q, 66 | inbam, 67 | "| samtools sort", 68 | "-@", 69 | str(pima_data.threads), 70 | "-o", 71 | outbam, 72 | "-T reads.tmp -", 73 | "1>/dev/null 2>/dev/null", 74 | ] 75 | ) 76 | print_and_run(pima_data, command) 77 | validate_file_and_size_or_error(pima_data, the_file=outbam, min_size=1000) 78 | index_bam(pima_data, outbam) 79 | 80 | def index_bam(pima_data: PimaData, bam: str): 81 | command = " ".join(["samtools index", bam, "1>/dev/null 2>/dev/null"]) 82 | print_and_run(pima_data, command) 83 | index_bai = bam + ".bai" 84 | validate_file_and_size_or_error(pima_data, the_file=index_bai, min_size=1000) 85 | 86 | def mpileup_bam(pima_data: PimaData, reference_genome: str, bam: str, mpileup: str, output_dir: str): 87 | 88 | print_and_log( 89 | pima_data, 90 | "Making mpileup from BAM", 91 | 
pima_data.sub_process_verbosity, 92 | pima_data.sub_process_color, 93 | ) 94 | 95 | mpileup_stdout, mpileup_stderr = std_files(os.path.join(output_dir, 'mpileup')) 96 | command = " ".join( 97 | [ 98 | 'samtools mpileup', 99 | '-B', 100 | '-a', 101 | '-f', reference_genome, 102 | '-o' + mpileup, 103 | bam, 104 | '1>', mpileup_stdout, 105 | '2>', mpileup_stderr, 106 | ] 107 | ) 108 | print_and_run(pima_data, command) 109 | validate_file_and_size_or_error(pima_data, 110 | mpileup, 111 | 'Region MPILEUP file', 112 | 'cannot be found', 113 | 'is empty', 114 | ) 115 | 116 | def bwa_index_fasta(pima_data: PimaData, fasta: str): 117 | 118 | print_and_log( 119 | pima_data, 120 | 'Indexing FASTA with bwa index', 121 | pima_data.sub_process_verbosity, 122 | pima_data.sub_process_color, 123 | ) 124 | 125 | # Check for an index already there 126 | bwa_index = f"{fasta}.bwt" 127 | if validate_file_and_size(pima_data, bwa_index): 128 | return 129 | 130 | # Make the bwa index 131 | std_prefix = re.sub(r'\.f(na|asta)$', '', fasta) 132 | bwa_index_stdout, bwa_index_stderr = std_files(std_prefix + '_index') 133 | command = " ".join( 134 | [ 135 | 'bwa', 136 | 'index', 137 | fasta, 138 | '1>', bwa_index_stdout, '2>', bwa_index_stderr, 139 | ] 140 | ) 141 | print_and_run(pima_data, command) 142 | 143 | # Check that the index was built 144 | validate_file_and_size_or_error(pima_data, bwa_index, 'BWA index', 'doesn\'t exist', 'is empty') 145 | 146 | def bwa_short_illumina_fastq_and_sort(pima_data: PimaData, genome: str, fastq: str, bam: str): 147 | 148 | std_prefix = re.sub(r'\.bam$', '', bam) 149 | 150 | bwa_index_fasta(pima_data, genome) 151 | 152 | # Align the reads 153 | sai = [] 154 | for i in range(len(fastq)): 155 | bwa_stdout, bwa_stderr = std_files(std_prefix + '_aln') 156 | this_sai = std_prefix + '_aln_' + str(i) + '.sai' 157 | command = " ".join( 158 | [ 159 | 'bwa aln', 160 | '-t', str(pima_data.threads), 161 | genome, 162 | fastq[i], 163 | '1>', this_sai, 164 | '2>', 
bwa_stderr, 165 | ] 166 | ) 167 | sai.append(this_sai) 168 | print_and_run(pima_data, command) 169 | validate_file_and_size_or_error(pima_data, this_sai) 170 | 171 | # And turn the SAI into a proper SAM file 172 | read_type = 'samse' 173 | if len(fastq) > 1: 174 | read_type = 'sampe' 175 | bwa_stdout, bwa_stderr = std_files(std_prefix + '_sam') 176 | tmp_file = std_prefix + '.tmp' 177 | command = " ".join( 178 | [ 179 | 'bwa', 180 | read_type, 181 | genome, 182 | ' '.join(sai), 183 | ' '.join(fastq), 184 | '2>', bwa_stderr, 185 | '| samtools', 186 | 'sort', 187 | '-T', tmp_file, 188 | '-o', bam, 189 | '-', 190 | '1>/dev/null 2>/dev/null', 191 | ] 192 | ) 193 | print_and_run(pima_data, command) 194 | validate_file_and_size_or_error(pima_data, the_file = bam, min_size = 100) 195 | index_bam(pima_data, bam) 196 | 197 | def bwa_mem_all_aln_illumina(pima_data: PimaData, genome: str, fastq: list, bam: str): 198 | std_prefix = re.sub(r'\.bam$', '', bam) 199 | 200 | bwa_index_fasta(pima_data, genome) 201 | 202 | bams = [] 203 | # Align the reads 204 | for i in range(len(fastq)): 205 | read = f"_R{i+1}" 206 | _, bwa_stderr = std_files(std_prefix + read + '_aln') 207 | this_bam = std_prefix + read + '.bam' 208 | #Polypolish warns not to sort the bam files 209 | command = " ".join( 210 | [ 211 | 'bwa mem', 212 | '-a', 213 | '-t', str(pima_data.threads), 214 | genome, 215 | fastq[i], 216 | '1>', this_bam, 217 | '2>', bwa_stderr, 218 | ] 219 | ) 220 | bams.append(this_bam) 221 | print_and_run(pima_data, command) 222 | validate_file_and_size_or_error(pima_data, this_bam) 223 | return bams -------------------------------------------------------------------------------- /Pima/utils/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | class Settings(): 5 | def __init__(self): 6 | self.data_dir = os.path.join(self.pima_path, 'data') 7 | self.amr_database_default = os.path.join(self.pima_path, 
'data/amr.fasta') 8 | self.amr_gene_drug_tsv = os.path.join(self.pima_path, 'data/gene_drug.tsv') 9 | self.amr_default_color = '#FED976' 10 | self.inc_database_default = os.path.join(self.pima_path, 'data/inc.fasta') 11 | self.inc_default_color = '#0570B0' 12 | self.included_databases = [self.amr_database_default, self.inc_database_default] 13 | 14 | self.plasmid_database_default_fasta = os.path.join(self.pima_path, 'data/plasmids_and_vectors.fasta') 15 | self.kraken_database_default = os.path.join(self.pima_path, 'data/kraken2') 16 | self.reference_dir_default = os.path.join(self.pima_path, 'data/reference_sequences') 17 | self.pima_css = os.path.join(self.pima_path,'data/pima.css') 18 | self.virulence_genes_fp = os.path.join(self.data_dir, "reference_sequences/Bacillus_anthracis/ba_virulence_genes.bed") 19 | 20 | ## Docker specific paths 21 | self.DockerPathPlasmid = os.path.join('/home/DockerDir/Data/Temp_Data/plasmids_and_vectors.fasta') 22 | self.DockerPathKraken = os.path.join('/home/DockerDir/Data/Temp_Data/kraken2') 23 | 24 | @property 25 | def pima_path(self): 26 | # Is __name__ the most robust way to print the path of importing scripts, not this one? 
"""Shared helper utilities for the PiMA pipeline.

Covers: axis-break math for plots, human-readable number formatting,
run logging, file/utility validation, shell-command execution, resume
checkpointing, and end-of-run cleanup.
"""
import sys
import os
import re
import datetime
import shutil
import subprocess
from pathlib import Path

import numpy as np

from Pima.pima_data import PimaData
from Pima.pima_colors import Colors


def nicenumber(x: float, round: int) -> float:
    """Return a 'nice' number (1, 2, or 5 times a power of ten) near x.

    Mirrors the classic graphics-axis heuristic: when `round` is truthy the
    nearest nice value is chosen, otherwise the smallest nice value >= x.
    NOTE: the parameter name `round` shadows the builtin but is kept for
    interface compatibility (the builtin is not used in this function).
    Assumes x > 0 (log10 of non-positive x is undefined) — callers pass
    positive spans.
    """
    exp = np.floor(np.log10(x))
    f = x / 10**exp  # mantissa in [1, 10)

    if round:
        if f < 1.5:
            nf = 1.0
        elif f < 3.0:
            nf = 2.0
        elif f < 7.0:
            nf = 5.0
        else:
            nf = 10.0
    else:
        if f <= 1.0:
            nf = 1.0
        elif f <= 2.0:
            nf = 2.0
        elif f <= 5.0:
            nf = 5.0
        else:
            nf = 10.0

    return nf * 10.0**exp


def pretty(low, high, n):
    """Return ~n 'pretty' axis breakpoints covering [low, high] (like R's pretty())."""
    span = nicenumber(high - low, False)  # renamed from `range` to avoid shadowing the builtin
    d = nicenumber(span / (n - 1), True)
    miny = np.floor(low / d) * d
    maxy = np.ceil(high / d) * d
    return np.arange(miny, maxy + 0.5 * d, d)


def format_kmg(number: float, decimals: int = 0) -> str:
    """Format a number with a K/M/G magnitude suffix, e.g. 1500 -> '1.5K'.

    Fix: values in (0, 1) (and negative values) previously fell through the
    loop and the function implicitly returned None; they are now formatted
    without a suffix.
    """
    if number == 0:
        return "0"

    magnitude_powers = [10**9, 10**6, 10**3, 1]
    magnitude_units = ["G", "M", "K", ""]
    for power, unit in zip(magnitude_powers, magnitude_units):
        if number >= power:
            return f"{round(number / power, decimals)}{unit}"

    # 0 < number < 1, or number < 0: no magnitude suffix applies.
    return f"{round(number, decimals)}"


def print_and_log(
    pima_data: PimaData, text: str, verbosity: int, color: str = Colors.ENDC, time_string: str = None
):
    """Write a timestamped message to stderr and to the run log.

    The stderr copy is colorized and shown only when `verbosity` is at or
    below the run's configured verbosity. NOTE(review): the log copy appears
    to be written unconditionally (everything is logged, uncolored) — confirm
    against the original indentation if behavior looks off.
    """
    if not time_string:
        time_string = f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]'
    if verbosity <= pima_data.verbosity:
        sys.stderr.write(f"{time_string} {color}{text}{Colors.ENDC}\n")

    if pima_data.logging_handle:
        pima_data.logging_handle.write(f"{time_string} {text}\n")
        # Flush so the log file has content while the run is in progress;
        # we log infrequently enough that this is not expensive.
        pima_data.logging_handle.flush()


def start_logging(pima_data: PimaData):
    """Open the run log for writing, backing up any pre-existing log on --resume.

    Defaults logging_file to <output_dir>/pima.log when the user did not
    supply one. Behavior-equivalent consolidation of the original five
    branches: a backup is taken only when resuming onto an existing log file.
    """
    if not pima_data.logging_file:
        pima_data.logging_file = os.path.join(pima_data.output_dir, "pima.log")
        if pima_data.resume:
            # backup_log_file() is a no-op when the default log doesn't exist yet
            backup_log_file(pima_data.logging_file)
    elif pima_data.resume and os.path.isfile(pima_data.logging_file):
        # user resumed a run re-using the same log path
        backup_log_file(pima_data.logging_file)

    pima_data.logging_handle = open(pima_data.logging_file, 'w')


def backup_log_file(existing_log_path: str):
    """Preserve an existing log as 'previous_<name>' in the same directory.

    If a previous backup already exists, the current log is appended to it
    (separated by a blank line) instead of overwriting it. No-op when the
    log does not exist.
    """
    log_path = Path(existing_log_path)
    if not log_path.is_file():
        return

    rename_log_path = os.path.join(log_path.parent, f"previous_{log_path.name}")

    if os.path.isfile(rename_log_path):
        with open(rename_log_path, 'a') as prev_log_file:
            prev_log_file.write("\n\n")
            with open(existing_log_path) as cur_log_file:
                shutil.copyfileobj(cur_log_file, prev_log_file)
        os.remove(existing_log_path)
        return

    os.rename(existing_log_path, rename_log_path)


def stop_logging(pima_data: PimaData, message: str = None):
    """Optionally emit a final message, then close and clear the log handle."""
    if message:
        print_and_log(
            pima_data,
            message,
            pima_data.main_process_verbosity,
            pima_data.main_process_color,
        )

    pima_data.logging_handle.close()
    pima_data.logging_handle = None


def validate_file(the_file: str) -> bool:
    """Return True when the path exists and is a regular file."""
    return os.path.isfile(the_file)


def validate_file_size(pima_data: PimaData, the_file: str, min_size: int = 0) -> bool:
    """Return True when the file is at least min_size bytes (always True under fake_run)."""
    if pima_data.fake_run:
        return True
    return os.stat(the_file).st_size >= min_size


def validate_file_and_size(pima_data: PimaData, the_file: str, min_size: int = 0) -> bool:
    """Return True when the file exists and meets the minimum size."""
    return validate_file(the_file) and validate_file_size(pima_data, the_file, min_size)


def validate_file_and_size_or_error(
    pima_data,
    the_file,
    error_prefix="The file",
    presence_suffix="doesn't exist",
    size_suffix="is size 0",
    min_size=0,
):
    """Abort the run (via error_out) when the file is missing or too small.

    fake_run skips both checks so dry runs can proceed without outputs.
    """
    if not validate_file(the_file) and not pima_data.fake_run:
        error_out(pima_data, " ".join([error_prefix, the_file, presence_suffix]))

    if not validate_file_size(pima_data, the_file, min_size) and not pima_data.fake_run:
        error_out(pima_data, " ".join([error_prefix, the_file, size_suffix]))


def validate_utility(pima_data: PimaData, utility: str, error: str) -> bool:
    """Check that an external executable is on PATH; record and print `error` if not."""
    if shutil.which(utility):
        return True

    pima_data.errors.append(error)
    print_and_log(
        pima_data,
        error,
        pima_data.fail_verbosity,
        pima_data.error_color,
    )
    return False


def print_and_run(pima_data: PimaData, command: str, change_exe_dir: str = None):
    """Log a shell command at command verbosity, then execute it."""
    print_and_log(pima_data, command, pima_data.command_verbosity)
    return run_command(pima_data, command, change_exe_dir)


def run_command(pima_data: PimaData, command: str, change_exe_dir: str = None):
    """Run a shell command and return its stdout split into lines.

    On a non-zero exit the run is aborted via error_out(); when the command
    redirects stderr to a '<something>.stderr' file, that file's contents are
    used as the error message instead of the captured stderr. Under fake_run
    nothing is executed and None is returned.
    """
    if pima_data.fake_run:
        return None

    # cwd=None is equivalent to not passing cwd, so one call covers both cases
    result = subprocess.run(
        command, shell=True, capture_output=True, text=True, cwd=change_exe_dir
    )

    if result.returncode == 0:
        return result.stdout.split("\n")

    if re.search(r"\.stderr$", command):
        sterr_f = [x for x in command.split(" ") if re.search(r"\.stderr$", x)][0]
        message = f"Command {command} failed with the following error. exiting\n{Path(sterr_f).read_text()}"
    else:
        message = f"Command {command} failed with the following error. exiting\n{result.stderr}"
    error_out(pima_data, message)


def error_out(pima_data: PimaData, message: str):
    """Log a fatal error and terminate the process with exit status 1."""
    print_and_log(
        pima_data,
        message,
        pima_data.fail_verbosity,
        pima_data.error_color,
    )
    sys.exit(1)


def print_warning(pima_data: PimaData, warning: str):
    """Emit a warning-colored message at warning verbosity."""
    print_and_log(
        pima_data, warning, pima_data.warning_verbosity, pima_data.warning_color
    )


def add_warning(pima_data: PimaData, warning: str):
    """Print a warning and record it for the final report."""
    print_warning(pima_data, warning)
    pima_data.warnings.append(warning)


def find_checkpoint(pima_data: PimaData, dir: str):
    """Search for the .finish file generated after each analysis step completes.

    Args:
        dir: path to the analysis directory (parameter name kept for
            interface compatibility although it shadows the builtin).

    Returns:
        True if the 'resume' flag was provided and ".finish" was found.
        False otherwise; a partially-completed existing directory is deleted
        so the step can re-run cleanly.
    """
    if not pima_data.resume:
        return False

    if os.path.exists(os.path.join(dir, ".finish")):
        return True

    if os.path.exists(dir):
        shutil.rmtree(dir)
    return False


def std_files(prefix: str):
    """Return the [stdout, stderr] capture paths for a command prefix."""
    return [f"{prefix}.stdout", f"{prefix}.stderr"]


def touch_file(pima_data: PimaData, a_file: str):
    """Touch a file via the shell so the action is logged and honors fake_run."""
    command = " ".join(["touch", a_file])
    print_and_run(pima_data, command)


def make_start_file(pima_data: PimaData, a_dir: str):
    """Drop the .start marker in an analysis directory."""
    touch_file(pima_data, os.path.join(a_dir, ".start"))


def make_finish_file(pima_data: PimaData, a_dir: str):
    """Drop the .finish checkpoint marker in an analysis directory."""
    touch_file(pima_data, os.path.join(a_dir, ".finish"))


def make_report_info_file(pima_data: PimaData, a_dir: str):
    """Drop the .report_info marker and return its path."""
    report_info_file = os.path.join(a_dir, ".report_info")
    touch_file(pima_data, report_info_file)
    return report_info_file


def clean_up(pima_data: PimaData):
    """Finalize the run: copy the final assembly, link the report, remove intermediates."""
    print_and_log(
        pima_data,
        "Cleaning up",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    if pima_data.genome_fasta:
        final_fasta = os.path.join(pima_data.output_dir, 'assembly.fasta')
        command = ' '.join(['cp', pima_data.genome_fasta, final_fasta])
        print_and_run(pima_data, command)

    # Create a shortcut to the report in the primary output_dir.
    # Fix: use lexists so a dangling symlink left by a failed earlier run is
    # also removed — os.path.isfile follows links and would miss it, making
    # the subsequent os.symlink raise FileExistsError.
    report_link = os.path.join(pima_data.output_dir, "report.pdf")
    if os.path.lexists(report_link):
        os.remove(report_link)
    os.symlink(os.path.join(pima_data.output_dir, "report", "report.pdf"), report_link)

    if not pima_data.keep_intermediates:
        for file in pima_data.files_to_clean:
            if os.path.isfile(file):
                try:
                    os.remove(file)
                except OSError:
                    # best-effort cleanup: a file we cannot remove is not fatal
                    continue
--------------------------------------------------------------------------------
#!/usr/bin/env bash

$PYTHON -m pip install -vv --no-deps --ignore-installed .

--------------------------------------------------------------------------------
/conda_recipe/environment.yml:
--------------------------------------------------------------------------------
#mirrors meta.yaml but can be used to build a conda environment WITHOUT pima for debugging
name: base

channels:
  - bioconda
  - conda-forge

dependencies:
  - git==2.39.1
  - gawk==5.3
  - bedtools==2.31.1
  - biopython==1.84
  - blast==2.16
  - bwa==0.7.18
  - curl
  - flye==2.9.4
  - raven-assembler==1.8.3
  - mdutils==1.6.0
  - minimap2==2.28
  - multiprocess=0.70.16
  - mummer==3.23
  - kraken2==2.1.3
  - spades==4.0.0
  - pandas==2.2.2
  - pandoc==3.3
  - weasyprint==62.3 # pdf engine for pandoc
  - pango==1.50.14
  - pathos
  - python >=3.9,<3.11 #,<3.9
  - pyfaidx==0.8.1.2
  - python_circos ## will replace with the development version using pip in pima_install.sh script
  - r
  - r-hash #needed for pChunks
  - r-stringr #needed for pChunks
  - r-gridextra #needed for pChunks
  - r-optparse #needed for pChunks
  - samtools==1.18 #needs to be the conda-forge version and NOT the bioconda version
  - varscan==2.4.6
  - medaka >=1.11 #bumped to get access to the inspect model tools
  - pilon==1.24
  - polypolish==0.6
  - quast==5.2.0 #installs perl-circos as a dependency...
  - nextflow==24.04.4 # used for parallelizing multiplex runs
--------------------------------------------------------------------------------
/conda_recipe/environment_open_versions.yml:
--------------------------------------------------------------------------------
name: testpima

channels:
  - bioconda
  - conda-forge

dependencies:
  - git
  - gawk
  - bedtools
  - biopython
  - blast
  - bwa
  - curl
  - flye
  - raven-assembler
  - mdutils
  - minimap2
  - multiprocess
  - mummer
  - kraken2
  - spades
  - pandas
  - pandoc
  - weasyprint
  - pango
  - pathos
  - python
  - pyfaidx
  - python_circos
  - r
  - r-hash
  - r-stringr
  - r-gridextra
  - r-optparse
  - samtools
  - varscan
  - qcat
  - medaka
  - pilon
  - polypolish
  - quast
  - nextflow
--------------------------------------------------------------------------------
/conda_recipe/meta.yaml:
--------------------------------------------------------------------------------
{% set name = "pima" %}
#{% set python = "3.8" %}
{% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %}

package:
  name: "{{ name|lower }}"
  version: "{{ data.get('version') }}"

source:
  # git_url: https://github.com/appliedbinf/MergedPima.git
  # git_tag: development
  path: ../


build:
  number: 1
  skip: True # [win]

requirements:
  host:
    - python >=3.9,<3.11 #,<3.9
    - pip

  run:
    - git==2.39.1
    - gawk==5.3
    - bedtools==2.31.1
    - biopython==1.84
    - blast==2.16
    - bwa==0.7.18
    - curl
    - flye==2.9.4
    - raven-assembler==1.8.3
    - mdutils==1.6.0
    - minimap2==2.28
    - multiprocess=0.70.16
    - mummer==3.23
    - kraken2==2.1.3
    - spades==4.0.0
    - pandas==2.2.2
    - pandoc==3.3
    - weasyprint==62.3 # pdf engine for pandoc
    - pango==1.50.14
    - pathos
    - python >=3.9,<3.11 #,<3.9
    - pyfaidx==0.8.1.2
    - python_circos ## will replace with the development version using pip in pima_install.sh script
    - r
    - r-hash #needed for pChunks
    - r-stringr #needed for pChunks
    - r-gridextra #needed for pChunks
    - r-optparse #needed for pChunks
    - samtools==1.18 #needs to be the conda-forge version and NOT the bioconda version
    - varscan==2.4.6
    - medaka >=1.11 #bumped to get access to the inspect model tools
    - pilon==1.24
    - polypolish==0.6
    - quast==5.2.0 #installs perl-circos as a dependency...
    - nextflow==24.04.4 # used for parallelizing multiplex runs

test:

  commands:
    - minimap2 -h
    - pima -h

about:
  home: https://github.com/appliedbinf/MergedPima
  license: MIT


--------------------------------------------------------------------------------
/conda_recipe/meta_open_versions.yaml:
--------------------------------------------------------------------------------
{% set name = "pima" %}
{% set python = "3.8" %}
{% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %}

package:
  name: "{{ name|lower }}"
  version: "{{ data.get('version') }}"

source:
  # git_url: https://github.com/appliedbinf/MergedPima.git
  # git_tag: development
  path: ../


build:
  number: 7
  skip: True # [win]

requirements:
  host:
    - python
    - pip

  run:
    - git
    - gawk
    - bedtools
    - biopython
    - blast
    - bwa
    - curl
    - flye
    - raven-assembler
    - mdutils
    - minimap2
    - multiprocess
    - mummer
    - kraken2
    - spades
    - pandas
    - pandoc
    - weasyprint
    - pango
    - pathos
    - python
    - pyfaidx
    - python_circos
    - r
    - r-hash
    - r-stringr
    - r-gridextra
    - r-optparse
    - samtools
    - varscan
    - qcat
    - medaka
    - pilon
    - polypolish
    - quast
    - nextflow

test:

  commands:
    - minimap2 -h
    - pima -h

about:
  home: https://github.com/appliedbinf/MergedPima
  license: MIT


--------------------------------------------------------------------------------
/conda_recipe/post-link.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

python -m pip install --no-deps dna_features_viewer si-prefix

# upgrade python_circos from the conda version that controls the dependencies, but lacks features
python -m pip install --force-reinstall --no-deps git+https://github.com/ponnhide/pyCircos.git

--------------------------------------------------------------------------------
/dockerbuild/Dockerfile:
--------------------------------------------------------------------------------
FROM mambaorg/micromamba:1.5.8 AS app

ARG PIMA_VER="2.1.1"

# 'LABEL' instructions tag the image with metadata that might be important to the user
LABEL base.image="mambaorg/micromamba:1.5.8"
LABEL dockerfile.version="1"
LABEL software="pima"
LABEL software.version="${PIMA_VER}"
LABEL description="Plasmid, Integrations, Mutations, and Antibiotic resistance annotation pipeline"
LABEL maintainer="Will Overholt"
LABEL maintainer.email="woverholt@asrtinc.com"

USER root

## Need to build the file from the primary pima dir
# cd ....pima
# docker build -t local/pima:2.1.0 -f dockerbuild/Dockerfile .
# cd ....singularity_images
# apptainer build pima2.1.0.sif docker-daemon://local/pima:2.1.0

# include required pima files
ADD conda_recipe/environment.yml environment.yml
ADD Pima Pima
ADD setup.py setup.py
ADD README.md README.md

# build run environment
RUN apt-get update && apt-get install -y --no-install-recommends \
    locales \
    locales-all \
    libpango-1.0-0 libpangoft2-1.0-0 libharfbuzz-subset0 \
    wget \
    procps \
    ca-certificates && \
    apt-get autoclean && rm -rf /var/lib/apt/lists/*

RUN micromamba install --name base -c conda-forge -c bioconda -f environment.yml && \
    micromamba clean -a -f -y && \
    mkdir /data

ENV PATH="/opt/conda/bin/:$PATH" \
    LC_ALL=C.UTF-8

# install pima
RUN python -m pip install -vv --no-deps --ignore-installed --no-cache-dir . && \
    python -m pip install --no-deps --no-cache-dir dna_features_viewer si-prefix && \
    python -m pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/ponnhide/pyCircos.git

CMD pima --help

WORKDIR /data

# "AS" uppercased to match the FROM casing (silences BuildKit's FromAsCasing lint warning)
FROM app AS test

WORKDIR /test

# test installation
RUN pima --help && \
    pima --version

# prep pima
RUN pima --download
--------------------------------------------------------------------------------
/pima:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import sys
from Pima.pima import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools
import os

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

with open(os.path.join("Pima", "VERSION"), "r", encoding="utf-8") as version_fp:
    VERSION = version_fp.read().strip()

setuptools.setup(
    name="pima",
    version=VERSION,
    author="Applied Bioinformatics Laboratory",
    author_email="woverholt@asrtinc.com",
    description="Genomic characterization pipeline for Bacillus anthracis",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/appliedbinf/pima",
    packages=setuptools.find_packages(),
    python_requires='>=3.8',
    package_data={
        "Pima": [
            "data/**",
            "VERSION",
            "nextflow_parallelization/**",
        ],
    },
    scripts = ['Pima/pima.py', 'Pima/accessory_scripts/building_pycircos_figures.py'],
    zip_safe=False,
    include_package_data=True,
    entry_points={
        # Optional: specify any entry points for your package here
        'console_scripts': [
            'pima = Pima.pima:main',
        ],
    },
)
--------------------------------------------------------------------------------