├── .gitignore ├── Pima ├── VERSION ├── __init__.py ├── accessory_scripts │ ├── MarkdownReport.py │ ├── __init__.py │ ├── building_pycircos_figures.py │ ├── pChunks.R │ └── sam2psl.py ├── data │ ├── ames_single_copy_genes.fna │ ├── amr.fasta │ ├── gene_drug.tsv │ ├── inc.fasta │ ├── pima.css │ └── reference_sequences │ │ └── Bacillus_anthracis │ │ ├── amr_appendices │ │ ├── beta-lactams.md │ │ ├── beta-lactams.png │ │ ├── macrolides.md │ │ ├── macrolides.png │ │ ├── quinolones.md │ │ ├── quinolones.png │ │ ├── tetracyclines.md │ │ └── tetracyclines.png │ │ ├── ba_virulence_genes.bed │ │ └── confirmed_amr_mutations.bed ├── modules │ ├── __init__.py │ ├── annotations.py │ ├── assembly.py │ ├── check_contamination.py │ ├── compare_to_ref.py │ ├── download_references.py │ ├── evaluate_assembly.py │ ├── fastq.py │ ├── illumina_polishing.py │ ├── multiplexed.py │ ├── ont_polishing.py │ ├── outdir.py │ ├── plasmids.py │ ├── report.py │ └── visualizations.py ├── nextflow_parallelization │ ├── main.nf │ ├── modules │ │ ├── copy_results.nf │ │ └── pima_singleplex.nf │ └── nextflow.config.template ├── pima.py ├── pima_colors.py ├── pima_data.py └── utils │ ├── __init__.py │ ├── cli.py │ ├── mapping.py │ ├── settings.py │ └── utils.py ├── README.md ├── conda_recipe ├── build.sh ├── environment.yml ├── environment_open_versions.yml ├── meta.yaml ├── meta_open_versions.yaml └── post-link.sh ├── dockerbuild └── Dockerfile ├── pima └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | #TEMP DEV 2 | Pima_refactor_notes.md 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # database files 10 | Pima/data/plasmids_and_vectors* 11 | Pima/data/kraken2* 12 | Pima/data/reference_sequences/Bacillus_anthracis/genome.fasta* 13 | 14 | # Nextflow 15 | Pima/nextflow_parallelization/nextflow.config 16 | 17 | # C extensions 18 | *.so 19 | 20 | # raven files? 
21 | raven.cereal 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # VSCODE 44 | .vscode/ 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 
112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # pdm 122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 123 | #pdm.lock 124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 125 | # in version control. 126 | # https://pdm.fming.dev/#use-with-ide 127 | .pdm.toml 128 | 129 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 130 | __pypackages__/ 131 | 132 | # Celery stuff 133 | celerybeat-schedule 134 | celerybeat.pid 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # Environments 140 | .env 141 | .venv 142 | env/ 143 | venv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
177 | #.idea/ 178 | -------------------------------------------------------------------------------- /Pima/VERSION: -------------------------------------------------------------------------------- 1 | 2.1.1 -------------------------------------------------------------------------------- /Pima/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/__init__.py -------------------------------------------------------------------------------- /Pima/accessory_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/accessory_scripts/__init__.py -------------------------------------------------------------------------------- /Pima/accessory_scripts/building_pycircos_figures.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from collections import defaultdict 5 | 6 | from pycircos import Garc, Gcircle 7 | import matplotlib.pyplot as plt 8 | import matplotlib.colors as colors 9 | import matplotlib.patches as mpatches 10 | 11 | 12 | 13 | class BuildCircosPlots: 14 | """Build circos plot for genome alignment, coverage data, and gene annotations 15 | 16 | Expects inputs that are generated by the main 'pima.py' pipeline. Creates a png image \ 17 | for each genetic element within the reference genome provided to pima. 
18 | 19 | Typical usage example: 20 | circos_elem = BuildCircosPlots(ge_name=ge_name, 21 | ge_size=int(ge_size), 22 | aln_file=aln_file, 23 | cov_file=cov_file, 24 | illumina_cov_file=illumina_cov_file, 25 | gene_file=gene_file, 26 | outdir=) 27 | circos_fig = circos_elem.main() 28 | circos_fig.save(file_name=f"{Path}/{ge_name}", format="png", dpi=300) 29 | 30 | """ 31 | def __init__(self, ge_name: str, ge_size: int, aln_file: str | None, outdir:str, cov_file: str | None,illumina_cov_file: str | None, gene_file: str | None, legend = True) -> None: 32 | """Initializing data to build the circos plots 33 | 34 | Args: 35 | ge_name (str): Name of the genetic element being plotted (chromosome, plasmid, etc..) 36 | ge_size (int): Length of the genetic element (bp) 37 | aln_file (str, optional): Path to the genome:genome alignment file, generated by dnadiff & filtered 38 | outdir (str): Path to save the results 39 | cov_file (str, optional): Path to the ONT coverage data, filtered for columns "sequenceID\tpos\tcoverage (cut -f1,2,4 from mpileup). Defaults to None | str. 40 | illumina_cov_file (str, optional): Path to the illumina coverage data, filtered for columns "sequenceID\tpos\tcoverage (cut -f1,2,4 from mpileup. Defaults to None | str. 41 | gene_file (str, optional): Path to a bed file indicating which genes to draw on the reference genome map. Defaults to None | str. 
42 | legend (bool, optional): Generate a legend (T / F) 43 | Returns: 44 | None 45 | """ 46 | 47 | self.ge_name = ge_name 48 | self.ge_size = ge_size 49 | self.aln_file = aln_file 50 | self.outdir = outdir 51 | self.cov_file = cov_file 52 | self.illumina_cov_file = illumina_cov_file 53 | self.gene_file = gene_file 54 | self.legend = legend 55 | 56 | 57 | 58 | #instantiate dictionaries that contain data for each of the plotting steps 59 | self.aln_dict = defaultdict(dict) 60 | self.n_colors = defaultdict(dict) 61 | self.cov_dict = defaultdict(dict) 62 | self.illumina_cov_dict = defaultdict(dict) 63 | self.ge_circle = Gcircle() 64 | 65 | def build_reference_genome(self) -> None: 66 | """Generates the reference genome backbone 67 | 68 | Returns: 69 | None 70 | """ 71 | elem_arc = Garc(arc_id = self.ge_name, 72 | size = self.ge_size, 73 | raxis_range = [600, 700], 74 | facecolor="#FFFFFF", 75 | linewidth=1, 76 | edgecolor="#000000", 77 | label_visible=False, 78 | interspace=0 79 | ) 80 | self.ge_circle.add_garc(elem_arc) 81 | self.ge_circle.set_garcs() 82 | 83 | def add_tickmarks(self) -> None: 84 | """Places logical tick marks indicating genome element size 85 | 86 | Returns: 87 | None 88 | """ 89 | # use different spacing for the chromosome vs plasmids 90 | if self.ge_size > 2000000: 91 | tick_major_positions = list(range(0,self.ge_size,1000000)) 92 | tick_minor_positions = list(range(0,self.ge_size,200000)) 93 | tick_labels = [f"{int(i/1000000)} Mb" for i in tick_major_positions] 94 | else: 95 | tick_major_positions = list(range(0, self.ge_size,20000)) 96 | tick_minor_positions = list(range(0,self.ge_size,5000)) 97 | tick_labels = [f"{int(i/1000)} kb" for i in tick_major_positions] 98 | 99 | #major tick marks with labels 100 | self.ge_circle.tickplot(garc_id=self.ge_name, 101 | raxis_range=(550,600), 102 | tickpositions=tick_major_positions, 103 | tickdirection="inner", 104 | ticklabels=tick_labels, 105 | ticklabelmargin=10 106 | ) 107 | # minor tick marks 108 | 
self.ge_circle.tickplot(garc_id=self.ge_name, 109 | raxis_range=(550,600), 110 | tickpositions=tick_minor_positions, 111 | tickdirection="inner", 112 | tickwidth=0.5 113 | ) 114 | 115 | def build_alignment_plot(self) -> None: 116 | """Build garcs indicating which portions of the query genome align to the reference genome backbone 117 | 118 | Can show up to 20 colors representing query genome contigs before repeating. 119 | """ 120 | if self.aln_file is None: 121 | return 122 | 123 | self.aln_dict["start"] = [] 124 | self.aln_dict["len"] = [] 125 | self.aln_dict["contig"] = [] 126 | self.aln_dict["color"] = [] 127 | 128 | with open(self.aln_file,"r") as fin: 129 | for line in fin: 130 | ref_name, ref_start, ref_end, qname = line.rstrip().split("\t") 131 | aln_len = int(ref_end) - int(ref_start) 132 | 133 | self.aln_dict["start"].append(int(ref_start)) 134 | self.aln_dict["len"].append(int(aln_len)) 135 | self.aln_dict["contig"].append(qname) 136 | 137 | ## Generate different colors for different chromosomes 138 | qual_colors_20 = plt.colormaps['tab20'] 139 | uniq_contigs = [*set(self.aln_dict['contig'])] 140 | self.n_colors['n_contigs'] = len(uniq_contigs) 141 | self.n_colors['contig_names'] = uniq_contigs 142 | self.n_colors['colors'] = [] 143 | for i, contig_id in enumerate(uniq_contigs): 144 | self.n_colors['colors'].append(colors.rgb2hex(qual_colors_20(i))) 145 | 146 | ## Add colors to the alignment dictionary 147 | for contig_name in self.aln_dict['contig']: 148 | i = self.n_colors['contig_names'].index(contig_name) 149 | self.aln_dict['color'].append(self.n_colors['colors'][i]) 150 | 151 | ## build the barplots that map the genome aligment onto the chromosome plot 152 | if not len(self.aln_dict['start']) == 0: 153 | self.ge_circle.barplot(self.ge_name, 154 | data = [1]*len(self.aln_dict["start"]), 155 | positions=self.aln_dict["start"], 156 | width=self.aln_dict["len"], 157 | raxis_range=[600,700], 158 | facecolor=self.aln_dict["color"], 159 | linewidth=0) 160 
| 161 | def build_coverage_plot(self) -> None: 162 | """Generates a garc fillplot of the ONT coverage data""" 163 | if self.cov_file is None: 164 | return 165 | 166 | self.cov_dict["pos"] = [] 167 | self.cov_dict["cov"] = [] 168 | with open(self.cov_file, 'r') as fin: 169 | for line in fin: 170 | element, pos, cov, = line.rstrip().rsplit("\t") 171 | self.cov_dict["pos"].append(int(pos)) 172 | self.cov_dict["cov"].append(int(cov)) 173 | 174 | if not len(self.cov_dict['pos']) == 0: 175 | self.ge_circle.fillplot(self.ge_name, 176 | data = self.cov_dict["cov"], 177 | positions=self.cov_dict["pos"], 178 | raxis_range=[701,850], 179 | base_value=0, 180 | rlim=(0,max(self.cov_dict["cov"])), 181 | facecolor="#808080") 182 | 183 | def build_illumina_coverage_plot(self) -> None: 184 | """Generates a garc fillplot of the Illumina coverage data""" 185 | if self.illumina_cov_file is None: 186 | return 187 | 188 | self.illumina_cov_dict["pos"] = [] 189 | self.illumina_cov_dict["cov"] = [] 190 | with open(self.illumina_cov_file, 'r') as fin: 191 | for line in fin: 192 | element, pos, cov, = line.rstrip().rsplit("\t") 193 | self.illumina_cov_dict["pos"].append(int(pos)) 194 | self.illumina_cov_dict["cov"].append(int(cov)) 195 | 196 | if not len(self.illumina_cov_dict['pos']) == 0: 197 | self.ge_circle.fillplot(self.ge_name, 198 | data = self.illumina_cov_dict["cov"], 199 | positions=self.illumina_cov_dict["pos"], 200 | raxis_range=[851,1000], 201 | base_value=0, 202 | rlim=(0,max(self.illumina_cov_dict["cov"])), 203 | facecolor="#808080") 204 | 205 | def add_cov_labels(self) -> None: 206 | """Generates the middle text indicating genetic element name (from the reference) and coverage statistics""" 207 | self.ge_circle.ax.text(0.5,0.53,f"{self.ge_name}", fontsize=20, 208 | transform=self.ge_circle.ax.transAxes, 209 | ha='center') 210 | 211 | if (self.cov_file is not None and len(self.cov_dict['cov']) != 0): 212 | min_cov = min(self.cov_dict['cov']) 213 | max_cov = 
max(self.cov_dict['cov']) 214 | avg_cov = round(sum(self.cov_dict['cov']) / len(self.cov_dict['cov']), 1) 215 | self.ge_circle.ax.text(0.5,0.45, 216 | f"ONT Average Coverage: {avg_cov}\nONT Minimum Coverage: {min_cov}\nONT Maximum Coverage: {max_cov}", 217 | fontsize=10, 218 | transform=self.ge_circle.ax.transAxes, 219 | ha='center') 220 | if (self.illumina_cov_file is not None and len(self.illumina_cov_dict['cov']) != 0): 221 | illumina_min_cov = min(self.illumina_cov_dict['cov']) 222 | illumina_max_cov = max(self.illumina_cov_dict['cov']) 223 | illumina_avg_cov = round(sum(self.illumina_cov_dict['cov']) / len(self.illumina_cov_dict['cov']), 1) 224 | self.ge_circle.ax.text(0.5,0.37, 225 | f"Illumina Average Coverage: {illumina_avg_cov}\n" + 226 | f"Illumina Minimum Coverage: {illumina_min_cov}\n" + 227 | f"Illumina Maximum Coverage: {illumina_max_cov}", 228 | fontsize=10, 229 | transform=self.ge_circle.ax.transAxes, 230 | ha='center') 231 | 232 | def add_gene_loc(self) -> None: 233 | """Draws gene annotations onto the reference backbone given a bed file""" 234 | if self.gene_file is None: 235 | return 236 | 237 | gene_dict = defaultdict(dict) 238 | label_pos = [] 239 | label_id = [] 240 | gene_dict['pos'] = [] 241 | gene_dict['width'] = [] 242 | with open(self.gene_file) as fin: 243 | for line in fin: 244 | elem_name, start, stop, gene_name = line.rstrip().split("\t") 245 | if elem_name == self.ge_name: 246 | start = int(start) 247 | width = int(stop)-start-1 248 | gene_dict['pos'].append(start) 249 | gene_dict['width'].append(width) 250 | label_pos.append(int(round(start + (width / 2),0))) 251 | label_id.append(f"$\\it\u007b{gene_name}\u007d$") 252 | 253 | if len(gene_dict['pos']) > 0: 254 | self.ge_circle.barplot(garc_id = self.ge_name, 255 | data=[1]*len(gene_dict['pos']), 256 | positions=gene_dict['pos'], 257 | width=gene_dict['width'], 258 | raxis_range=(600,700), 259 | facecolor="#FF000040", 260 | edgecolor="#000000", 261 | linewidth=1) 262 | 263 | 
self.ge_circle.tickplot(garc_id = self.ge_name, 264 | tickpositions=label_pos, 265 | ticklabels=label_id, 266 | tickdirection="inner", 267 | tickcolor="#FF0000", 268 | ticklabelcolor="#FF0000", 269 | ticklabelmargin=20) 270 | 271 | def build_legend(self) -> None: 272 | """Generates the legend showing: genome:genome alignments, contig IDs, and the coverage data""" 273 | leg = [] 274 | leg.append(mpatches.Patch(facecolor = "#FFFFFF", edgecolor= "#000000", label = "No alignment to reference")) 275 | #leg.append(mpatches.Patch(facecolor = "#1f77b4", label = ">98%% sequence alignment to $\it{Ba}$ Ames Ancestor")) 276 | #leg.append(mpatches.Patch(facecolor = "#808080", label = 'Inner: ONT Coverage across $\it{Ba}$ Ames Ancestor')) 277 | #leg.append(mpatches.Patch(facecolor = "#808080", label = "Outer: Illumina Coverage across $\it{Ba}$ Ames Ancestor")) 278 | 279 | if not len(self.n_colors) == 0: 280 | for i, contig in enumerate(self.n_colors['contig_names']): 281 | leg_patch = mpatches.Patch(color = self.n_colors['colors'][i], label = contig) 282 | leg.append(leg_patch) 283 | 284 | #both illumina and ONT coverage data 285 | if len(self.cov_dict['cov']) != 0 and len(self.illumina_cov_dict['cov']) != 0: 286 | leg.append(mpatches.Patch(facecolor = "#808080", label = 'Inner: ONT Coverage across reference')) 287 | leg.append(mpatches.Patch(facecolor = "#808080", label = "Outer: Illumina Coverage across reference")) 288 | 289 | # only ONT 290 | elif len(self.cov_dict['cov']) != 0: 291 | leg.append(mpatches.Patch(facecolor = "#808080", label = 'ONT Coverage across reference')) 292 | 293 | # only Illumina 294 | elif len(self.cov_dict['cov']) == 0 and len(self.illumina_cov_dict['cov']) != 0: 295 | leg.append(mpatches.Patch(facecolor = "#808080", label = 'Illumina Coverage across reference')) 296 | 297 | #self.ge_circle.ax.legend(handles=leg, prop={'size': 9}, bbox_to_anchor=(0.18,0.11)) #loc=3 298 | self.ge_circle.figure.legend(handles=leg, prop={'size': 9}, loc=3) 299 | 300 | 
def main(self) -> Gcircle: 301 | """Executes each method in the BuildCircosPlots class depending on data inputs 302 | 303 | Returns: 304 | Gcircle: A class object containing all the data to be visualized 305 | """ 306 | self.build_reference_genome() 307 | self.add_tickmarks() 308 | self.build_alignment_plot() 309 | self.build_coverage_plot() 310 | self.build_illumina_coverage_plot() 311 | self.add_cov_labels() 312 | self.add_gene_loc() 313 | if self.legend: 314 | self.build_legend() 315 | return self.ge_circle 316 | 317 | 318 | if __name__ == "__main__": 319 | import argparse 320 | import sys 321 | import re 322 | import subprocess 323 | 324 | parser = argparse.ArgumentParser() 325 | # add input for reference genome only 326 | parser.add_argument("-i", "--input_dir", help="Output directory generated by a previous pima (v1.4 or higher) run") 327 | parser.add_argument("-r", "--reference_genome", help="Input reference genome to draw specific gene locations on") 328 | parser.add_argument("-o", "--output_dir", help="Where to save the results") 329 | parser.add_argument("-g", "--gene_file", required=False, 330 | help="bed file format with the location of the genes to draw in the reference coordinates format") 331 | parser.add_argument("--image_format", help = "The output image type, support is determined by the matplotlib backend: 'png', 'pdf', 'ps', 'eps', and 'svg' should all work", 332 | required = False, default='png') 333 | 334 | args = parser.parse_args() 335 | 336 | if args.input_dir and args.reference_genome: 337 | print("Please specify either -i OR -r. 
These are mutually exclusive flags") 338 | sys.exit(0) 339 | 340 | 341 | outdir = args.output_dir 342 | if not os.path.exists(outdir): 343 | os.makedirs(outdir) 344 | 345 | 346 | def validate_files(var_fp): 347 | if not os.path.isfile(var_fp): 348 | var_fp = None 349 | return var_fp 350 | else: 351 | return var_fp 352 | 353 | ## Building circos plots to represent specific gene functions on a reference genome 354 | if args.reference_genome: 355 | reference_genome = args.reference_genome 356 | reference_sizes = os.path.join(outdir, "reference.sizes") 357 | 358 | # build the reference.sizes file 359 | command = ' '.join(['faidx -i chromsizes', reference_genome, ' | sort -k 1,1 -k 2,2n']) 360 | reference_sizes = [x for x in re.split(r'\n', subprocess.check_output(command, shell = True).decode('utf-8')) if x] 361 | 362 | # add the gene file 363 | gene_file = args.gene_file 364 | 365 | ## Manually constructing the Gcircle object because I want to modify some of the steps 366 | for element in reference_sizes: 367 | ge_name, ge_size = element.rsplit() 368 | 369 | circos_elem = BuildCircosPlots(ge_name=ge_name, 370 | aln_file=None, 371 | cov_file = None, 372 | illumina_cov_file = None, 373 | ge_size=int(ge_size), 374 | gene_file=gene_file, 375 | legend=False, 376 | outdir=None) 377 | circos_fig = circos_elem.main() 378 | 379 | # a hack - but we want the reference genome to be filled for these figures instead of blank 380 | circos_elem.ge_circle.barplot(ge_name, 381 | data = [1], 382 | positions=[int(1)], 383 | width=[int(ge_size)-1], 384 | raxis_range=[600,700], 385 | facecolor="#1f77b4", 386 | linewidth=0) 387 | circos_fig.save(file_name=f"{outdir}/{ge_name}", format=args.image_format , dpi=300) 388 | 389 | ## Building custom pima plots 390 | else: 391 | with open(f"{args.input_dir}/insertions/reference.sizes", "r") as fin: 392 | for line in fin: 393 | ge_name, ge_size = line.rstrip().rsplit() 394 | aln_file = f"{args.input_dir}/circos/{ge_name}/alignment.txt" 395 | 
cov_file = f"{args.input_dir}/circos/{ge_name}/coverage.mpileup" 396 | illumina_cov_file = f"{args.input_dir}/circos/{ge_name}/illumina_coverage.mpileup" 397 | gene_file = args.gene_file 398 | 399 | 400 | aln_file = validate_files(aln_file) 401 | cov_file = validate_files(cov_file) 402 | illumina_cov_file = validate_files(illumina_cov_file) 403 | 404 | circos_elem = BuildCircosPlots(ge_name=ge_name, 405 | ge_size=int(ge_size), 406 | aln_file=aln_file, 407 | cov_file=cov_file, 408 | illumina_cov_file=illumina_cov_file, 409 | gene_file=gene_file, 410 | outdir=None) 411 | 412 | circos_fig = circos_elem.main() 413 | 414 | circos_fig.save(file_name=f"{outdir}/{ge_name}", format="png" , dpi=300) 415 | 416 | """ 417 | for dir in os.scandir("../pima_downsample_ont/"): 418 | if dir.is_dir(): 419 | dirname = os.path.basename(dir) 420 | with open(f"{dir.path}/insertions/reference.sizes", "r") as fin: 421 | for line in fin: 422 | ge_name, ge_size = line.rstrip().rsplit() 423 | aln_file = f"{dir.path}/circos/{ge_name}/alignment.txt" 424 | cov_file = f"{dir.path}/circos/{ge_name}/coverage.mpileup" 425 | illumina_cov_file = f"{dir.path}/circos/{ge_name}/illumina_coverage.mpileup" 426 | gene_file = "/scicomp/home-pure/tsz0/Projects/devPima/MergedPima/data/ba_virulence_genes.bed" 427 | #outdir = f"{dir.path}/pima_out/20230308_Minion_TM_01/Sterne-CLR1-2/circos/{ge_name}" 428 | circos_elem = BuildCircosPlots(ge_name=ge_name, 429 | ge_size=int(ge_size), 430 | aln_file=aln_file, 431 | cov_file=cov_file, 432 | illumina_cov_file=illumina_cov_file, 433 | gene_file=gene_file, 434 | outdir=None) 435 | circos_fig = circos_elem.main() 436 | circos_fig.save(file_name=f"{dirname}_{ge_name}", format="png" , dpi=300) 437 | """ -------------------------------------------------------------------------------- /Pima/data/pima.css: -------------------------------------------------------------------------------- 1 | html { 2 | line-height: 1.5; 3 | font-family: Georgia, serif; 4 | font-size: 20px; 5 | 
color: #1a1a1a; 6 | background-color: #fdfdfd; 7 | } 8 | body { 9 | margin: 0 auto; 10 | max-width: 50em; 11 | padding-left: 20px; 12 | padding-right: 20px; 13 | padding-top: 20px; 14 | padding-bottom: 20px; 15 | hyphens: auto; 16 | overflow-wrap: break-word; 17 | font-kerning: normal; 18 | } 19 | @media print { 20 | body { 21 | background-color: transparent; 22 | color: black; 23 | font-size: 10pt; 24 | } 25 | p, h2, h3 { 26 | orphans: 3; 27 | widows: 3; 28 | } 29 | h2, h3, h4 { 30 | page-break-after: avoid; 31 | } 32 | } 33 | p { 34 | margin: 1em 0; 35 | } 36 | a { 37 | color: #3333FF; 38 | } 39 | a:visited { 40 | color: #1a1a1a; 41 | } 42 | img { 43 | margin: auto; 44 | display: block; 45 | max-width: 70%; 46 | } 47 | h1, h2, h3, h4, h5, h6 { 48 | margin-top: 1.4em; 49 | } 50 | h5, h6 { 51 | font-size: .8em; 52 | font-style: italic; 53 | } 54 | h6 { 55 | font-weight: normal; 56 | } 57 | ol, ul { 58 | padding-left: 1.7em; 59 | margin-top: 1em; 60 | } 61 | li > ol, li > ul { 62 | margin-top: 0; 63 | } 64 | blockquote { 65 | margin: 1em 0 1em 1.7em; 66 | padding-left: 1em; 67 | border-left: 2px solid #e6e6e6; 68 | color: #606060; 69 | } 70 | code { 71 | font-family: Menlo, Monaco, 'Lucida Console', Consolas, monospace; 72 | font-size: 85%; 73 | margin: 0; 74 | } 75 | pre { 76 | margin: 1em 0; 77 | overflow: auto; 78 | } 79 | pre code { 80 | padding: 0; 81 | overflow: visible; 82 | overflow-wrap: normal; 83 | } 84 | .sourceCode { 85 | background-color: transparent; 86 | overflow: visible; 87 | } 88 | hr { 89 | background-color: #1a1a1a; 90 | border: none; 91 | height: 1px; 92 | margin: 1em 0; 93 | } 94 | table { 95 | border-collapse: collapse; 96 | font-variant-numeric: lining-nums tabular-nums; 97 | font-size: 80%; 98 | } 99 | table caption { 100 | margin-bottom: 0.75em; 101 | } 102 | tbody { 103 | margin-top: 0.5em; 104 | border-top: 1px solid #1a1a1a; 105 | border-bottom: 1px solid #1a1a1a; 106 | text-align: center; 107 | } 108 | th { 109 | border-top: 1px solid 
#1a1a1a; 110 | padding: 0.25em 0.5em 0.25em 0.5em; 111 | } 112 | td { 113 | padding: 0.125em 0.5em 0.25em 0.5em; 114 | } 115 | 116 | tr:nth-child(even) {background: #CCC} 117 | tr:nth-child(odd) {background: #FFF} 118 | 119 | header { 120 | margin-bottom: 4em; 121 | text-align: center; 122 | } 123 | #TOC li { 124 | list-style: none; 125 | } 126 | #TOC ul { 127 | padding-left: 1.3em; 128 | } 129 | #TOC > ul { 130 | padding-left: 0; 131 | } 132 | #TOC a:not(:hover) { 133 | text-decoration: none; 134 | } 135 | code{white-space: pre-wrap;} 136 | span.smallcaps{font-variant: small-caps;} 137 | span.underline{text-decoration: underline;} 138 | div.column{display: inline-block; vertical-align: top; width: 50%;} 139 | div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} 140 | ul.task-list{list-style: none;} 141 | .display.math{display: block; text-align: center; margin: 0.5rem auto;} 142 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/beta-lactams.md: -------------------------------------------------------------------------------- 1 | ### Appendix LETTER: Location of β-Lactams AMR associated genes within Bacillus anthracis str. Ames Ancestor Chromosome 2 | 3 | **Biothreat pathogen:** Bacillus anthracis str. Ames Ancestor NC_007530.2 4 | **Antibiotic Class:** Beta-lactams 5 | **Antibiotics:** Penicillin 6 | **AMR genes description:** 7 | - rsiP: Sigma-70 family RNA polymerase sigma factor 8 | 9 | **Gene location:** 10 | - 2,323,269 – 2,324,096 11 | 12 | ![Chromosomal location](beta-lactams.png) 13 | 14 | | Gene | AMR description | Manuscript | 15 | | :------ | :------ | :------ | 16 | | rsiP | Mutations leading to truncation of RsiP have been described as a basis for PEN resistance. 
| https://pubmed.ncbi.nlm.nih.gov/30574557/ https://pubmed.ncbi.nlm.nih.gov/19717606/ https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3923885/ | 17 | | sigP | Mutations in the sigP gene has been associated with PEN resistance. |  https://pubmed.ncbi.nlm.nih.gov/30574557/ | 18 |
19 | 20 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/beta-lactams.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/beta-lactams.png -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/macrolides.md: -------------------------------------------------------------------------------- 1 | ### Appendix LETTER: Location of macrolides (rplV) AMR associated genes within Bacillus anthracis Ames Ancestor Chromosome 2 | 3 | **Biothreat pathogen:** Bacillus anthracis str. Ames Ancestor NC_007530.2 4 | **Antibiotic Class:** Macrolides 5 | **Antibiotics:** Clarithromycin 6 | **AMR genes description:** 7 | - rplV: 50S ribosomal protein L22 8 | 9 | **Gene location:** 10 | - 124,092-124,433 11 | 12 | ![Chromosomal location](macrolides.png) 13 | 14 | | Gene | AMR description | Manuscript | 15 | | :------ | :------ | :------ | 16 | |rplV | It has been reported that a 27-nucleotide repeat sequence insertion in the rplV gene induced a specific resistance to macrolide antibiotics. | https://pubmed.ncbi.nlm.nih.gov/29899844/ | 17 |
18 | 19 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/macrolides.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/macrolides.png -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/quinolones.md: -------------------------------------------------------------------------------- 1 | ### Appendix LETTER: Location of quinolones AMR associated genes within Bacillus anthracis str. Ames Ancestor Chromosome 2 | 3 | **Biothreat pathogen:** Bacillus anthracis str. Ames Ancestor NC_007530.2 4 | **Antibiotic Class:** Quinolones 5 | **Antibiotics:** Ciprofloxacin, ofloxacin, levofloxacin, and moxifloxacin 6 | **AMR genes description:** 7 | - gyrA: DNA gyrase subunit A 8 | - gyrB: DNA topoisomerase (ATP-hydrolyzing) subunit B 9 | - parC: DNA topoisomerase IV subunit A 10 | - parE: DNA topoisomerase IV subunit B 11 | - tetR: TetR family transcriptional regulator 12 | 13 | **Gene location:** 14 | - gyrA: 6,595 -9,066 15 | - gyrB: 4,584 - 6,506 16 | - parC: 3,362,705 - 3,365,128 17 | - parE: 3,365,130 - 3,367,094 18 | - GBAA_RS04545 or tetR: 842,403 – 842,981 19 | 20 | ![Chromosomal location](quinolones.png) 21 | 22 | | Gene | AMR description | Manuscript | 23 | | :------ | :------ | :------ | 24 | | gyrA | Mutations in the gyrA gene have been associated with resistance to certain classes of antibiotics, particularly fluoroquinolones. | https://pubmed.ncbi.nlm.nih.gov/12821500/ | 25 | | gyrB | Mutations in the gyrB gene can also lead to resistance against fluoroquinolone antibiotics. However, compared to the gyrA gene, mutations in gyrB are less commonly associated with antibiotic resistance. 
| https://pubmed.ncbi.nlm.nih.gov/15190035/ | 26 | | parC | Mutations in the parC gene can lead to resistance against fluoroquinolone antibiotics, similar to the gyrA and gyrB genes.Mutations in the parC gene can lead to resistance against fluoroquinolone antibiotics, similar to the gyrA and gyrB genes. | https://pubmed.ncbi.nlm.nih.gov/32273351/ | 27 | | parE | Similar to the gyrA, gyrB, and parC genes, mutations in the parE gene can also contribute to antibiotic resistance, particularly against fluoroquinolone antibiotics. | https://www.osti.gov/servlets/purl/1117920 | 28 | | GBAA_RS04545 or tetR | TetR-type transcriptional regulators have been described as a novel "mutation hot spot" that leads to the increased expression of multidrug efflux systems for CIP resistance. | https://pubmed.ncbi.nlm.nih.gov/20385868/ https://pubmed.ncbi.nlm.nih.gov/32273351/ | 29 |
30 | 31 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/quinolones.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/quinolones.png -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/tetracyclines.md: -------------------------------------------------------------------------------- 1 | ### Appendix LETTER: Location of tetracyclines AMR associated genes within Bacillus anthracis str. Ames Ancestor Chromosome 2 | 3 | **Biothreat pathogen:** Bacillus anthracis str. Ames Ancestor NC_007530.2 4 | **Antibiotic Class:** Tetracyclines 5 | **Antibiotics:** Tetracycline 6 | **AMR genes description:** 7 | - tetA: Tetracycline resistance MFS efflux pump 8 | - tetMWOS: TetM/W/O/S family tetracycline resistance ribosomal protection protein 9 | - rpsJ: 30S ribosomal protein S10 10 | 11 | **Gene location:** 12 | - tetA: 843,192 – 844,394 13 | - tetMWOS: 2,805,522 - 2,807,465 14 | - rpsJ: 120,962 – 121,270 15 | 16 | ![Chromosomal location](tetracyclines.png) 17 | 18 | | Gene | AMR description | Manuscript | 19 | | :------ | :------ | :------ | 20 | | tetA | Mutations in the tetA gene, which encodes a tetracycline efflux protein, have been associated with tetracycline resistance. | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2847397/ https://pubmed.ncbi.nlm.nih.gov/14702405/ | 21 | | tetMWOS | Mechanisms of tetracycline resistance involving ribosomal protection proteins have established a correlation between mutations in tetracycline binding sites and changes in MIC data. 
|  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3576927/ | 22 | | rpsJ | Mutations in the rpsJ, encoding changes or deletions in residues 53–60 in the 30S ribosomal subunit protein S10, have been linked to tetracycline or tigecycline resistance in in vitro studies with Gram-positive bacteria |  https://pubmed.ncbi.nlm.nih.gov/26989065/ | 23 |
24 | 25 | -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/tetracyclines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/data/reference_sequences/Bacillus_anthracis/amr_appendices/tetracyclines.png -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/ba_virulence_genes.bed: -------------------------------------------------------------------------------- 1 | pX01 143779 146073 pagA 2 | pX01 149357 151786 lef 3 | pX01 122608 125010 cya 4 | pX02 48928 50376 acpB 5 | pX02 52147 52290 capE 6 | pX02 52305 53891 capD 7 | pX02 53888 55123 capA 8 | pX02 55135 55584 capC 9 | pX02 55599 56993 capB 10 | pX02 68439 69890 acpA -------------------------------------------------------------------------------- /Pima/data/reference_sequences/Bacillus_anthracis/confirmed_amr_mutations.bed: -------------------------------------------------------------------------------- 1 | #contig start stop name type drug priority note 2 | chromosome 2322815 2322816 sigP snp beta-lactam confirmed_location A to G mutation at this location in sigP has been shown to confer penicillin resistance in a lab setting. 3 | chromosome 2323276 2323277 rsiP indel beta-lactam confirmed_location A single deletion in a homopolyer region that is part of the end of sigP and beginning of rsiP (sigma / antisigma factors) has been shown to confer penicillin resistance in a lab setting. 4 | chromosome 2323283 2323284 rsiP indel beta-lactam confirmed_location The insertion of G has been shown to confer penicillin resistance in a lab setting. 5 | chromosome 2323306 2323307 rsiP indel beta-lactam confirmed_location Deletion of A has been shown to confer penicillin resistance in a lab setting. 
6 | chromosome 2323738 2323739 rsiP snp beta-lactam confirmed_location SNP in this location has been shown to confer penicillin resistance in a lab setting. 7 | chromosome 2323738 2323739 rsiP indel beta-lactam confirmed_location The insertion of G has been shown to confer penicillin resistance in a lab setting. 8 | chromosome 2323919 2323925 rsiP indel beta-lactam confirmed_location A 7bp deletion in this location has been shown to confer penicillin resistance in a lab setting. 9 | chromosome 2324040 2324078 rsiP indel beta-lactam confirmed_location Single A deletion has been shown to confer penicillin resistance in a lab setting. 10 | chromosome 124375 124396 rplV indel macrolide confirmed_location A 24bp insertion in rplV has been shown to confer macrolide resistance in a lab setting. 11 | chromosome 124351 124363 rplV indel macrolide confirmed_location A 12bp insertion in rplV has been shown to confer macrolide resistance in a lab setting. 12 | chromosome 5053 5054 gyrB snp quinolones confirmed_location A to G mutation in gyrB has been shown to confer quinolone resistance in a lab setting. 13 | chromosome 5892 5893 gyrB snp quinolones confirmed_location A to G mutation in gyrB has been shown to confer quinolone resistance in a lab setting. 14 | chromosome 6006 6007 gyrB snp quinolones confirmed_location A G to A mutation in gyrB has been shown to confer quinolone resistance in a lab setting. 15 | chromosome 6847 6848 gyrA snp quinolones confirmed_location C to T mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 16 | chromosome 6849 6850 gyrA snp quinolones confirmed_location G to C mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 17 | chromosome 6850 6851 gyrA snp quinolones confirmed_location C to G mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 
18 | chromosome 6858 6859 gyrA snp quinolones confirmed_location G to A mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 19 | chromosome 6859 6860 gyrA snp quinolones confirmed_location A to C mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 20 | chromosome 6860 6861 gyrA snp quinolones confirmed_location A to G mutation in gyrA has been shown to confer quinolone resistance in a lab setting. 21 | chromosome 3362946 3362947 parC snp quinolones confirmed_location C to either T or A mutations in parC have been shown to confer quinolone resistance in lab settings 22 | chromosome 3362947 3362948 parC snp quinolones confirmed_location C to T mutation in parC have been shown to confer quinolone resistance in lab settings 23 | chromosome 3362950 3362951 parC snp quinolones confirmed_location T to C mutation in parC have been shown to confer quinolone resistance in lab settings 24 | chromosome 3362957 3362958 parC snp quinolones confirmed_location G to A mutation in parC have been shown to confer quinolone resistance in lab settings 25 | chromosome 3362992 3362993 parC snp quinolones confirmed_location Any mutation in parC at this location has been shown to confer quinolone resistance in lab setting 26 | chromosome 3364886 3364887 parC snp quinolones confirmed_location G to A mutation in parC has been shown to confer quinolone resistance in lab settings 27 | chromosome 3365453 3365454 parC snp quinolones confirmed_location G to T mutation in parC has been shown to confer quinolone resistance in lab settings 28 | chromosome 3366419 3366420 parE snp quinolones confirmed_location Any mutation in parE at this location has been shown to confer quinolone resistance in lab setting 29 | chromosome 748008 842715 tetR large-indel quinolones confirmed_location Multiple large deletions in this region have been shown to confer quinolone resistance under lab conditions. 
30 | chromosome 842290 842291 tetR indel quinolones confirmed_location Single T deletion has been shown to confer quinolone resistance in a lab setting 31 | chromosome 842290 842291 tetR indel quinolones confirmed_location A 5 bp deletion has been shown to confer quinolone resistance in a lab setting 32 | chromosome 842398 842399 tetR indel quinolones confirmed_location A 2 bp insertion has been shown to confer quinolone resistance in a lab setting 33 | chromosome 842517 842518 tetR snp quinolones confirmed_location C to T mutation has been shown to confer quinolone resistance in lab settings 34 | chromosome 842613 842614 tetR indel quinolones confirmed_location An 11 bp insertion has been shown to confer quinolone resistance in lab settings 35 | chromosome 842709 842710 tetR indel quinolones confirmed_location A 2 bp insertion has been shown to confer quinolone resistance in a lab setting 36 | chromosome 842714 842715 tetR indel quinolones confirmed_location A 1 bp deletion has been shown to confer quinolone resistance in a lab setting 37 | chromosome 108732 108733 rpoB snp rifamycim confirmed_location G to T mutation has been shown to confer rifamycim resistance in lab settings 38 | chromosome 2322745 2323278 sigP any beta-lactam potential_confer_amr While this mutation has not been observed before, other mutations in this region confer beta-lactam resistance under laboratory conditions 39 | chromosome 2323268 2324095 rsiP any beta-lactam potential_confer_amr While this mutation has not been observed before, other mutations in this region confer beta-lactam resistance under laboratory conditions 40 | chromosome 124091 124432 rplV any macrolide potential_confer_amr While this mutation has not been observed before, other mutations in this region confer macrolide resistance under laboratory conditions 41 | chromosome 4583 6505 gyrB any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone 
resistance under laboratory conditions 42 | chromosome 6594 9065 gyrA any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone resistance under laboratory conditions 43 | chromosome 3362704 3365127 parC any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone resistance under laboratory conditions 44 | chromosome 3365129 3367093 parE any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone resistance under laboratory conditions 45 | chromosome 842402 842980 tetR any quinolones potential_confer_amr While this mutation has not been observed before, other mutations in this region confer quinolone resistance under laboratory conditions 46 | chromosome 108390 111923 rpoB any rifamycim potential_confer_amr While this mutation has not been observed before, other mutations in this region confer rifamycim resistance under laboratory conditions 47 | -------------------------------------------------------------------------------- /Pima/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .download_references import ( 2 | validate_download, 3 | validate_organism, 4 | ) 5 | from .outdir import validate_output_dir 6 | from .multiplexed import validate_multiplex_fastq, initialize_multiplex_analysis 7 | from .fastq import ( 8 | validate_ont_fastq, 9 | info_given_ont_fastq, 10 | validate_illumina_fastq, 11 | info_illumina_fastq, 12 | validate_genome_estimate, 13 | estimate_genome_size, 14 | ) 15 | from .check_contamination import validate_contamination_check, fastq_contamination 16 | 17 | from .assembly import ( 18 | validate_genome_fasta, 19 | validate_genome_assembly_size, 20 | validate_assembly_info, 21 | validate_assembler, 22 | flye_ont_fastq, 23 | raven_ont_fastq, 24 | spades_illumina_fastq, 
25 | check_assembly_coverages, 26 | ) 27 | 28 | from .ont_polishing import ( 29 | validate_medaka, 30 | medaka_ont_assembly, 31 | ) 32 | 33 | from .illumina_polishing import( 34 | validate_illumina_polish, 35 | pilon_assembly, 36 | polypolish_assembly, 37 | ) 38 | 39 | from .evaluate_assembly import( 40 | validate_evaluate_assembly, 41 | check_for_small_contigs_and_fragmentation, 42 | ) 43 | 44 | from .annotations import ( 45 | validate_features, 46 | validate_blast, 47 | blast_feature_sets, 48 | ) 49 | 50 | from .plasmids import ( 51 | validate_plasmids, 52 | call_plasmids, 53 | ) 54 | 55 | from .compare_to_ref import ( 56 | validate_reference_fasta, 57 | validate_mutations, 58 | validate_quast, 59 | call_insertions, 60 | quast_genome, 61 | call_amr_mutations, 62 | ) 63 | 64 | from .visualizations import ( 65 | validate_draw_amr_matrix, 66 | validate_draw_features, 67 | validate_draw_circos, 68 | draw_features, 69 | draw_amr_matrix, 70 | draw_circos, 71 | ) 72 | 73 | from .report import ( 74 | validate_make_report, 75 | make_report, 76 | ) 77 | 78 | __all__ = [ 79 | # Validation 80 | "validate_download", 81 | "validate_organism", 82 | "validate_output_dir", 83 | "validate_multiplex_fastq", 84 | "validate_genome_estimate", 85 | "validate_ont_fastq", 86 | "validate_illumina_fastq", 87 | "validate_contamination_check", 88 | "validate_genome_fasta", 89 | "validate_genome_assembly_size", 90 | "validate_assembly_info", #check for coverages 91 | "validate_assembler", 92 | "validate_medaka", 93 | "validate_illumina_polish", 94 | "validate_evaluate_assembly", #check for contig size/# issues 95 | "validate_plasmids", 96 | "validate_features", 97 | "validate_blast", 98 | "validate_reference_fasta", 99 | "validate_quast", 100 | "validate_mutations", 101 | "validate_draw_amr_matrix", 102 | "validate_draw_features", 103 | "validate_draw_circos", 104 | "validate_make_report", 105 | # Analysis 106 | "initialize_multiplex_analysis", 107 | "estimate_genome_size", 108 | 
"info_given_ont_fastq", 109 | "info_illumina_fastq", 110 | "fastq_contamination", 111 | "flye_ont_fastq", 112 | "raven_ont_fastq", 113 | "spades_illumina_fastq", 114 | "medaka_ont_assembly", 115 | "pilon_assembly", 116 | "polypolish_assembly", 117 | "check_assembly_coverages", 118 | "check_for_small_contigs_and_fragmentation", 119 | "call_plasmids", 120 | "blast_feature_sets", 121 | "call_insertions", 122 | "quast_genome", 123 | "call_amr_mutations", 124 | "draw_features", 125 | "draw_amr_matrix", 126 | "draw_circos", 127 | "make_report", 128 | ] 129 | -------------------------------------------------------------------------------- /Pima/modules/annotations.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | import os 4 | import subprocess 5 | 6 | import pandas as pd 7 | 8 | from Pima.pima_data import PimaData 9 | from Pima.utils.settings import Settings 10 | from Pima.utils.utils import ( 11 | print_and_log, 12 | print_and_run, 13 | validate_utility, 14 | validate_file_and_size, 15 | find_checkpoint, 16 | make_start_file, 17 | make_finish_file, 18 | std_files, 19 | error_out, 20 | ) 21 | 22 | 23 | def validate_features(pima_data: PimaData, settings: Settings): 24 | # skip conditions 25 | if pima_data.only_assemble: 26 | return 27 | 28 | if pima_data.feature_fastas is None: 29 | pima_data.feature_fastas = [] 30 | 31 | if not pima_data.no_amr: 32 | pima_data.feature_fastas.append(pima_data.amr_database) 33 | pima_data.feature_colors.append(settings.amr_default_color) 34 | if validate_file_and_size(pima_data, settings.amr_gene_drug_tsv): 35 | pima_data.amr_gene_drug = pd.read_csv( 36 | settings.amr_gene_drug_tsv, 37 | index_col=None, 38 | sep="\t", 39 | quoting=csv.QUOTE_NONE, 40 | header=None, 41 | ) 42 | pima_data.drug_categories = pima_data.amr_gene_drug.iloc[:, 1].unique() 43 | 44 | if not pima_data.no_inc: 45 | pima_data.feature_fastas.append(pima_data.inc_database) 46 | 
pima_data.feature_colors.append(settings.inc_default_color) 47 | 48 | if len(pima_data.feature_fastas) == 0: 49 | return 50 | 51 | if not pima_data.will_have_genome_fasta: 52 | return 53 | 54 | print_and_log( 55 | pima_data, 56 | "Validating feature sets", 57 | pima_data.main_process_verbosity, 58 | pima_data.main_process_color, 59 | ) 60 | 61 | for feature_fasta in pima_data.feature_fastas: 62 | if not validate_file_and_size(pima_data, feature_fasta): 63 | # See if the missing database can be downloaded 64 | if feature_fasta in settings.included_databases: 65 | if not pima_data.download: 66 | pima_data.errors.append( 67 | f"Can't find feature database {feature_fasta} or it is empty. Try --download?" 68 | ) 69 | else: 70 | pima_data.errors.append(f"Can't find feature database {feature_fasta}") 71 | 72 | 73 | def validate_blast(pima_data: PimaData): 74 | # skip conditions 75 | if pima_data.only_assemble: 76 | return 77 | 78 | if len(pima_data.feature_fastas) == 0: 79 | return 80 | 81 | if not pima_data.will_have_genome_fasta: 82 | return 83 | 84 | print_and_log( 85 | pima_data, 86 | "Validating blast utilities", 87 | pima_data.main_process_verbosity, 88 | pima_data.main_process_color, 89 | ) 90 | 91 | for utility in ["makeblastdb", "blastn", "bedtools"]: 92 | if validate_utility(pima_data, utility, f"{utility} isn't on the PATH."): 93 | command = utility + " -version" 94 | pima_data.versions[utility] = re.search( 95 | r"[0-9]+\.[0-9.]+", print_and_run(pima_data, command)[0] 96 | ).group(0) 97 | pima_data.analysis.append(["blast_feature_sets", pima_data]) 98 | 99 | 100 | def blast_feature_sets(pima_data: PimaData): 101 | """Find genes within both 'amr' and 'inc' databases within the assembly 102 | 103 | Generates a dictionary of dataframes, 1 dataframe for amr and 1 for inc 104 | """ 105 | 106 | print_and_log( 107 | pima_data, 108 | "BLASTing feature sets", 109 | pima_data.main_process_verbosity, 110 | pima_data.main_process_color, 111 | ) 112 | 113 | # Keep track of 
feature hits for reporting 114 | pima_data.features_dir = os.path.join(pima_data.output_dir, "features") 115 | 116 | # Check if results already exist 117 | if find_checkpoint(pima_data, pima_data.features_dir): 118 | print_and_log( 119 | pima_data, 120 | "BLASTing features had previously been run and finished successfully", 121 | pima_data.main_process_verbosity, 122 | pima_data.main_process_color, 123 | ) 124 | pima_data.did_blast_feature_sets = True 125 | found_feature_dirs = [ 126 | feature_dir.path 127 | for feature_dir in os.scandir(pima_data.features_dir) 128 | if feature_dir.is_dir() 129 | ] 130 | for feature_dir in found_feature_dirs: 131 | feature_name = os.path.basename(feature_dir) 132 | best_bed = os.path.join(feature_dir, "best.bed") 133 | parse_blast_features(pima_data, best_bed, feature_name) 134 | return 135 | 136 | os.makedirs(pima_data.features_dir) 137 | make_start_file(pima_data, pima_data.features_dir) 138 | 139 | # Make a blast database of the genome 140 | make_blast_database(pima_data, pima_data.genome_fasta) 141 | 142 | for feature_number in range(len(pima_data.feature_fastas)): 143 | feature_fasta = pima_data.feature_fastas[feature_number] 144 | feature_name = re.sub(r"\.f.*", "", os.path.basename(feature_fasta)) 145 | feature_dir = os.path.join(pima_data.features_dir, feature_name) 146 | blast_features(pima_data, feature_fasta, feature_dir, feature_name) 147 | pima_data.feature_dirs += [feature_dir] 148 | pima_data.feature_names += [feature_name] 149 | 150 | pima_data.did_blast_feature_sets = True 151 | make_finish_file(pima_data, pima_data.features_dir) 152 | 153 | 154 | def make_blast_database(pima_data: PimaData, database_fasta: str): 155 | 156 | if os.path.isfile(f"{database_fasta}.nin"): 157 | command = " ".join( 158 | [ 159 | 'blastdbcmd -info -db', 160 | database_fasta, 161 | ] 162 | ) 163 | result = subprocess.run(command, shell=True, capture_output=True, text=True) 164 | if result.returncode == 0: 165 | return 166 | 167 | 
print_and_log( 168 | pima_data, 169 | "Making a BLAST database for " + database_fasta, 170 | pima_data.sub_process_verbosity, 171 | pima_data.sub_process_color, 172 | ) 173 | std_prefix = re.sub(r"\.[^.]*$", "", database_fasta) 174 | stdout_file, stderr_file = std_files(std_prefix) 175 | command = " ".join( 176 | [ 177 | "makeblastdb -in", 178 | database_fasta, 179 | "-dbtype nucl -parse_seqids", 180 | "1>", stdout_file, 181 | "2>", stderr_file, 182 | ] 183 | ) 184 | print_and_run(pima_data, command) 185 | 186 | 187 | def blast_features( 188 | pima_data: PimaData, feature_fasta: str, feature_dir: str, feature_name: str 189 | ): 190 | # Make a directory for the new features 191 | os.makedirs(feature_dir) 192 | 193 | # BLASTn the feature set 194 | blast_output = os.path.join(feature_dir, "blast_output.tsv") 195 | print_and_log( 196 | pima_data, 197 | "BLASTing features against the assembly", 198 | pima_data.sub_process_verbosity, 199 | pima_data.sub_process_color, 200 | ) 201 | blastn_stdout, blastn_stderr = std_files(os.path.join(feature_dir, "blastn")) 202 | command = " ".join( 203 | [ 204 | "blastn -db", 205 | pima_data.genome_fasta, 206 | "-query", 207 | feature_fasta, 208 | "-perc_identity 95.0", 209 | '-outfmt "6', 210 | 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen"', 211 | "-evalue 1e-10 -out ", 212 | blast_output, 213 | "1>", blastn_stdout, 214 | "2>", blastn_stderr, 215 | ] 216 | ) 217 | print_and_run(pima_data, command) 218 | 219 | # Clean up the results into a handy BED file 220 | print_and_log( 221 | pima_data, 222 | "Converting feature hits to BED", 223 | pima_data.sub_process_verbosity, 224 | pima_data.sub_process_color, 225 | ) 226 | all_bed = os.path.join(feature_dir, "all.bed") 227 | command = " ".join( 228 | [ 229 | "cat", 230 | blast_output, 231 | "| awk -F '\\t' '($3 >= 95) && ($4 / $14 >= .90){OFS = \"\\t\";" 232 | + 'print $2,($9 < $10 ? $9 : $10),($9 < $10 ? $10 : $9),$1,$3/100,($9 < $10 ? 
"+" : "-")}\'', 233 | "| sort -k 1,1 -k 2,2n >", 234 | all_bed, 235 | ] 236 | ) 237 | print_and_run(pima_data, command) 238 | 239 | # Make clusters of hits 240 | print_and_log( 241 | pima_data, 242 | "Clustering feature hits", 243 | pima_data.sub_process_verbosity, 244 | pima_data.sub_process_color, 245 | ) 246 | merge_bed = os.path.join(feature_dir, "merge.bed") 247 | _, merge_stderr = std_files(os.path.join(feature_dir, "bedtools_merge")) 248 | command = " ".join( 249 | ["bedtools merge -d -30 -i", all_bed, "1>", merge_bed, "2>", merge_stderr] 250 | ) 251 | print_and_run(pima_data, command) 252 | 253 | # Pick the best hit for each cluster 254 | print_and_log( 255 | pima_data, 256 | "Finding the best hit for each feature cluster", 257 | pima_data.sub_process_verbosity, 258 | pima_data.sub_process_color, 259 | ) 260 | best_bed = os.path.join(feature_dir, "best.bed") 261 | command = " ".join( 262 | [ 263 | "bedtools intersect", 264 | "-a", 265 | all_bed, 266 | "-b", 267 | merge_bed, 268 | "-f .9 -F .9 -wao", 269 | "| awk '$7 != \".\"'", 270 | '| awk \'{OFS="\\t";locus=$7"\\t"$8"\\t"$9; if($5 > s[locus]){s[locus]=$5;id = sprintf("%.3f", $5); b[locus] = $1"\\t"$2"\\t"$3"\\t"$4"\\t"id"\\t"$6}}', 271 | "END{for(i in b){print b[i]}}'", 272 | "| sort -k 1,1 -k2,2n", 273 | ">" + best_bed, 274 | ] 275 | ) 276 | print_and_run(pima_data, command) 277 | parse_blast_features(pima_data, best_bed, feature_name) 278 | 279 | 280 | def parse_blast_features(pima_data: PimaData, best_bed: str, feature_name: str): 281 | # Keep the feature hits for later drawing. 
It may be empty, i.e., no feature hits 282 | try: 283 | best = pd.read_csv(filepath_or_buffer=best_bed, sep="\t", header=None) 284 | except FileNotFoundError: 285 | best = pd.DataFrame() 286 | except pd.errors.EmptyDataError: 287 | best = pd.DataFrame() 288 | except Exception as e: 289 | error_out( 290 | pima_data, f"Unexpected exception when processing BLAST features: {e}" 291 | ) 292 | 293 | pima_data.feature_hits[feature_name] = best 294 | -------------------------------------------------------------------------------- /Pima/modules/check_contamination.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import pandas as pd 5 | 6 | from Pima.pima_data import PimaData 7 | from Pima.utils.settings import Settings 8 | 9 | from Pima.utils.utils import ( 10 | print_and_log, 11 | print_and_run, 12 | validate_utility, 13 | validate_file_and_size_or_error, 14 | make_start_file, 15 | make_finish_file, 16 | std_files, 17 | find_checkpoint, 18 | ) 19 | 20 | def validate_contamination_check(pima_data: PimaData, settings: Settings): 21 | 22 | if not pima_data.contam_check: 23 | return 24 | 25 | print_and_log( 26 | pima_data, 27 | 'Validating contamination check', 28 | pima_data.main_process_verbosity, 29 | pima_data.main_process_color, 30 | ) 31 | 32 | if not pima_data.will_have_ont_fastq and pima_data.illumina_fastq is None: 33 | pima_data.errors.append('--contamination requires a set of FASTQ reads') 34 | 35 | if validate_utility(pima_data, 'kraken2', 'kraken2 is not on the PATH (required by --contamination-check).'): 36 | command = 'kraken2 --version' 37 | pima_data.versions['kraken2'] = re.search(r'[0-9]+\.[0-9.]+', print_and_run(pima_data, command)[0]).group(0) 38 | 39 | if os.path.isdir(settings.kraken_database_default): 40 | pima_data.kraken_database = settings.kraken_database_default 41 | elif os.path.isdir(settings.DockerPathKraken): 42 | pima_data.kraken_database = settings.DockerPathKraken 43 | 
else: 44 | pima_data.errors.append("No kraken2 database detected, try and run pima with --download. Exiting now.") 45 | 46 | pima_data.analysis.append(['fastq_contamination', pima_data, settings]) 47 | 48 | 49 | def fastq_contamination(pima_data: PimaData, settings: Settings): 50 | 51 | print_and_log( 52 | pima_data, 53 | 'Running Kraken2 to check for contamination', 54 | pima_data.main_process_verbosity, 55 | pima_data.main_process_color, 56 | ) 57 | 58 | pima_data.kraken_dir = os.path.join(pima_data.output_dir, 'contamination') 59 | 60 | if find_checkpoint(pima_data, pima_data.kraken_dir): 61 | print_and_log( 62 | pima_data, 63 | 'Using existing kraken2 report', 64 | pima_data.sub_process_verbosity, 65 | pima_data.sub_process_color, 66 | ) 67 | pima_data.did_kraken_fastq = True 68 | if os.path.isdir(os.path.join(pima_data.kraken_dir, "ont")): 69 | pima_data.kraken_fracs['ONT'] = read_kraken_report(os.path.join(pima_data.kraken_dir, "ont", "kraken.report")) 70 | 71 | if os.path.isdir(os.path.join(pima_data.kraken_dir, "illumina")): 72 | pima_data.kraken_fracs['Illumina'] = read_kraken_report(os.path.join(pima_data.kraken_dir, "illumina", "kraken.report")) 73 | return 74 | 75 | os.makedirs(pima_data.kraken_dir) 76 | make_start_file(pima_data, pima_data.kraken_dir) 77 | 78 | if not (pima_data.ont_fastq is None): 79 | print_and_log( 80 | pima_data, 81 | 'Running Kraken2 on ONT data', 82 | pima_data.sub_process_verbosity, 83 | pima_data.sub_process_color, 84 | ) 85 | ont_kraken_dir = os.path.join(pima_data.kraken_dir, 'ont') 86 | pima_data.kraken_fracs['ONT'] = kraken_fastq(pima_data, settings, pima_data.ont_fastq, ont_kraken_dir) 87 | 88 | if not (pima_data.illumina_fastq is None): 89 | print_and_log( 90 | pima_data, 91 | 'Running Kraken2 on Illumina data', 92 | pima_data.sub_process_verbosity, 93 | pima_data.sub_process_color, 94 | ) 95 | illumina_kraken_dir = os.path.join(pima_data.kraken_dir, 'illumina') 96 | pima_data.kraken_fracs['Illumina'] = 
kraken_fastq(pima_data, settings, pima_data.illumina_fastq, illumina_kraken_dir) 97 | 98 | pima_data.did_kraken_fastq = True 99 | make_finish_file(pima_data, pima_data.kraken_dir) 100 | 101 | 102 | def kraken_fastq(pima_data: PimaData, settings: Settings, fastq, fastq_dir: str): 103 | 104 | os.makedirs(fastq_dir) 105 | 106 | kraken_files = [os.path.join(fastq_dir, 'kraken.' + i) for i in ['report', 'out', 'class', 'unclass']] 107 | kraken_report, kraken_out, kraken_class, kraken_unclass = kraken_files 108 | kraken_stdout, kraken_stderr = std_files(os.path.join(fastq_dir, 'kraken')) 109 | 110 | fastq_arg = fastq 111 | if isinstance(fastq, list): 112 | fastq_arg = ' '.join(fastq) 113 | 114 | command = " ".join( 115 | [ 116 | 'kraken2', 117 | '--threads', str(pima_data.threads), 118 | '--report', kraken_report, 119 | '--out', kraken_out, 120 | '--class', kraken_class, 121 | '--unclass', kraken_unclass, 122 | '--db', pima_data.kraken_database, 123 | fastq_arg, 124 | '1>', kraken_stdout, '2>', kraken_stderr, 125 | ] 126 | ) 127 | print_and_run(pima_data, command) 128 | 129 | [validate_file_and_size_or_error(pima_data, i, i + ' missing after Kraken2', i + ' file is size 0 after Kraken2', ) 130 | for i in kraken_files] 131 | 132 | # Read in the Kraken fractions and pull out the useful parts 133 | kraken_fracs = read_kraken_report(kraken_report) 134 | pima_data.files_to_clean.append([kraken_class, kraken_unclass, kraken_out]) 135 | return(kraken_fracs) 136 | 137 | def read_kraken_report(kraken_report: str): 138 | kraken_fracs = pd.read_csv(kraken_report, delimiter = '\t', header = None) 139 | kraken_fracs.index = kraken_fracs.iloc[:, 4].values 140 | kraken_fracs = kraken_fracs.loc[kraken_fracs.iloc[:, 3].str.match('[UG]1?'), :] 141 | kraken_fracs = kraken_fracs.loc[(kraken_fracs.iloc[:, 0] >= 1) | (kraken_fracs.iloc[:, 3] == 'U'), :] 142 | kraken_fracs = kraken_fracs.iloc[:, [0, 1, 3, 5]] 143 | kraken_fracs.columns = ['Fraction', 'Reads', 'Level', 'Taxa'] 144 | 
kraken_fracs['Fraction'] = (kraken_fracs['Fraction'] / 100).round(4) 145 | kraken_fracs.sort_values(by = 'Fraction', inplace = True, ascending = False) 146 | kraken_fracs['Taxa'] = kraken_fracs['Taxa'].str.lstrip() 147 | return kraken_fracs -------------------------------------------------------------------------------- /Pima/modules/download_references.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import glob 5 | 6 | from Pima.pima_data import PimaData 7 | from Pima.utils.settings import Settings 8 | 9 | from Pima.utils.utils import ( 10 | print_and_log, 11 | print_and_run, 12 | validate_file_and_size, 13 | ) 14 | 15 | def validate_download(pima_data: PimaData, settings: Settings): 16 | 17 | if not pima_data.download: 18 | return 19 | 20 | pima_data.errors = [] 21 | 22 | pima_data.verbosity = 3 23 | 24 | download_databases(pima_data, settings) 25 | # We don't need this as part of the analysis pipeline, if user tries to donwload, run the download function and then end 26 | #pima_data.analysis.append(['download_databases', pima_data, settings]) 27 | 28 | print_and_log( 29 | pima_data, 30 | "Finished validating databases. 
Re-run analysis without '--download' argument", 31 | pima_data.main_process_verbosity, 32 | pima_data.main_process_color, 33 | ) 34 | sys.exit(0) 35 | 36 | 37 | def validate_organism(pima_data: PimaData): 38 | if pima_data.only_assemble: 39 | return 40 | 41 | if not pima_data.organism and not pima_data.list_organisms: 42 | return 43 | 44 | list_of_org = [ 45 | "Bacillus_anthracis", 46 | ] 47 | 48 | if pima_data.list_organisms: 49 | print_and_log( 50 | pima_data, 51 | f"List of available reference organisms:\n{' '.join(list_of_org)}", 52 | pima_data.main_process_verbosity, 53 | pima_data.main_process_color, 54 | ) 55 | sys.exit(0) 56 | 57 | print_and_log( 58 | pima_data, 59 | 'Validating organism', 60 | pima_data.main_process_verbosity, 61 | pima_data.main_process_color, 62 | ) 63 | 64 | if pima_data.organism and pima_data.reference_fasta: 65 | pima_data.errors.append("--organism and --reference-genome are mutually exclusive") 66 | 67 | if pima_data.organism and pima_data.mutation_region_bed: 68 | pima_data.errors.append("--organism and --mutation-regions are mutually exclusive") 69 | 70 | if not pima_data.organism in list_of_org: 71 | pima_data.errors.append( 72 | f"--organism {pima_data.organism} is not available, please specify a specifc --reference-genome and --mutations-regions" 73 | f" or run PiMA without a reference" 74 | ) 75 | return 76 | 77 | if not os.path.isdir(pima_data.reference_dir): 78 | os.mkdir(pima_data.reference_dir) 79 | 80 | pima_data.organism_dir = os.path.join(pima_data.reference_dir, pima_data.organism) 81 | if not os.path.isdir(pima_data.organism_dir): 82 | os.mkdir(pima_data.organism_dir) 83 | 84 | pima_data.reference_fasta = os.path.join(pima_data.organism_dir, "genome.fasta") 85 | pima_data.mutation_region_bed = os.path.join(pima_data.organism_dir, "confirmed_amr_mutations.bed") 86 | #pima_data.mutation_regions = os.path.join(pima_data.organism_dir, "mutation_regions.bed") 87 | pima_data.organism_amr_appendices = 
def download_organism(pima_data: PimaData, organism: str):
    """Fetch and normalize the reference genome for a supported organism.

    Currently only Bacillus anthracis (Ames Ancestor) is handled: the assembly
    is pulled from NCBI, decompressed, and its verbose FASTA deflines are
    renamed to the short contig names used elsewhere in the pipeline.
    """
    print_and_log(
        pima_data,
        f"Downloading references specific for {organism}",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    if organism != "Bacillus_anthracis":
        return

    genome_temp = os.path.join(pima_data.organism_dir, "genome_temp.fasta")
    genome = os.path.join(pima_data.organism_dir, "genome.fasta")

    # Download the gzipped assembly from NCBI and decompress it in place.
    fetch_cmd = " ".join(
        [
            "wget -O",
            f"{genome_temp}.gz",
            "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/445/GCA_000008445.1_ASM844v1/GCA_000008445.1_ASM844v1_genomic.fna.gz",
            "1> /dev/null 2> /dev/null;",
            "gunzip", f"{genome_temp}.gz",
        ]
    )
    print_and_run(pima_data, fetch_cmd)

    # Map the verbose NCBI deflines onto the short names the rest of the
    # pipeline expects (NOTE(review): ">pX01"/">pX02" use a zero, not the
    # letter O, and are referenced project-wide — do not "correct" them).
    header_map = {
        ">AE017334.2 Bacillus anthracis str. 'Ames Ancestor', complete genome": ">chromosome",
        ">AE017336.2 Bacillus anthracis str. 'Ames Ancestor' plasmid pXO1, complete sequence": ">pX01",
        ">AE017335.3 Bacillus anthracis str. 'Ames Ancestor' plasmid pXO2, complete sequence": ">pX02",
    }
    with open(genome_temp, "r") as raw, open(genome, "w") as cleaned:
        for verbose_line in raw:
            for verbose_name, short_name in header_map.items():
                if verbose_name in verbose_line:
                    verbose_line = verbose_line.replace(verbose_name, short_name)
            cleaned.write(verbose_line)

    os.remove(genome_temp)
def validate_evaluate_assembly(pima_data: PimaData):
    """Queue the assembly-evaluation step when an ONT assembly is expected."""
    # Nothing to evaluate when assembly is disabled or no ONT assembly is coming.
    if pima_data.no_assembly or not pima_data.will_have_ont_assembly:
        return

    pima_data.analysis.append(['check_for_small_contigs_and_fragmentation', pima_data])
PimaData): 21 | """ 22 | Will run on any assembly required, including denovo ONT, ONT polished, and/or Illumina polished assemblies 23 | Queue up for final step after all assembly manipulation steps are complete 24 | """ 25 | print_and_log( 26 | pima_data, 27 | "Evaluating assembly", 28 | pima_data.main_process_verbosity, 29 | pima_data.main_process_color, 30 | ) 31 | genome_sizes = pima_data.genome_fasta.replace(".fasta", ".sizes") 32 | command = " ".join( 33 | ["faidx -i chromsizes", pima_data.genome_fasta, ">", genome_sizes] 34 | ) 35 | print_and_run(pima_data, command) 36 | 37 | assembly_info = pd.read_csv(genome_sizes, sep="\t", header=None) 38 | assembly_info.columns = ["contig", "length"] 39 | pima_data.contig_sizes = assembly_info 40 | 41 | # Take a look at the number of contigs, their sizes, and circularity. Warn if things don't look good 42 | if assembly_info.shape[0] > 4: 43 | warning = f"Assembly produced {assembly_info.shape[0]} contigs, more than ususally expected; assembly may be fragmented." 44 | print_and_log( 45 | pima_data, warning, pima_data.warning_verbosity, pima_data.warning_color 46 | ) 47 | pima_data.assembly_notes = pd.concat([pima_data.assembly_notes, pd.Series(warning, dtype='object')]) 48 | small_contigs = assembly_info.loc[assembly_info["length"] <= 3000, :] 49 | 50 | if small_contigs.shape[0] > 0: 51 | warning = f"Assembly produced {small_contigs.shape[0]} small contigs ({', '.join(small_contigs['contig'])}); assembly may include spurious sequences." 
def validate_ont_fastq(pima_data: PimaData, settings: Settings):
    """Validate the single-sample ONT FASTQ input and queue its info step."""
    # No ONT input supplied: nothing to validate.
    if not pima_data.ont_fastq:
        return

    # A directory here almost certainly means the user forgot '--multiplexed'.
    if os.path.isdir(pima_data.ont_fastq):
        pima_data.errors.append(f"The provided '--ont-fastq' is a directory, did you mean to include the '--multiplexed' flag?\nOffending input: {pima_data.ont_fastq}")
        return

    print_and_log(
        pima_data,
        "Validating ONT FASTQ",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    file_ok = validate_file_and_size(
        pima_data, the_file=pima_data.ont_fastq, min_size=1000
    )
    if not file_ok:
        pima_data.errors.append(f"Input ONT FASTQ file {pima_data.ont_fastq} cannot be found")

    pima_data.ont_fastq = os.path.realpath(pima_data.ont_fastq)
    pima_data.will_have_ont_fastq = True
    pima_data.analysis.append(["info_given_ont_fastq", pima_data, settings])
def validate_genome_estimate(pima_data: PimaData, settings: Settings):
    """Estimate expected genome size from single-copy gene coverage.

    Runs only when assembling from ONT reads with --genome-size 'estimate';
    all other configurations return without side effects.
    """
    # Skip when assembly is disabled or a genome FASTA is already in hand.
    if pima_data.no_assembly or pima_data.genome_fasta is not None:
        return

    # On --resume with an existing ont_assembly checkpoint, let the assembly
    # module re-initiate the assembly info instead.
    if pima_data.resume and find_checkpoint(
        pima_data, os.path.join(pima_data.output_dir, "ont_assembly")
    ):
        return

    if not pima_data.will_have_ont_fastq:
        return

    if pima_data.genome_assembly_size != "estimate":
        return

    print_and_log(
        pima_data,
        "Estimating expected genome size using median single copy gene coverages",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )
    median_cov = estimate_genome_size(pima_data, settings)
    summary = (
        f"Estimated genome size: {round(pima_data.genome_assembly_raw_size)}\t"
        f"Median coverage: {round(median_cov,1)}"
    )
    print_and_log(
        pima_data,
        summary,
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )
def info_given_ont_fastq(pima_data: PimaData, settings: Settings):
    """Collect ONT FASTQ stats, estimate genome size, and downsample if warranted."""
    info_ont_fastq(pima_data, settings)
    validate_genome_estimate(pima_data, settings)

    # Warn when the read N50 falls at or below the recommended minimum.
    if pima_data.ont_n50 <= pima_data.ont_n50_min:
        warning = f"ONT N50 ({pima_data.ont_n50}) is less than the recommended minimum ({pima_data.ont_n50_min})."
        add_warning(pima_data, warning)
        pima_data.assembly_notes = pd.concat(
            [pima_data.assembly_notes, pd.Series(warning, dtype='object')]
        )

    # Downsampling needs a target genome size to compute coverage against.
    if not pima_data.genome_assembly_size:
        print_and_log(
            pima_data,
            "Cannot downsample since --genome-size was not provided",
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )
        return

    # Only downsample when current coverage comfortably exceeds the target.
    if pima_data.genome_assembly_raw_size is not None:
        coverage_now = pima_data.ont_raw_bases / pima_data.genome_assembly_raw_size
        if coverage_now >= pima_data.assembly_coverage + 1:
            downsample_ont_fastq(pima_data)
            return

    print_and_log(
        pima_data,
        "No downsampling of reads performed",
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
def info_illumina_fastq(pima_data: PimaData) :
    """Compute mean read length, read count, and total bases for Illumina FASTQs."""
    print_and_log(
        pima_data,
        'Getting Illumina FASTQ info',
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    pima_data.illumina_length_mean = 0
    pima_data.illumina_read_count = 0
    pima_data.illumina_bases = 0

    # Gzipped inputs are streamed through gunzip instead of cat.
    opener = 'cat'
    if re.search(r'\.(gz|gzip)$', pima_data.illumina_fastq[0]):
        opener = 'gunzip -c'

    for r_fastq in pima_data.illumina_fastq:
        # awk emits: mean_length <TAB> read_count <TAB> total_bases
        command = ' '.join(
            [
                opener,
                r_fastq,
                '| awk \'{getline;s += length($1);getline;getline;}END{print s/(NR/4)"\t"(NR/4)"\t"s}\'',
            ]
        )
        stats = [float(i) for i in re.split(r'\t', print_and_run(pima_data, command)[0])]
        pima_data.illumina_length_mean += stats[0]
        pima_data.illumina_read_count += int(stats[1])
        pima_data.illumina_bases += int(stats[2])

    # Average the per-file means, then format total bases for the report
    # (format_kmg presumably renders k/M/G units — see utils).
    pima_data.illumina_length_mean /= len(pima_data.illumina_fastq)
    pima_data.illumina_bases = format_kmg(pima_data.illumina_bases, decimals = 1)
def validate_pilon(pima_data: PimaData):
    """Check pilon prerequisites and queue the pilon polishing step."""
    # Applies only when assembling, a genome is expected, Illumina reads are
    # present, and pilon was chosen as the Illumina polisher.
    if pima_data.no_assembly:
        return
    if not (pima_data.will_have_genome_fasta and pima_data.illumina_fastq):
        return
    if pima_data.illumina_polisher != "pilon":
        return

    print_and_log(
        pima_data,
        'Validating Pilon and memory arguments',
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    # Record tool versions while confirming each utility is on the PATH.
    for tool in ('minimap2', 'pilon'):
        if validate_utility(pima_data, tool, f"{tool} is not on the PATH (required by --illumina-fastq)."):
            version_output = print_and_run(pima_data, f"{tool} --version")[0]
            pima_data.versions[tool] = re.search(r'[0-9]+\.[0-9.]+', version_output).group(0)

    pima_data.analysis.append(["pilon_assembly", pima_data])
def polypolish_assembly(pima_data: PimaData):
    """Polish the current genome assembly with Illumina reads using polypolish.

    Resumes from a prior successful run when a checkpoint exists; otherwise
    maps the Illumina reads with bwa (all alignments), filters the paired BAMs
    when two read files are present, runs polypolish (careful mode below 25X
    mean coverage), and repairs contig names into the final assembly.fasta.
    """
    print_and_log(
        pima_data,
        'Running polypolish on genome assembly',
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    pima_data.illumina_polish_dir = os.path.join(pima_data.output_dir, 'illumina_polish')

    ## Check if polypolish has been run before to completion
    if find_checkpoint(pima_data, pima_data.illumina_polish_dir):
        pima_data.genome_fasta = os.path.join(pima_data.illumina_polish_dir, 'assembly.fasta')
        pima_data.load_genome()

        print_and_log(
            pima_data,
            'Polypolish had previously been run and finished successfully',
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )
        pima_data.did_polypolish_ont_assembly = True
        pima_data.files_to_clean.extend(
            list(pathlib.Path(pima_data.illumina_polish_dir).glob("*.bam"))
        )
        pima_data.files_to_clean.append(os.path.join(pima_data.illumina_polish_dir, 'polypolish.fasta'))
        return

    os.makedirs(pima_data.illumina_polish_dir)
    make_start_file(pima_data, pima_data.illumina_polish_dir)

    # Map illumina reads onto the assembly.
    # We need to do this again because polypolish needs separate bam files
    # and all alignments from bwa.
    print_and_log(
        pima_data,
        'Mapping Illumina reads to assembly',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )

    # This gets turned into 2 files by the bwa function if there are R1/R2 files provided
    polypolish_bam = os.path.join(pima_data.illumina_polish_dir, 'mapping.bam')
    bams = bwa_mem_all_aln_illumina(pima_data, pima_data.genome_fasta, pima_data.illumina_fastq, polypolish_bam)

    polypolish_cmd = "polypolish polish"
    # If mean coverage is low then give some warning & use careful mode.
    if pima_data.mean_coverage['Illumina'] < 25:
        # Fixed message (stray ')' removed).
        warning = f"Mean illumina coverage ({pima_data.mean_coverage['Illumina']}X) is below 25X, using careful mode."
        # Bug fix: add_warning is the module-level helper (as used in
        # pilon_assembly), not a PimaData method; the original
        # `pima_data.add_warning(warning)` raised AttributeError.
        add_warning(pima_data, warning)
        pima_data.assembly_notes = pd.concat([pima_data.assembly_notes, pd.Series(warning, dtype = 'object')])
        polypolish_cmd = "polypolish polish --careful"

    # Actually run polypolish
    print_and_log(
        pima_data,
        'Running Polypolish',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )

    pfilt_stdout, pfilt_stderr = std_files(os.path.join(pima_data.illumina_polish_dir, 'poly_filter'))
    polypolish_prefix = os.path.join(pima_data.illumina_polish_dir, 'polypolish')
    polished_fasta = polypolish_prefix + '.fasta'

    if len(bams) == 2:
        # Paired reads: polypolish filter drops misplaced alignments first.
        filt_bams = [re.sub(r"\.bam", r"_filt.bam", x) for x in bams]
        command = " ".join(
            [
                "polypolish filter",
                "--in1", bams[0],
                "--in2", bams[1],
                "--out1", filt_bams[0],
                "--out2", filt_bams[1],
                "1>", pfilt_stdout, "2>", pfilt_stderr,
            ]
        )
        print_and_run(pima_data, command)
        validate_file_and_size_or_error(pima_data, filt_bams[0], 'Polypolish filtered bam', 'cannot be found after filtering', 'is empty')
        _, polish_stderr = std_files(os.path.join(pima_data.illumina_polish_dir, 'polypolish'))
        command = " ".join(
            [
                polypolish_cmd,
                pima_data.genome_fasta,
                " ".join(filt_bams),
                "1>", polished_fasta,
                "2>", polish_stderr,
            ]
        )
        print_and_run(pima_data, command)
        pima_data.files_to_clean.extend(bams + filt_bams)

    else:
        # We have only a single fastq file so we can't filter the alignments.
        _, polish_stderr = std_files(os.path.join(pima_data.illumina_polish_dir, 'polypolish'))
        command = " ".join(
            [
                polypolish_cmd,
                pima_data.genome_fasta,
                " ".join(bams),
                "1>", polished_fasta,
                "2>", polish_stderr,
            ]
        )
        print_and_run(pima_data, command)
        pima_data.files_to_clean.extend(bams)
    validate_file_and_size_or_error(pima_data, polished_fasta, 'Polypolished assembly', 'cannot be found after running polypolish', 'is empty')

    print_and_log(
        pima_data,
        'Repairing contig names after polypolish',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    # NOTE(review): the awk strips "_pilon" suffixes, mirroring pilon_assembly;
    # confirm polypolish actually emits that suffix on contig names.
    pima_data.genome_fasta = os.path.join(pima_data.illumina_polish_dir, 'assembly.fasta')
    command = " ".join(
        [
            'cat',
            polished_fasta,
            '| awk \'{if($0 ~ /^>/){gsub("_pilon", "", $0)}print}\'',
            '>', pima_data.genome_fasta,
        ]
    )
    print_and_run(pima_data, command)
    validate_file_and_size_or_error(pima_data, pima_data.genome_fasta, 'Genome assembly',
                                    'cannot be found after fixing names', 'is empty')
    pima_data.files_to_clean.append(polished_fasta)
    make_finish_file(pima_data, pima_data.illumina_polish_dir)
    # Track for the report (fixed typo: was did_polypolyish_ont_assembly,
    # which left the real flag unset on a fresh run).
    pima_data.did_polypolish_ont_assembly = True
os.path.join(pima_data.output_dir, 'illumina_polish') 233 | 234 | ## Check if pilon has been run before to completion 235 | if find_checkpoint(pima_data, pima_data.illumina_polish_dir): 236 | pima_data.genome_fasta = os.path.join(pima_data.illumina_polish_dir, 'assembly.fasta') 237 | pilon_bam = os.path.join(pima_data.illumina_polish_dir, 'mapping.bam') 238 | pima_data.files_to_clean.append(pilon_bam) 239 | pima_data.load_genome() 240 | 241 | print_and_log( 242 | pima_data, 243 | 'Pilon had previously been run and finished successfully', 244 | pima_data.main_process_verbosity, 245 | pima_data.main_process_color, 246 | ) 247 | pima_data.did_pilon_ont_assembly = True 248 | return 249 | 250 | os.makedirs(pima_data.illumina_polish_dir) 251 | make_start_file(pima_data, pima_data.illumina_polish_dir) 252 | 253 | # Map illumina reads onto the assembly 254 | print_and_log( 255 | pima_data, 256 | 'Mapping Illumina reads to assembly', 257 | pima_data.sub_process_verbosity, 258 | pima_data.sub_process_color, 259 | ) 260 | pilon_bam = os.path.join(pima_data.illumina_polish_dir, 'mapping.bam') 261 | pima_data.files_to_clean.append(pilon_bam) 262 | 263 | # See what mapping method to use - bwa aln or minimap 2 264 | if pima_data.illumina_length_mean <= 50: 265 | bwa_short_illumina_fastq_and_sort(pima_data, pima_data.genome_fasta, pima_data.illumina_fastq, pilon_bam) 266 | else: # We have longer short reads 267 | minimap_and_sort(pima_data, pima_data.genome_fasta, pilon_bam, pima_data.illumina_fastq, ont=False) 268 | 269 | 270 | # Figure out the depth here. If it's too low, give some sort of warning? 
271 | coverage_tsv = os.path.join(pima_data.illumina_polish_dir, 'coverage.tsv') 272 | command = " ".join( 273 | [ 274 | 'samtools depth -a', 275 | pilon_bam, 276 | '| awk \'{s += $3; c++}END{printf "%s\\t%i\\t%.0f\\n", i, c, (s / c)}\'', 277 | '>', coverage_tsv, 278 | ] 279 | ) 280 | print_and_run(pima_data, command) 281 | validate_file_and_size_or_error(pima_data, coverage_tsv, 'Coverage TSV', 'cannot be found after samtools', 'is empty') 282 | pilon_coverage = pd.read_csv(coverage_tsv, header = None, index_col = None, sep = '\t').iloc[0,2] 283 | 284 | # If mean coverage is low then give some warning 285 | if pilon_coverage < pima_data.pilon_coverage_min: 286 | warning = f"Illumina coverage for Pilon {pilon_coverage}X is below the recommended minimum ({pima_data.pilon_coverage_min}X), we recommend using polypolish instead." 287 | add_warning(pima_data, warning) 288 | 289 | # Actually run pilon 290 | print_and_log( 291 | pima_data, 292 | 'Running Pilon', 293 | pima_data.sub_process_verbosity, 294 | pima_data.sub_process_color, 295 | ) 296 | pilon_stdout, pilon_stderr = std_files(os.path.join(pima_data.illumina_polish_dir, 'pilon')) 297 | pilon_prefix = os.path.join(pima_data.illumina_polish_dir, 'pilon') 298 | polished_fasta = pilon_prefix + '.fasta' 299 | bam_option = '--frags' 300 | if len(pima_data.illumina_fastq) == 1: 301 | bam_option = '--unpaired' 302 | 303 | command = " ".join( 304 | [ 305 | 'pilon', 306 | '-Xms4g -Xmx4g', 307 | '--genome', pima_data.genome_fasta, 308 | bam_option, pilon_bam, 309 | '--output', pilon_prefix, 310 | '1>', pilon_stdout, '2>', pilon_stderr, 311 | ] 312 | ) 313 | print_and_run(pima_data, command) 314 | validate_file_and_size_or_error(pima_data, polished_fasta, 'Pilon FASTA', 'cannot be found after pilon', 'is empty') 315 | 316 | print_and_log( 317 | pima_data, 318 | 'Repairing contig names after Pilon', 319 | pima_data.sub_process_verbosity, 320 | pima_data.sub_process_color, 321 | ) 322 | pima_data.genome_fasta = 
class barcode_data:
    """Bookkeeping for one demultiplexed barcode: its FASTQ files and sizes."""

    def __init__(self, barcode_id: str, barcode_root_path: str | list, barcode_fastq_list: list, barcode_fastq_paths: list, barcode_size_bytes: int):
        # Identity and file inventory for this barcode.
        self.barcode_id = barcode_id
        self.barcode_root_path = barcode_root_path
        self.barcode_fastq_list = barcode_fastq_list
        self.barcode_fastq_paths = barcode_fastq_paths
        self.barcode_size_bytes = barcode_size_bytes
        # Filled in later once read data are measured.
        self.barcode_size_bp = None

    def update_barcode(self, barcode_root_path: str, barcode_fastq_list: list, barcode_fastq_paths: list, barcode_size_bytes: int):
        """Merge another directory's worth of FASTQs into this barcode."""
        self.barcode_root_path = [self.barcode_root_path, barcode_root_path]
        self.barcode_fastq_list = self.barcode_fastq_list + barcode_fastq_list
        self.barcode_fastq_paths = self.barcode_fastq_paths + barcode_fastq_paths
        self.barcode_size_bytes += barcode_size_bytes

    def create_concat_fastq(self, pima_data, fastq_path:str = None):
        """Point pima_data.ont_fastq at this barcode's reads, concatenating if needed."""
        gzipped = bool(re.search(r'\.(gz|gzip)$', self.barcode_fastq_list[0]))

        if len(self.barcode_fastq_list) == 1:
            if fastq_path:
                # For nextflow multiplexing: symlink under the requested name,
                # preserving a .gz suffix when the source is compressed.
                if gzipped:
                    fastq_path = fastq_path + ".gz"
                os.symlink(self.barcode_fastq_paths[0], fastq_path)
                pima_data.ont_fastq = fastq_path
            else:
                pima_data.ont_fastq = self.barcode_fastq_paths[0]
            return

        print_and_log(
            pima_data,
            "Concatenating barcode fastq files",
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )

        if gzipped:
            pima_data.ont_fastq = pima_data.ont_fastq + ".gz"

        concat_cmd = " ".join(
            [
                "cat",
                " ".join(self.barcode_fastq_paths),
                f"> {pima_data.ont_fastq}",
            ]
        )
        print_and_run(pima_data, concat_cmd)

    def report_multiplex_sample(self):
        """Return the log line announcing this barcode's PiMA run."""
        return f"Running PiMA on {self.barcode_id}"
def validate_multiplex_fastq(pima_data: PimaData):
    """Sanity-check CLI options for a multiplexed ONT run.

    Appends human-readable problems to pima_data.errors; on success marks
    will_have_ont_fastq and normalizes ont_fastq/output_dir to real paths.
    """
    if not pima_data.multiplexed:
        return

    if pima_data.resume and pima_data.nextflow:
        pima_data.errors.append("--resume does not currently work with nextflow multiplexing. If the assemblies were completed in the previous attempt, you can resume the multiplex run in serial by removing '--nextflow' or resume each sample independently without '--multiplexed', otherwise just use '--overwrite'")

    if not pima_data.ont_fastq:
        pima_data.errors.append("--multiplexed requires that a directory of FASTQ files or directories of FASTQ files be given")
        # BUG FIX: without an input path the os.path.isfile()/realpath() calls
        # below raised TypeError instead of reporting the error cleanly.
        return

    if pima_data.illumina_fastq:
        # fixed flag name in the message: the option is --multiplexed
        pima_data.errors.append("--multiplexed does not currently work with illumina data. Exiting")

    if pima_data.barcode_min_fraction >= 1:
        pima_data.errors.append(f"--barcode_min_fraction is greater than 1, did you mean to use {pima_data.barcode_min_fraction / 100}?")

    if pima_data.genome_assembly_size is not None and pima_data.genome_assembly_size != "estimate":
        print_and_log(
            pima_data,
            f"Using the same --genome_size {pima_data.genome_assembly_size} for every sample in the multiplex run. If you do not expect all samples to have the same genome size (+/-10%), please cancel (ctrl+c) and re-run using '--genome-size estimate' or leave it blank (prevents downsampling)'",
            pima_data.warning_verbosity,
            pima_data.warning_color,
        )

    if os.path.isfile(pima_data.ont_fastq) and pima_data.barcode_kit:
        # A single not-yet-demultiplexed FASTQ is not supported; currently we
        # just error out and point the user at dorado.
        message = ("You provided a single fastq file and indicated it is multiplexed. "
                   "PiMA currently doesn't demultiplex a fastq file since this file type is not common. "
                   "If you need to demultiplex, we recommend using dorado. "
                   "Please let us know if this is a feature you'd like to see added. "
                   "Exiting now"
                   )
        pima_data.errors.append(message)

    pima_data.will_have_ont_fastq = True
    pima_data.ont_fastq = os.path.realpath(pima_data.ont_fastq)
    pima_data.output_dir = os.path.realpath(pima_data.output_dir)
def identify_multiplexed_fastq_files(pima_data: PimaData):
    """Discover per-barcode FASTQ inputs and record them in pima_data.barcodes.

    Two layouts are supported: a flat directory of per-sample FASTQ files, or
    a basecaller-style tree of per-barcode directories (e.g. fastq_pass/barcode01).
    Barcodes holding less than barcode_min_fraction of the total bytes are dropped.
    """
    print_and_log(
        pima_data,
        "Starting Multiplex Analysis",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    discovered = {}
    total_bytes = 0

    entries = os.listdir(pima_data.ont_fastq)
    flat_layout = (
        not any(os.path.isdir(os.path.join(pima_data.ont_fastq, e)) for e in entries)
        and any(re.search(r"fastq", e) for e in entries)
    )

    if flat_layout:
        # One FASTQ file per sample, directly in the input directory
        for fastq in entries:
            if re.search(r"\.(gz|gzip)$", fastq):
                # strip both the .gz and the .fastq extensions
                sample = os.path.splitext(os.path.splitext(fastq)[0])[0]
            else:
                sample = os.path.splitext(os.path.basename(fastq))[0]
            nbytes = os.path.getsize(os.path.join(pima_data.ont_fastq, fastq))
            discovered[sample] = barcode_data(
                barcode_id=sample,
                barcode_root_path=pima_data.ont_fastq,
                barcode_fastq_list=[fastq],
                barcode_fastq_paths=[os.path.join(pima_data.ont_fastq, fastq)],
                barcode_size_bytes=nbytes,
            )
            total_bytes += nbytes
    else:
        # Walk the tree; pass/fail dirs for the same barcode name get merged
        for root, dirs, files in os.walk(pima_data.ont_fastq):
            if any(re.search(r"fail", d) for d in dirs):
                print_and_log(
                    pima_data,
                    "There are directories with the string 'fail' in the name. We will use both passing and failing reads for each sample. If you wish to use only passing reads, please cancel (ctrl+c) and re-run giving just the fastq_pass directory as input",
                    pima_data.warning_verbosity,
                    pima_data.warning_color,
                )

            if not files:
                continue
            # only the first file is inspected — assumes homogeneous dirs
            if not re.search(r"fastq", files[0]) or re.search(r"unclassified", root):
                continue

            sample = os.path.basename(root)
            nbytes = sum(os.path.getsize(os.path.join(root, f)) for f in files)
            paths = [os.path.join(root, f) for f in files]
            if sample in discovered:
                discovered[sample].update_barcode(root, files, paths, nbytes)
            else:
                discovered[sample] = barcode_data(
                    barcode_id=sample,
                    barcode_root_path=root,
                    barcode_fastq_list=files,
                    barcode_fastq_paths=paths,
                    barcode_size_bytes=nbytes,
                )
            total_bytes += nbytes

    # Drop barcodes holding too small a share of the data
    ignored_barcodes = {}
    for bc in list(discovered.values()):
        share = bc.barcode_size_bytes / total_bytes
        if share < pima_data.barcode_min_fraction:
            ignored_barcodes[bc.barcode_id] = share
            del discovered[bc.barcode_id]

    print_and_log(
        pima_data,
        f"Running PiMA on barcodes: {', '.join(discovered.keys())}",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    if ignored_barcodes:
        message = (
            "The following barcodes were found in the input directory but were NOT analyzed "
            f"because they contained less than {pima_data.barcode_min_fraction*100}% (default=0.025 [2.5%]) of the fastq data:\n"
            "If you need to change the min_fraction, please re-run pima with the following flag '--barcode_min_fraction '\n"
        )
        for sample, share in ignored_barcodes.items():
            message += f"{sample:<15} {share:>.1%}\n"
        print_and_log(
            pima_data,
            message,
            pima_data.warning_verbosity,
            pima_data.warning_color,
        )

    pima_data.barcodes = discovered
def initialize_multiplex_analysis(pima_data: PimaData, settings: Settings):
    """Dispatch one PiMA run per barcode, via Nextflow or serially in-process.

    Fills pima_data.barcodes via identify_multiplexed_fastq_files, then either
    writes a Nextflow sample sheet and hands off, or loops over the barcodes.
    """
    identify_multiplexed_fastq_files(pima_data)

    if pima_data.nextflow or isinstance(pima_data.nextflow, str):
        validate_nextflow(pima_data)

        # NOTE(review): passing modules/environments through to Nextflow on the
        # cluster is fragile, so nextflow.config is generated from a template
        # using the conda environment PiMA itself is running in.
        try:
            conda_env = os.environ['CONDA_PREFIX']
        except KeyError:
            conda_env = "None"

        # Use the activate script belonging to the parent conda install
        try:
            activate_sh = os.environ['CONDA_EXE'].replace(r"/conda", "/activate")
        except KeyError:
            activate_sh = "None"

        nextflow_dir = os.path.join(settings.pima_path, "nextflow_parallelization")
        nextflow_config_template = os.path.join(nextflow_dir, "nextflow.config.template")
        user_nextflow_config = os.path.join(nextflow_dir, "nextflow.config")
        find_replace = {
            "conda = None": f"conda = '{conda_env}'",
            "beforeScript = None": f"beforeScript = 'source {activate_sh}'",
        }
        with open(nextflow_config_template, "rt") as fin:
            with open(user_nextflow_config, "wt") as fout:
                for line in fin:
                    for key in find_replace:
                        if key in line:
                            line = line.replace(key, find_replace[key])
                    fout.write(line)

        if isinstance(pima_data.nextflow, str):
            # strip quoting so the args can be spliced into the command line
            nextflow_args = pima_data.nextflow.replace("'", "").replace('"', '')
        else:
            nextflow_args = ""

        print_and_log(
            pima_data,
            "Handing off multiplexing to Nextflow",
            pima_data.main_process_verbosity,
            pima_data.main_process_color,
        )

        stop_logging(pima_data, "Sample specific logs are found in their respective directories, closing multiplex log now.")

        nf_file = os.path.join(pima_data.output_dir, "nf_singplex_inputs.csv")

        # The stripped singleplex command is identical for every barcode;
        # build it once instead of once per loop iteration.
        updated_cmd = strip_pima_cmd(pima_data, pima_data.run_command)
        with open(nf_file, "w") as nf_handle:
            for barcode in pima_data.barcodes.keys():
                barcode_pima_data = copy.deepcopy(pima_data)
                barcode_pima_data.output_dir = os.path.join(pima_data.output_dir, barcode)
                barcode_pima_data.ont_fastq = os.path.join(pima_data.output_dir, f"{barcode}.fastq")
                barcode_pima_data.barcodes[barcode].create_concat_fastq(barcode_pima_data, barcode_pima_data.ont_fastq)
                nf_handle.write(
                    f"{barcode},"
                    f"{barcode_pima_data.output_dir},"
                    f"{barcode_pima_data.ont_fastq},"
                    f"{updated_cmd}\n"
                )

        # Keep the nextflow workdir inside the pima output dir so a
        # successful run can be cleaned up wholesale afterwards.
        nextflow_stdout, nextflow_stderr = std_files(f"{pima_data.output_dir}/nextflow")
        command = " ".join(
            [
                "nextflow run",
                os.path.join(settings.pima_path, "nextflow_parallelization/main.nf"),
                "--sample_sheet",
                nf_file,
                "--output",
                pima_data.output_dir,
                "-w",
                os.path.join(pima_data.output_dir, "work"),
                nextflow_args,
                "1>",
                nextflow_stdout,
                "2>",
                nextflow_stderr,
            ]
        )
        print_and_run(pima_data, command, change_exe_dir = pima_data.output_dir)
        cleanup_nextflow(pima_data)

    # not using nextflow, running pima in serial
    else:
        stop_logging(pima_data, "Sample specific logs are found in their respective directories, closing multiplex log now.")
        for barcode in pima_data.barcodes.keys():
            barcode_pima_data = copy.deepcopy(pima_data)
            barcode_pima_data.output_dir = os.path.join(barcode_pima_data.output_dir, barcode)
            barcode_pima_data.logging_file = os.path.join(barcode_pima_data.output_dir, "pima.log")
            barcode_pima_data.ont_fastq = os.path.join(barcode_pima_data.output_dir, f"{barcode}.fastq")
            barcode_pima_data.multiplexed = None
            log_message = [("main", f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', f"{barcode_pima_data.barcodes[barcode].report_multiplex_sample()}")]
            modules.validate_output_dir(barcode_pima_data, settings, log_message)
            barcode_pima_data.barcodes[barcode].create_concat_fastq(barcode_pima_data)
            # FIX: dropped an unused `copy.deepcopy(settings)` local here; the
            # shared settings object is what actually gets passed through.
            Pima.pima.run_workflow(barcode_pima_data, settings)
def strip_pima_cmd(pima_data, system_args: str) -> str:
    """Rebuild the pima command line for a single-barcode (singleplex) run.

    Drops --multiplexed/--nextflow and the per-run --output/--ont-fastq/
    --threads flag+value pairs (nextflow supplies those), and absolutizes the
    --reference-genome/--mutation-regions paths so the command still works
    from the nextflow work directory.
    """
    #TODO: How should Illumina data be handled for a multiplexed run???
    # - Just need to add a sample_sheet mode

    params_to_change = ['--output', '--ont-fastq', '--threads']
    params_to_remove = ['--multiplexed', '--nextflow']
    if isinstance(pima_data.nextflow, str):
        # BUG FIX: the user-supplied nextflow argument string is plain text,
        # not a regex; re.sub() misbehaved on metacharacters such as '(' or '.'.
        system_args = system_args.replace(pima_data.nextflow, "")
    params_to_fix_path = ['--reference-genome', '--mutation-regions']

    param_iter = iter(system_args.split(" "))
    new_cmd = []
    for param in param_iter:
        if param in params_to_change:
            # drop the flag and its value; default guards against a trailing
            # flag with no value (previously raised StopIteration)
            next(param_iter, None)
        elif param in params_to_fix_path:
            new_cmd.append(param)
            value = next(param_iter, None)
            if value is not None:
                new_cmd.append(os.path.realpath(value))
        elif param in params_to_remove:
            continue
        else:
            new_cmd.append(param)
    return " ".join(new_cmd)
def cleanup_nextflow(pima_data):
    """Delete nextflow bookkeeping (.nextflow*) and the staging work dir.

    Called after a successful hand-off so only the per-barcode results
    remain in the output directory.
    """
    nf_temp_files = glob.glob(os.path.join(pima_data.output_dir, ".nextflow*"))
    nf_work_dir = os.path.join(pima_data.output_dir, "work")
    for entry in nf_temp_files:
        try:
            shutil.rmtree(entry)
        except NotADirectoryError:
            # plain files such as .nextflow.log
            os.remove(entry)

    # BUG FIX: the work dir may legitimately be absent (nothing staged, or a
    # previous cleanup already ran); an unconditional rmtree raised
    # FileNotFoundError in that case.
    if os.path.isdir(nf_work_dir):
        shutil.rmtree(nf_work_dir)
def medaka_ont_assembly(pima_data: PimaData):
    """Polish the draft ONT assembly with medaka and normalize contig names."""

    print_and_log(
        pima_data,
        "Running Medaka on ONT assembly",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    medaka_dir = os.path.join(pima_data.output_dir, "medaka")

    # Resume support: a finished checkpoint means the polished assembly is
    # already on disk.
    if find_checkpoint(pima_data, medaka_dir):
        print_and_log(
            pima_data,
            "Medaka had previously been run and finished successfully",
            pima_data.main_process_verbosity,
            pima_data.main_process_color,
        )
        pima_data.genome_fasta = os.path.join(medaka_dir, "assembly.fasta")
        pima_data.did_medaka_ont_assembly = True
        return

    os.makedirs(medaka_dir)
    make_start_file(pima_data, medaka_dir)

    print_and_log(
        pima_data,
        f"Starting medaka using model: '{pima_data.ont_model}'",
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )

    # Medaka throttles anything more than 2 threads due to poor scaling
    worker_threads = min(pima_data.threads, 2)
    consensus_fasta = os.path.join(medaka_dir, "consensus.fasta")
    medaka_stdout, medaka_stderr = std_files(os.path.join(medaka_dir, "medaka"))
    consensus_cmd = " ".join(
        [
            "medaka_consensus",
            "-i", pima_data.ont_fastq,
            "-d", pima_data.genome_fasta,
            "-o", medaka_dir,
            "-t", str(worker_threads),
            # -b 50 is much more efficient than the default -b 100 on the
            # cluster (~10x faster, ~1/10th the RAM)
            "-b 50",
            "-m", pima_data.ont_model,
            "1>", medaka_stdout,
            "2>", medaka_stderr,
        ]
    )
    print_and_run(pima_data, consensus_cmd)
    validate_file_and_size_or_error(
        pima_data,
        consensus_fasta,
        "Medaka FASTA",
        "cannot be found after Medaka",
        "is empty",
    )

    draft_bam = os.path.join(medaka_dir, "calls_to_draft.bam")

    print_and_log(
        pima_data,
        "Repairing contig names after Medaka",
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    pima_data.genome_fasta = os.path.join(medaka_dir, "assembly.fasta")
    # strip medaka's ":..." header suffixes and shorten "_segment" to "_"
    rename_cmd = " ".join(
        [
            'awk \'{if($0 ~ /^>/){gsub(":.*", "", $0);gsub("_segment", "_", $0)}print}\'',
            consensus_fasta,
            ">",
            pima_data.genome_fasta,
        ]
    )
    print_and_run(pima_data, rename_cmd)
    validate_file_and_size_or_error(
        pima_data,
        pima_data.genome_fasta,
        "Genome assembly",
        "cannot be found after fixing names",
        "is empty",
    )

    pima_data.load_genome()
    make_finish_file(pima_data, medaka_dir)
    pima_data.did_medaka_ont_assembly = True

    # Large intermediates we no longer need once polishing is done
    pima_data.files_to_clean.extend([draft_bam, draft_bam + ".bai"])
    pima_data.files_to_clean.append(os.path.join(medaka_dir, "consensus_probs.hdf"))
def determine_ont_model(pima_data: PimaData):
    """Ask medaka to resolve the basecalling model from the reads themselves.

    No-op when the model is already known (e.g. resolved during the
    info_fastq assessment step).  On success the resolved model name is
    stored and medaka polishing is scheduled; on failure a warning is
    recorded and polishing is skipped.
    """
    if pima_data.ont_model != "auto":
        return

    resolve_cmd = " ".join(
        [
            "medaka tools resolve_model --auto_model consensus",
            pima_data.ont_fastq,
        ]
    )
    proc = subprocess.run(resolve_cmd, shell=True, capture_output=True, text=True)

    if proc.returncode == 0:
        pima_data.ont_model = proc.stdout.strip()
        pima_data.analysis.append(["medaka_ont_assembly", pima_data])
        print_and_log(
            pima_data,
            f"Identified basecalling model: {pima_data.ont_model}",
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )
    else:
        warn = "Medaka could not determine the basecalling model used to generate the fastq files and '--ont-model' was not provided, continuing PiMA without medaka polishing."
        add_warning(pima_data, warn)
        pima_data.assembly_notes = pd.concat([pima_data.assembly_notes, pd.Series(warn, dtype='object')])
def validate_output_dir(pima_data: PimaData, settings: Settings, log_messages: list = None):
    """Validate the requested output directory and create it if appropriate.

    Queues startup log lines (the log file may not exist yet), records
    option conflicts in pima_data.errors, and otherwise hands off to
    make_outdir.
    """
    # BUG FIX: the old default `log_messages: list = []` was a shared mutable
    # default, so repeated calls (one per barcode in a multiplexed run)
    # accumulated every previous call's messages.  Use the None sentinel.
    if log_messages is None:
        log_messages = []

    log_messages.append(("main", f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', f"PiMA version: {settings.pima_version}"))
    log_messages.append(("main", f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]', "Validating output dir"))

    if not pima_data.output_dir:
        pima_data.errors += ["No output directory given (--output)"]
    elif pima_data.overwrite and pima_data.resume:
        pima_data.errors += ["--overwrite and --resume are mutually exclusive"]
    elif os.path.exists(pima_data.output_dir) and not (
        pima_data.overwrite or pima_data.resume
    ):
        pima_data.errors += [
            "Output directory "
            + pima_data.output_dir
            + " already exists. Add --overwrite OR --resume to ignore"
        ]
    else:
        pima_data.output_dir = os.path.realpath(pima_data.output_dir)
        make_outdir(pima_data, log_messages)
def report_logs(pima_data, log_messages):
    # Replay messages that were queued before the log file existed.  Each
    # entry is either a bare string (logged at main level) or a
    # (level, timestamp, text) tuple with level "main" or "warn".
    for message in log_messages:
        if isinstance(message, str):
            print_and_log(
                pima_data,
                message,
                pima_data.main_process_verbosity,
                pima_data.main_process_color,
            )
        elif message[0] == "main":
            print_and_log(
                pima_data,
                message[2],
                pima_data.main_process_verbosity,
                pima_data.main_process_color,
                message[1],
            )
        elif message[0] == "warn":
            print_and_log(
                pima_data,
                message[2],
                pima_data.warning_verbosity,
                pima_data.warning_color,
                message[1],
            )
def call_plasmids(pima_data: PimaData, settings: Settings):
    """Search small contigs against the plasmid database and summarize hits.

    Pipeline: drop contigs > 500 kb, map the plasmid database against the
    remaining contigs with minimap2, convert SAM -> PSL, then run pChunks.R
    to produce plasmids.tsv.  Results land in pima_data.plasmids (DataFrame
    or None) and pima_data.plasmid_tsv.
    """
    print_and_log(
        pima_data,
        'Calling plasmids',
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    # Fall back to the database location baked into the Docker image
    if not validate_file_and_size(pima_data, pima_data.plasmid_database) and validate_file_and_size(pima_data, settings.DockerPathPlasmid):
        pima_data.plasmid_database = settings.DockerPathPlasmid

    # Make a directory for plasmid stuff
    pima_data.plasmid_dir = os.path.join(pima_data.output_dir, 'plasmids')
    if find_checkpoint(pima_data, pima_data.plasmid_dir):
        return
    os.makedirs(pima_data.plasmid_dir)
    make_start_file(pima_data, pima_data.plasmid_dir)

    # Take very large things out of the assembly. They aren't plasmids and
    # take a long time to run
    print_and_log(
        pima_data,
        'Finding contigs < 500000 bp',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    smaller_contigs_fasta = os.path.join(pima_data.plasmid_dir, 'small_contigs.fasta')
    # NOTE(review): '-n1' is passed to parallel twice; harmless, but looks
    # like a typo worth confirming against the original intent.
    command = " ".join(
        [
            'faidx -i chromsizes',
            pima_data.genome_fasta,
            '| awk \'($2 <= 500000){print $1}\'',
            '| parallel -n1 -n1 faidx', pima_data.genome_fasta, '>', smaller_contigs_fasta,
        ]
    )
    print_and_run(pima_data, command)

    # See if there is anything in the small contigs file; if not, we done
    # TODO - Add something to the report about no small contigs
    small_contigs = pima_data.load_fasta(smaller_contigs_fasta)
    if len(small_contigs) == 0:
        print_and_log(
            pima_data,
            'No contigs smaller than 500kb found, skipping plasmid search',
            pima_data.sub_process_verbosity,
            pima_data.sub_process_color,
        )
        pima_data.did_call_plasmids = True
        pima_data.plasmids = None
        make_finish_file(pima_data, pima_data.plasmid_dir)
        return

    # Query plasmid sequences against the assembly using minimap2
    print_and_log(
        pima_data,
        'Running minimap2 against the plasmid database',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    plasmid_sam = os.path.join(pima_data.plasmid_dir, 'plasmid_hits.sam')
    _, minimap_stderr = std_files(os.path.join(pima_data.plasmid_dir, 'minimap'))
    command = " ".join(
        [
            'minimap2',
            '-k 20 -p .2 -a',
            '-t', str(pima_data.threads),
            smaller_contigs_fasta,
            pima_data.plasmid_database,
            '1>', plasmid_sam,
            '2>', minimap_stderr,
        ]
    )
    print_and_run(pima_data, command)
    validate_file_and_size_or_error(pima_data, plasmid_sam, 'Plasmid v. contig SAM', 'cannot be found', 'is empty')

    pima_data.files_to_clean.append(plasmid_sam)

    # Turn the SAM file in to a PSL file using the modified sam2psl script
    print_and_log(
        pima_data,
        'Converting the SAM file to a PSL file',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    plasmid_psl = os.path.join(pima_data.plasmid_dir, 'plasmid_hits.psl')
    sam2psl_stdout, sam2psl_stderr = std_files(os.path.join(pima_data.plasmid_dir, 'sam2psl'))
    path2sam2psl = os.path.join(settings.pima_path, "Pima", "accessory_scripts", "sam2psl.py")
    command = " ".join(
        [
            'python3',
            path2sam2psl,
            '-i', plasmid_sam,
            '-o', plasmid_psl,
            '1>', sam2psl_stdout, '2>', sam2psl_stderr,
        ]
    )
    print_and_run(pima_data, command)
    # BUG FIX: previously re-validated plasmid_sam here; the file we just
    # produced (and whose label this check carries) is the PSL.
    validate_file_and_size_or_error(pima_data, plasmid_psl, 'Plasmid v. contig PSL', 'cannot be found', 'is empty')

    # Make a BLAST database of the plasmid sequences
    make_blast_database(pima_data, pima_data.plasmid_database)

    # Pass the data onto pChunks
    print_and_log(
        pima_data,
        'Running pChunks',
        pima_data.sub_process_verbosity,
        pima_data.sub_process_color,
    )
    pima_data.pchunks_dir = os.path.join(pima_data.plasmid_dir, 'pChunks')

    if find_checkpoint(pima_data, pima_data.pchunks_dir):
        pima_data.did_call_plasmids = True
        new_plasmid_tsv = os.path.join(pima_data.plasmid_dir, 'plasmids.tsv')
        pima_data.plasmid_tsv = new_plasmid_tsv
        try:
            pima_data.plasmids = pd.read_csv(filepath_or_buffer = pima_data.plasmid_tsv, sep = '\t', header = 0)
        except Exception:
            # no (or unreadable) table means no plasmids were called
            pima_data.plasmids = None
        return

    os.makedirs(pima_data.pchunks_dir)

    pima_data.plasmid_tsv = os.path.join(pima_data.pchunks_dir, 'plasmids.tsv')
    stdout_file, stderr_file = std_files(os.path.join(pima_data.pchunks_dir, "pchunks"))
    path2pChunks = os.path.join(settings.pima_path, "Pima", "accessory_scripts", "pChunks.R")
    command = " ".join(
        [
            'Rscript',
            path2pChunks, '--plasmid-psl', plasmid_psl,
            '--output', pima_data.pchunks_dir,
            '--no-amr', '--no-inc',
            '--plasmid-database', pima_data.plasmid_database,
            '--threads', str(pima_data.threads),
            '1>', stdout_file, '2>', stderr_file,
        ]
    )
    print_and_run(pima_data, command)
    pima_data.plasmid_tsv = os.readlink(os.path.join(pima_data.pchunks_dir, 'plasmids.tsv'))
    validate_file_and_size_or_error(pima_data, pima_data.plasmid_tsv, 'Plasmid output table', 'cannot be found', 'is empty')

    # The final file is in pChunks; copy it up to the plasmids dir
    new_plasmid_tsv = os.path.join(pima_data.plasmid_dir, 'plasmids.tsv')
    shutil.copy2(pima_data.plasmid_tsv, new_plasmid_tsv)
    pima_data.plasmid_tsv = new_plasmid_tsv

    try:
        pima_data.plasmids = pd.read_csv(filepath_or_buffer = pima_data.plasmid_tsv, sep = '\t', header = 0)
    except Exception:
        pima_data.plasmids = None

    pima_data.did_call_plasmids = True
    make_finish_file(pima_data, pima_data.plasmid_dir)
def make_report(pima_data: PimaData, settings: Settings):
    """Render the markdown report, splice in AMR appendices, and build the PDF."""

    print_and_log(
        pima_data,
        "Making report",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )
    pima_data.report_dir = os.path.join(pima_data.output_dir, 'report')

    # Always regenerate the report in case downstream steps have changed the
    # results.  BUG FIX: removing any existing directory (checkpointed or
    # not) also prevents os.mkdir raising FileExistsError when a previous
    # report attempt died partway through.
    if os.path.isdir(pima_data.report_dir):
        shutil.rmtree(pima_data.report_dir)
    os.mkdir(pima_data.report_dir)

    pima_data.report_prefix = os.path.join(pima_data.report_dir, 'report')
    pima_data.report_md = pima_data.report_prefix + '.md'

    pima_data.markdown_report = PimaReport(pima_data, settings)
    pima_data.markdown_report.make_report()

    ## Add appendices to the pima report
    if len(pima_data.markdown_report.appendices) > 0:
        # assign each new AMR class its own appendix ID (A-Z)
        for i, appendix in zip(string.ascii_uppercase, pima_data.markdown_report.appendices):
            png = re.sub(r"\.md", ".png", appendix)
            mod_appendix = os.path.join(pima_data.report_dir, os.path.basename(appendix))
            shutil.copyfile(png, os.path.join(pima_data.report_dir, os.path.basename(png)))
            with open(appendix, "rt") as fin:
                with open(mod_appendix, "wt") as fout:
                    for line in fin:
                        # the templates carry a LETTER placeholder for the ID
                        fout.write(line.replace("LETTER", i))

            # translate the appendix into the markdown report
            with open(pima_data.report_md, "a") as file:
                with open(mod_appendix, "r") as temp_file:
                    file.write(temp_file.read())

    pima_data.report_pdf = pima_data.report_prefix + '.pdf'
    validate_file_and_size_or_error(pima_data, pima_data.report_md, 'Report MD', 'cannot be found', 'is empty')

    tectonic_stdout, tectonic_stderr = std_files(os.path.join(pima_data.report_dir, 'markdown2pdf'))
    command = ' '.join(
        [
            'pandoc -f gfm',
            pima_data.report_md,
            '-o', pima_data.report_pdf,
            '--pdf-engine=weasyprint',
            '--css ' + settings.pima_css,
            '1>', tectonic_stdout,
            '2>', tectonic_stderr,
        ]
    )
    print_and_run(pima_data, command, change_exe_dir=pima_data.report_dir)
    validate_file_and_size_or_error(pima_data, pima_data.report_pdf, 'Report PDF', 'cannot be found', 'is empty')
-------------------------------------------------------------------------------- 1 | process COPY_RESULTS { 2 | input: 3 | path(sample_outdir) 4 | val(workflow_outdir) 5 | 6 | output: 7 | stdout 8 | 9 | script: 10 | """ 11 | #move the pima results folder to the original output location 12 | cp -Lr $sample_outdir $workflow_outdir/ 13 | """ 14 | } -------------------------------------------------------------------------------- /Pima/nextflow_parallelization/modules/pima_singleplex.nf: -------------------------------------------------------------------------------- 1 | process PIMA_SINGLEPLEX { 2 | tag "$sample" 3 | 4 | input: 5 | // [sample, output, ont_fastq, pima_cmd] 6 | tuple val(sample), val(output), path(ont_fastq), val(pima_cmd) 7 | 8 | output: 9 | path(sample), emit: output_directory 10 | path("$sample/report/report.pdf") 11 | 12 | script: 13 | """ 14 | python3 $pima_cmd --ont-fastq $ont_fastq --output $sample --threads ${task.cpus} 15 | """ 16 | } -------------------------------------------------------------------------------- /Pima/nextflow_parallelization/nextflow.config.template: -------------------------------------------------------------------------------- 1 | conda.enabled = true 2 | conda.useMamba = true 3 | 4 | params { 5 | config_profile_description = 'Rosalind HPC @ CDC' 6 | config_profile_contact = 'OAMD' 7 | config_profile_url = 'https://info.biotech.cdc.gov/info/' 8 | custom_config_version = 'master' 9 | 10 | // Default resource parameters. Expecting to be overwritten. 11 | max_memory = '128.GB' 12 | max_cpus = 16 13 | max_time = '240.h' 14 | } 15 | 16 | 17 | executor { 18 | name = 'sge' 19 | pollInterval = '10sec' 20 | submitRateLimit = '2sec' 21 | queueSize = 24 22 | } 23 | 24 | process { 25 | executor = 'sge' 26 | penv = 'smp' 27 | queue = 'all.q' 28 | beforeScript = None 29 | conda = None 30 | errorStrategy = { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } 31 | maxRetries = 3 32 | maxErrors = '-1' 33 | // Set h_vmem option for qsub submissions. +6 memory to h_vmem prevents memory allocation errors. 34 | clusterOptions = { "-l h_vmem=${(check_max((task.memory.toGiga())+6), 'memory').toString().replaceAll(/[\sB]/,'')}G" } 35 | cpus = { check_max( 6 * task.attempt, 'cpus' ) } 36 | memory = { check_max( 36.GB * task.attempt, 'memory' ) } 37 | time = { check_max( 8.h * task.attempt, 'time' ) } 38 | } 39 | 40 | def check_max(obj, type) { 41 | if (type == 'memory') { 42 | try { 43 | if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) 44 | return params.max_memory as nextflow.util.MemoryUnit 45 | else 46 | return obj 47 | } catch (all) { 48 | println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" 49 | return obj 50 | } 51 | } else if (type == 'time') { 52 | try { 53 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) 54 | return params.max_time as nextflow.util.Duration 55 | else 56 | return obj 57 | } catch (all) { 58 | println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" 59 | return obj 60 | } 61 | } else if (type == 'cpus') { 62 | try { 63 | return Math.min( obj, params.max_cpus as int ) 64 | } catch (all) { 65 | println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" 66 | return obj 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /Pima/pima.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import annotations # for "|" in docstrings python 3.7-3.9 3 | import os 4 | import sys 5 | import matplotlib as mpl 6 | 7 | from Pima.pima_data import PimaData 8 | from Pima.utils.utils import print_and_log, stop_logging, clean_up 9 | from Pima.utils.settings import Settings 10 | from Pima.utils.cli import parse_args 11 | from Pima import modules 12 | 13 | mpl.use("Agg") 14 | 15 | def run_prechecks(pima_data: PimaData, settings: Settings, pima_cmdtext: str): 16 | print_and_log( 17 | pima_data, 18 | "STARTING VALIDATION STEPS", 19 | pima_data.main_process_verbosity, 20 | pima_data.main_process_color, 21 | ) 22 | modules.validate_download(pima_data, settings) 23 | modules.validate_output_dir(pima_data, settings, pima_cmdtext) 24 | modules.validate_organism(pima_data) 25 | modules.validate_multiplex_fastq(pima_data) 26 | 27 | if len(pima_data.errors) > 0: 28 | print_and_log( 29 | pima_data, 30 | "Errors were found during validation.", 31 | pima_data.fail_verbosity, 32 | pima_data.error_color, 33 | ) 34 | for error in pima_data.errors: 35 | print_and_log( 36 | pima_data, 37 | error, 38 | pima_data.fail_verbosity, 39 | pima_data.error_color, 40 | ) 41 | print_and_log( 42 | pima_data, 43 | "Aborting.", 44 | pima_data.fail_verbosity, 45 | pima_data.error_color, 46 | ) 47 | sys.exit(1) 48 | 49 | def run_validation(pima_data: PimaData, settings: Settings): 50 | modules.validate_ont_fastq(pima_data, settings) 51 | modules.validate_illumina_fastq(pima_data) 52 | modules.validate_genome_fasta(pima_data) 53 | modules.validate_genome_assembly_size(pima_data) 54 | modules.validate_contamination_check(pima_data, settings) 55 | modules.validate_assembler(pima_data) 56 | 
modules.validate_assembly_info(pima_data) 57 | modules.validate_medaka(pima_data) 58 | modules.validate_illumina_polish(pima_data) 59 | modules.validate_evaluate_assembly(pima_data) 60 | modules.validate_plasmids(pima_data, settings) 61 | modules.validate_features(pima_data, settings) 62 | modules.validate_blast(pima_data) 63 | modules.validate_reference_fasta(pima_data) 64 | modules.validate_quast(pima_data) 65 | modules.validate_mutations(pima_data) 66 | modules.validate_draw_features(pima_data) 67 | modules.validate_draw_amr_matrix(pima_data) 68 | modules.validate_draw_circos(pima_data, settings) 69 | modules.validate_make_report(pima_data, settings) 70 | 71 | if len(pima_data.errors) > 0: 72 | print_and_log( 73 | pima_data, 74 | "Errors were found during validation.", 75 | pima_data.fail_verbosity, 76 | pima_data.error_color, 77 | ) 78 | for error in pima_data.errors: 79 | print_and_log( 80 | pima_data, 81 | error, 82 | pima_data.fail_verbosity, 83 | pima_data.error_color, 84 | ) 85 | print_and_log( 86 | pima_data, 87 | "Aborting.", 88 | pima_data.fail_verbosity, 89 | pima_data.error_color, 90 | ) 91 | sys.exit(1) 92 | 93 | #log all the versions 94 | version_log = "Utility versions:" 95 | for utility, version in pima_data.versions.items(): 96 | version_log = version_log + "\n" + "{:<25} {:<10}".format(utility, version) 97 | print_and_log( 98 | pima_data, 99 | version_log, 100 | pima_data.main_process_verbosity, 101 | pima_data.main_process_color, 102 | ) 103 | 104 | def define_workflow(pima_data: PimaData): 105 | """ 106 | Step through modules and run pima 107 | 108 | The run_validation steps parse the cmdline args, check all necessary tools / files are availabe for the 109 | requested steps in the pipeline, and queues up the modules by adding the steps to 110 | pima_data.analysis object 111 | """ 112 | print_and_log( 113 | pima_data, 114 | "STARTING PiMA Analysis", 115 | pima_data.main_process_verbosity, 116 | pima_data.main_process_color, 117 | ) 118 | while 
(True): 119 | step = pima_data.analysis[0] 120 | pima_data.analysis = pima_data.analysis[1:] 121 | 122 | ## See if we have arguments to pass to our function 123 | if type(step) is list: 124 | arguments = [] 125 | if len(step) > 1: 126 | arguments = step[1:] 127 | step = step[0] 128 | function = getattr(modules, step) 129 | function(*arguments) 130 | else: 131 | function = getattr(modules, step) 132 | function() 133 | 134 | if (len(pima_data.analysis) == 0): 135 | break 136 | 137 | def run_workflow(pima_data: PimaData, settings: Settings): 138 | run_validation(pima_data, settings) 139 | define_workflow(pima_data) 140 | clean_up(pima_data) 141 | stop_logging(pima_data, "PiMA completed successfully") 142 | 143 | def main(): 144 | """ """ 145 | 146 | settings = Settings() 147 | opts, unknown_args = parse_args(settings) 148 | 149 | # Start the analysis 150 | pima_data = PimaData(opts, unknown_args) 151 | 152 | # Capture commandline options used 153 | pima_data.run_command = ' '.join(sys.argv) 154 | run_prechecks(pima_data, settings, [f"PiMA command used: {' '.join(sys.argv)}"]) 155 | 156 | ##Initialize serial multiplex analysis 157 | if pima_data.multiplexed: 158 | modules.initialize_multiplex_analysis(pima_data, settings) 159 | 160 | else: 161 | run_workflow(pima_data, settings) 162 | 163 | 164 | if __name__ == "__main__": 165 | main() 166 | -------------------------------------------------------------------------------- /Pima/pima_colors.py: -------------------------------------------------------------------------------- 1 | class Colors: 2 | HEADER = '\033[95m' 3 | OKBLUE = '\033[94m' 4 | OKGREEN = '\033[92m' 5 | WARNING = '\033[93m' 6 | FAIL = '\033[91m' 7 | ENDC = '\033[0m' 8 | BOLD = '\033[1m' 9 | UNDERLINE = '\033[4m' -------------------------------------------------------------------------------- /Pima/pima_data.py: -------------------------------------------------------------------------------- 1 | import Bio.SeqIO 2 | import datetime 3 | 4 | from 
Pima.pima_colors import Colors 5 | import pandas as pd 6 | 7 | class PimaData: 8 | def __init__(self, opts=None, unknown_args=None): 9 | # The actual steps to carry out in the analysis held as a list 10 | self.analysis = [] 11 | 12 | # Verbosity levels and colors 13 | self.error_color = Colors.FAIL 14 | self.fail_verbosity = 1 15 | self.main_process_verbosity = 1 16 | self.warning_color = Colors.WARNING 17 | self.warning_verbosity = 1 18 | self.main_process_color = Colors.OKGREEN 19 | self.sub_process_verbosity = 2 20 | self.sub_process_color = Colors.OKBLUE 21 | self.command_verbosity = 3 22 | self.errors = [] 23 | self.warnings = [] 24 | 25 | # ONT FASTQ input 26 | self.ont_fastq = None 27 | self.ont_raw_fastq = self.ont_fastq 28 | self.ont_read_count = None 29 | self.ont_read_lengths = None 30 | self.will_have_ont_fastq = False 31 | self.ont_read_lengths = [] 32 | 33 | # Read metadata 34 | self.read_metadata = pd.Series(dtype=object) 35 | 36 | # Demultiplexing 37 | self.multiplexed = None 38 | self.nextflow = None 39 | self.barcodes = None 40 | self.barcode_min_fraction = None 41 | self.barcode_summary = None 42 | 43 | # Contamination 44 | self.contam_check = False 45 | self.kraken_fracs = pd.Series(dtype=object) 46 | self.did_contamination_check = False 47 | 48 | # Genome FASTA input 49 | self.genome_fasta = None 50 | self.will_have_genome_fasta = False 51 | 52 | # Illumina FASTQ input 53 | self.illumina_fastq = None 54 | self.pilon_coverage_min = 25 55 | self.did_spades_illumina_fastq = False 56 | self.did_pilon_ont_assembly = False 57 | self.did_polypolish_ont_assembly = False 58 | 59 | # Output options 60 | self.output_dir = None 61 | self.overwrite = False 62 | self.resume = False 63 | self.keep_intermediates = False 64 | 65 | # Assembly options 66 | self.assembler = "flye" 67 | self.flye_sup = False 68 | self.genome_assembly_size = None 69 | self.genome_assembly_raw_size = None 70 | self.assembly_coverage = None 71 | self.no_medaka = False 72 | 
self.ont_n50 = None 73 | self.ont_n50_min = 2500 74 | self.ont_coverage_min = 30 75 | self.only_assemble = False 76 | self.no_assembly = False 77 | self.did_flye_ont_fastq = False 78 | self.did_raven_ont_fastq = False 79 | self.will_have_ont_assembly = False 80 | self.mean_coverage = dict() 81 | self.did_circos_plots = False 82 | 83 | # ONT polishing 84 | self.ont_model = None 85 | self.did_medaka_ont_assembly = False 86 | 87 | # Illumina polishing 88 | self.illumina_polisher = "pilon" 89 | 90 | # Feature options 91 | self.no_amr = False 92 | self.no_inc = False 93 | self.feature_fastas = None 94 | self.feature_hits = pd.Series(dtype=object) 95 | self.feature_plots = pd.Series(dtype=object) 96 | self.feature_dirs = [] 97 | self.feature_names = [] 98 | self.feature_colors = [] 99 | self.did_blast_feature_sets = False 100 | 101 | # Download options 102 | self.download = False 103 | 104 | # Reference options 105 | self.reference_dir = None 106 | self.organism = None 107 | self.organism_dir = None 108 | self.list_organisms = False 109 | self.will_have_reference_fasta = False 110 | self.reference = None 111 | self.amr_mutations = pd.Series(dtype=object) 112 | self.mutation_regions = None 113 | self.amr_region_names = None 114 | self.virulence_genes_fp = None 115 | self.did_call_mutations = False 116 | self.amr_deletions = pd.DataFrame() 117 | self.did_call_large_indels = False 118 | self.reference_contig_order = None 119 | self.organism_amr_appendices = None 120 | # Files to remove when done 121 | self.files_to_clean = [] 122 | 123 | # Plasmid options 124 | self.plasmids = False 125 | self.did_call_plasmids = False 126 | 127 | # Notes for different sections of the analysis 128 | self.assembly_notes = pd.Series(dtype=object) 129 | self.alignment_notes = pd.Series(dtype=object) 130 | self.large_indel_notes = pd.Series(dtype=object) 131 | self.contig_alignment = pd.Series(dtype=object) 132 | self.versions = pd.Series(dtype=object) 133 | 134 | self.logging_handle = None 135 
| self.fake_run = False 136 | 137 | self.bundle = None 138 | self.report = pd.Series(dtype=object) 139 | 140 | if opts is None or unknown_args is None: 141 | return 142 | 143 | # Date-time information 144 | self.start_time = datetime.datetime.now().strftime("%Y-%m-%d") 145 | 146 | # Logging information 147 | self.logging_file = None 148 | self.logging_handle = None 149 | 150 | # ONT FASTQ input 151 | self.ont_fastq = opts.ont_fastq 152 | self.ont_raw_fastq = self.ont_fastq 153 | 154 | # Demultiplexing 155 | self.multiplexed = opts.multiplexed 156 | self.nextflow = opts.nextflow 157 | self.barcodes = None 158 | self.barcode_min_fraction = opts.barcode_min_fraction 159 | 160 | # Contamination 161 | self.contam_check = opts.contamination 162 | self.did_contamination_check = False 163 | 164 | # Illumina FASTQ input 165 | self.illumina_fastq = opts.illumina_fastq 166 | 167 | # Genome FASTA input 168 | self.genome_fasta = opts.genome 169 | 170 | # Output options 171 | self.output_dir = opts.output 172 | self.overwrite = opts.overwrite 173 | self.resume = opts.resume 174 | self.keep_intermediates = opts.keep_intermediates 175 | 176 | # Assembly options 177 | self.assembler = opts.assembler 178 | self.genome_assembly_size = opts.genome_size 179 | self.assembly_coverage = opts.assembly_coverage 180 | self.only_assemble = opts.only_assemble 181 | self.no_assembly = opts.no_assembly 182 | 183 | # ONT polishing 184 | self.ont_model = opts.ont_model 185 | self.no_medaka = opts.no_medaka 186 | 187 | # Illumina polishing 188 | self.illumina_polisher = opts.illumina_polisher 189 | 190 | # Illumina metrics 191 | self.illumina_length_mean = None 192 | self.illumina_coverage_min = 30 193 | self.did_pilon_ont_assembly = False 194 | self.did_polypolish_ont_assembly = False 195 | 196 | # The assembly itself 197 | self.genome = None 198 | self.contig_info = None 199 | 200 | # Vs. 
reference options 201 | self.reference_identity_min = 98.0 202 | self.reference_alignment_min = 97.0 203 | self.query_alignment_min = 97.0 204 | 205 | #placeholders for comparison to given reference 206 | self.reference_identity = 0 207 | self.reference_aligned_bases = 0 208 | self.query_aligned_bases = 0 209 | self.reference_aligned_fraction = 0 210 | self.query_aligned_fraction = 0 211 | 212 | # Plasmid and feature options 213 | self.plasmids = opts.plasmids 214 | self.plasmid_database = opts.plasmid_database 215 | self.did_call_plasmids = False 216 | self.no_drawing = opts.no_drawing 217 | self.amr_database = opts.amr_database 218 | self.no_amr = opts.no_amr 219 | self.inc_database = opts.inc_database 220 | self.no_inc = opts.no_inc 221 | self.feature_fastas = opts.feature 222 | self.feature_hits = pd.Series(dtype="float64") 223 | self.feature_plots = pd.Series(dtype="float64") 224 | self.feature_dirs = [] 225 | self.feature_names = [] 226 | self.feature_colors = [] 227 | self.download = opts.download 228 | 229 | # Reference options 230 | self.reference_dir = opts.reference_dir 231 | self.organism = opts.organism 232 | self.list_organisms = opts.list_organisms 233 | self.reference_fasta = opts.reference_genome 234 | self.mutation_region_bed = opts.mutation_regions 235 | self.self_circos = opts.self_circos 236 | self.threads = opts.threads 237 | 238 | # How much stuff to print 239 | self.verbosity = opts.verbosity 240 | 241 | # Files to remove when done 242 | self.files_to_clean = [] 243 | 244 | # Don't actully run any commands 245 | self.fake_run = opts.fake_run 246 | 247 | # Reporting 248 | self.no_report = False 249 | self.bundle = opts.bundle 250 | self.analysis_name = opts.name 251 | self.mutation_title = "Mutations" 252 | self.report[self.mutation_title] = pd.Series(dtype="float64") 253 | self.large_indels = pd.Series(dtype="float64") 254 | self.plasmid_title = "Plasmid annotation" 255 | self.report[self.plasmid_title] = pd.Series(dtype="float64") 256 | 
self.amr_matrix_title = "AMR matrix" 257 | self.did_draw_amr_matrix = False 258 | self.report[self.amr_matrix_title] = pd.Series(dtype="float64") 259 | self.methods_title = "Methods summary" 260 | self.report[self.methods_title] = pd.Series(dtype="float64") 261 | self.basecalling_methods = "Basecalling & processing" 262 | self.report[self.methods_title][self.basecalling_methods] = pd.Series( 263 | dtype="float64" 264 | ) 265 | self.assembly_methods = "Assembly & polishing" 266 | self.report[self.methods_title][self.assembly_methods] = pd.Series( 267 | dtype="float64" 268 | ) 269 | self.mutation_methods = "Mutation screening " 270 | self.report[self.methods_title][self.mutation_methods] = pd.Series( 271 | dtype="float64" 272 | ) 273 | self.plasmid_methods = "Plasmid annotation" 274 | self.report[self.methods_title][self.plasmid_methods] = pd.Series( 275 | dtype="float64" 276 | ) 277 | self.meta_title = "PIMA meta-information" 278 | 279 | # See if we got any unknown args. Not allowed. 280 | if len(unknown_args) != 0: 281 | self.errors = self.errors + [ 282 | "Unknown argument: " + unknown for unknown in unknown_args 283 | ] 284 | 285 | def load_reference(self): 286 | self.reference = self.load_fasta(self.reference_fasta) 287 | self.will_have_reference_fasta = True 288 | 289 | self.reference_size = 0 290 | for i in self.reference: 291 | self.reference_size += len(i.seq) 292 | 293 | @staticmethod 294 | def load_fasta(fasta: str): 295 | sequence = pd.Series(dtype=object) 296 | for contig in Bio.SeqIO.parse(fasta, "fasta"): 297 | sequence[contig.id] = contig 298 | return sequence 299 | 300 | def load_genome(self): 301 | self.genome = self.load_fasta(self.genome_fasta) 302 | self.genome_size = 0 303 | for i in self.genome: 304 | self.genome_size += len(i.seq) 305 | -------------------------------------------------------------------------------- /Pima/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/appliedbinf/pima/0766b56df2c2045f750aa0a5ba6626166b12842b/Pima/utils/__init__.py -------------------------------------------------------------------------------- /Pima/utils/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from argparse import ArgumentParser, HelpFormatter 4 | 5 | from Pima.pima_colors import Colors 6 | from Pima.utils.settings import Settings 7 | 8 | 9 | def parse_args(settings: Settings): 10 | # with open(os.path.join(settings.pima_path, "VERSION"), "r") as version_fp: 11 | # VERSION = version_fp.read().strip() 12 | 13 | parser = ArgumentParser( 14 | allow_abbrev=False, 15 | prog="pima.py", 16 | add_help=False, 17 | description=""" 18 | P.I.M.A. bacterial genome analysis pipeline 19 | """, 20 | formatter_class=lambda prog: HelpFormatter( 21 | prog, width=120, max_help_position=120 22 | ), 23 | ) 24 | 25 | parser._optionals.title = "Help and version" 26 | parser.add_argument( 27 | "-h", "--help", action="store_true", help="Print this help and exit." 
28 | ) 29 | parser.add_argument( 30 | "-v", 31 | "--version", 32 | action="version", 33 | help="Print the software version.", 34 | version=f"PIMA microbial genome analysis pipeline (version {settings.pima_version})", 35 | ) 36 | 37 | # Input arguments 38 | input_group = parser.add_argument_group("Input and basecalilng options") 39 | 40 | input_group.add_argument( 41 | "--ont-model", 42 | required=False, 43 | default="auto", 44 | metavar="", minimap_stderr, 41 | "| samtools sort", 42 | "-@", 43 | str(pima_data.threads), 44 | "-o", 45 | bam, 46 | "-T reads.tmp -", 47 | "1>/dev/null 2>/dev/null", 48 | ] 49 | ) 50 | 51 | print_and_run(pima_data, command) 52 | validate_file_and_size_or_error(pima_data, the_file=bam, error_prefix="The file", presence_suffix="doesn't exist", size_suffix="is below min expected size", min_size=1000) 53 | index_bam(pima_data, bam) 54 | 55 | def filter_bam(pima_data: PimaData, inbam: str, outbam: str = None, F: str = None, q: str = None): 56 | """Filter the bam file, if outbam not provided, we filter in-place""" 57 | if not outbam: 58 | outbam = inbam 59 | command = " ".join( 60 | [ 61 | "samtools view -h", 62 | "-F", 63 | F, 64 | "-q", 65 | q, 66 | inbam, 67 | "| samtools sort", 68 | "-@", 69 | str(pima_data.threads), 70 | "-o", 71 | outbam, 72 | "-T reads.tmp -", 73 | "1>/dev/null 2>/dev/null", 74 | ] 75 | ) 76 | print_and_run(pima_data, command) 77 | validate_file_and_size_or_error(pima_data, the_file=outbam, min_size=1000) 78 | index_bam(pima_data, outbam) 79 | 80 | def index_bam(pima_data: PimaData, bam: str): 81 | command = " ".join(["samtools index", bam, "1>/dev/null 2>/dev/null"]) 82 | print_and_run(pima_data, command) 83 | index_bai = bam + ".bai" 84 | validate_file_and_size_or_error(pima_data, the_file=index_bai, min_size=1000) 85 | 86 | def mpileup_bam(pima_data: PimaData, reference_genome: str, bam: str, mpileup: str, output_dir: str): 87 | 88 | print_and_log( 89 | pima_data, 90 | "Making mpileup from BAM", 91 | 
pima_data.sub_process_verbosity, 92 | pima_data.sub_process_color, 93 | ) 94 | 95 | mpileup_stdout, mpileup_stderr = std_files(os.path.join(output_dir, 'mpileup')) 96 | command = " ".join( 97 | [ 98 | 'samtools mpileup', 99 | '-B', 100 | '-a', 101 | '-f', reference_genome, 102 | '-o' + mpileup, 103 | bam, 104 | '1>', mpileup_stdout, 105 | '2>', mpileup_stderr, 106 | ] 107 | ) 108 | print_and_run(pima_data, command) 109 | validate_file_and_size_or_error(pima_data, 110 | mpileup, 111 | 'Region MPILEUP file', 112 | 'cannot be found', 113 | 'is empty', 114 | ) 115 | 116 | def bwa_index_fasta(pima_data: PimaData, fasta: str): 117 | 118 | print_and_log( 119 | pima_data, 120 | 'Indexing FASTA with bwa index', 121 | pima_data.sub_process_verbosity, 122 | pima_data.sub_process_color, 123 | ) 124 | 125 | # Check for an index already there 126 | bwa_index = f"{fasta}.bwt" 127 | if validate_file_and_size(pima_data, bwa_index): 128 | return 129 | 130 | # Make the bwa index 131 | std_prefix = re.sub(r'\.f(na|asta)$', '', fasta) 132 | bwa_index_stdout, bwa_index_stderr = std_files(std_prefix + '_index') 133 | command = " ".join( 134 | [ 135 | 'bwa', 136 | 'index', 137 | fasta, 138 | '1>', bwa_index_stdout, '2>', bwa_index_stderr, 139 | ] 140 | ) 141 | print_and_run(pima_data, command) 142 | 143 | # Check that the index was built 144 | validate_file_and_size_or_error(pima_data, bwa_index, 'BWA index', 'doesn\'t exist', 'is empty') 145 | 146 | def bwa_short_illumina_fastq_and_sort(pima_data: PimaData, genome: str, fastq: str, bam: str): 147 | 148 | std_prefix = re.sub(r'\.bam$', '', bam) 149 | 150 | bwa_index_fasta(pima_data, genome) 151 | 152 | # Align the reads 153 | sai = [] 154 | for i in range(len(fastq)): 155 | bwa_stdout, bwa_stderr = std_files(std_prefix + '_aln') 156 | this_sai = std_prefix + '_aln_' + str(i) + '.sai' 157 | command = " ".join( 158 | [ 159 | 'bwa aln', 160 | '-t', str(pima_data.threads), 161 | genome, 162 | fastq[i], 163 | '1>', this_sai, 164 | '2>', 
bwa_stderr, 165 | ] 166 | ) 167 | sai.append(this_sai) 168 | print_and_run(pima_data, command) 169 | validate_file_and_size_or_error(pima_data, this_sai) 170 | 171 | # And turn the SAI into a proper SAM file 172 | read_type = 'samse' 173 | if len(fastq) > 1: 174 | read_type = 'sampe' 175 | bwa_stdout, bwa_stderr = std_files(std_prefix + '_sam') 176 | tmp_file = std_prefix + '.tmp' 177 | command = " ".join( 178 | [ 179 | 'bwa', 180 | read_type, 181 | genome, 182 | ' '.join(sai), 183 | ' '.join(fastq), 184 | '2>', bwa_stderr, 185 | '| samtools', 186 | 'sort', 187 | '-T', tmp_file, 188 | '-o', bam, 189 | '-', 190 | '1>/dev/null 2>/dev/null', 191 | ] 192 | ) 193 | print_and_run(pima_data, command) 194 | validate_file_and_size_or_error(pima_data, the_file = bam, min_size = 100) 195 | index_bam(pima_data, bam) 196 | 197 | def bwa_mem_all_aln_illumina(pima_data: PimaData, genome: str, fastq: list, bam: str): 198 | std_prefix = re.sub(r'\.bam$', '', bam) 199 | 200 | bwa_index_fasta(pima_data, genome) 201 | 202 | bams = [] 203 | # Align the reads 204 | for i in range(len(fastq)): 205 | read = f"_R{i+1}" 206 | _, bwa_stderr = std_files(std_prefix + read + '_aln') 207 | this_bam = std_prefix + read + '.bam' 208 | #Polypolish warns not to sort the bam files 209 | command = " ".join( 210 | [ 211 | 'bwa mem', 212 | '-a', 213 | '-t', str(pima_data.threads), 214 | genome, 215 | fastq[i], 216 | '1>', this_bam, 217 | '2>', bwa_stderr, 218 | ] 219 | ) 220 | bams.append(this_bam) 221 | print_and_run(pima_data, command) 222 | validate_file_and_size_or_error(pima_data, this_bam) 223 | return bams -------------------------------------------------------------------------------- /Pima/utils/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | class Settings(): 5 | def __init__(self): 6 | self.data_dir = os.path.join(self.pima_path, 'data') 7 | self.amr_database_default = os.path.join(self.pima_path, 
'data/amr.fasta') 8 | self.amr_gene_drug_tsv = os.path.join(self.pima_path, 'data/gene_drug.tsv') 9 | self.amr_default_color = '#FED976' 10 | self.inc_database_default = os.path.join(self.pima_path, 'data/inc.fasta') 11 | self.inc_default_color = '#0570B0' 12 | self.included_databases = [self.amr_database_default, self.inc_database_default] 13 | 14 | self.plasmid_database_default_fasta = os.path.join(self.pima_path, 'data/plasmids_and_vectors.fasta') 15 | self.kraken_database_default = os.path.join(self.pima_path, 'data/kraken2') 16 | self.reference_dir_default = os.path.join(self.pima_path, 'data/reference_sequences') 17 | self.pima_css = os.path.join(self.pima_path,'data/pima.css') 18 | self.virulence_genes_fp = os.path.join(self.data_dir, "reference_sequences/Bacillus_anthracis/ba_virulence_genes.bed") 19 | 20 | ## Docker specific paths 21 | self.DockerPathPlasmid = os.path.join('/home/DockerDir/Data/Temp_Data/plasmids_and_vectors.fasta') 22 | self.DockerPathKraken = os.path.join('/home/DockerDir/Data/Temp_Data/kraken2') 23 | 24 | @property 25 | def pima_path(self): 26 | # Is __name__ the most robust way to print the path of importing scripts, not this one? 
"""Shared helper utilities for the PiMA pipeline.

Covers: axis-break math for plots, human-readable number formatting,
run logging, file/utility validation, shell-command execution, resume
checkpointing, and end-of-run cleanup.
"""
import sys
import os
import re
import datetime
import shutil
import subprocess
from pathlib import Path

import numpy as np

from Pima.pima_data import PimaData
from Pima.pima_colors import Colors


def nicenumber(x: float, round: int) -> float:
    """Return a 'nice' number (1, 2, or 5 times a power of ten) near x.

    Mirrors the classic graphics-axis heuristic: when `round` is truthy the
    nearest nice value is chosen, otherwise the smallest nice value >= x.
    NOTE: the parameter name `round` shadows the builtin but is kept for
    interface compatibility (the builtin is not used in this function).
    Assumes x > 0 (log10 of non-positive x is undefined) — callers pass
    positive spans.
    """
    exp = np.floor(np.log10(x))
    f = x / 10**exp  # mantissa in [1, 10)

    if round:
        if f < 1.5:
            nf = 1.0
        elif f < 3.0:
            nf = 2.0
        elif f < 7.0:
            nf = 5.0
        else:
            nf = 10.0
    else:
        if f <= 1.0:
            nf = 1.0
        elif f <= 2.0:
            nf = 2.0
        elif f <= 5.0:
            nf = 5.0
        else:
            nf = 10.0

    return nf * 10.0**exp


def pretty(low, high, n):
    """Return ~n 'pretty' axis breakpoints covering [low, high] (like R's pretty())."""
    span = nicenumber(high - low, False)  # renamed from `range` to avoid shadowing the builtin
    d = nicenumber(span / (n - 1), True)
    miny = np.floor(low / d) * d
    maxy = np.ceil(high / d) * d
    return np.arange(miny, maxy + 0.5 * d, d)


def format_kmg(number: float, decimals: int = 0) -> str:
    """Format a number with a K/M/G magnitude suffix, e.g. 1500 -> '1.5K'.

    Fix: values in (0, 1) (and negative values) previously fell through the
    loop and the function implicitly returned None; they are now formatted
    without a suffix.
    """
    if number == 0:
        return "0"

    magnitude_powers = [10**9, 10**6, 10**3, 1]
    magnitude_units = ["G", "M", "K", ""]
    for power, unit in zip(magnitude_powers, magnitude_units):
        if number >= power:
            return f"{round(number / power, decimals)}{unit}"

    # 0 < number < 1, or number < 0: no magnitude suffix applies.
    return f"{round(number, decimals)}"


def print_and_log(
    pima_data: PimaData, text: str, verbosity: int, color: str = Colors.ENDC, time_string: str = None
):
    """Write a timestamped message to stderr and to the run log.

    The stderr copy is colorized and shown only when `verbosity` is at or
    below the run's configured verbosity. NOTE(review): the log copy appears
    to be written unconditionally (everything is logged, uncolored) — confirm
    against the original indentation if behavior looks off.
    """
    if not time_string:
        time_string = f'[{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]'
    if verbosity <= pima_data.verbosity:
        sys.stderr.write(f"{time_string} {color}{text}{Colors.ENDC}\n")

    if pima_data.logging_handle:
        pima_data.logging_handle.write(f"{time_string} {text}\n")
        # Flush so the log file has content while the run is in progress;
        # we log infrequently enough that this is not expensive.
        pima_data.logging_handle.flush()


def start_logging(pima_data: PimaData):
    """Open the run log for writing, backing up any pre-existing log on --resume.

    Defaults logging_file to <output_dir>/pima.log when the user did not
    supply one. Behavior-equivalent consolidation of the original five
    branches: a backup is taken only when resuming onto an existing log file.
    """
    if not pima_data.logging_file:
        pima_data.logging_file = os.path.join(pima_data.output_dir, "pima.log")
        if pima_data.resume:
            # backup_log_file() is a no-op when the default log doesn't exist yet
            backup_log_file(pima_data.logging_file)
    elif pima_data.resume and os.path.isfile(pima_data.logging_file):
        # user resumed a run re-using the same log path
        backup_log_file(pima_data.logging_file)

    pima_data.logging_handle = open(pima_data.logging_file, 'w')


def backup_log_file(existing_log_path: str):
    """Preserve an existing log as 'previous_<name>' in the same directory.

    If a previous backup already exists, the current log is appended to it
    (separated by a blank line) instead of overwriting it. No-op when the
    log does not exist.
    """
    log_path = Path(existing_log_path)
    if not log_path.is_file():
        return

    rename_log_path = os.path.join(log_path.parent, f"previous_{log_path.name}")

    if os.path.isfile(rename_log_path):
        with open(rename_log_path, 'a') as prev_log_file:
            prev_log_file.write("\n\n")
            with open(existing_log_path) as cur_log_file:
                shutil.copyfileobj(cur_log_file, prev_log_file)
        os.remove(existing_log_path)
        return

    os.rename(existing_log_path, rename_log_path)


def stop_logging(pima_data: PimaData, message: str = None):
    """Optionally emit a final message, then close and clear the log handle."""
    if message:
        print_and_log(
            pima_data,
            message,
            pima_data.main_process_verbosity,
            pima_data.main_process_color,
        )

    pima_data.logging_handle.close()
    pima_data.logging_handle = None


def validate_file(the_file: str) -> bool:
    """Return True when the path exists and is a regular file."""
    return os.path.isfile(the_file)


def validate_file_size(pima_data: PimaData, the_file: str, min_size: int = 0) -> bool:
    """Return True when the file is at least min_size bytes (always True under fake_run)."""
    if pima_data.fake_run:
        return True
    return os.stat(the_file).st_size >= min_size


def validate_file_and_size(pima_data: PimaData, the_file: str, min_size: int = 0) -> bool:
    """Return True when the file exists and meets the minimum size."""
    return validate_file(the_file) and validate_file_size(pima_data, the_file, min_size)


def validate_file_and_size_or_error(
    pima_data,
    the_file,
    error_prefix="The file",
    presence_suffix="doesn't exist",
    size_suffix="is size 0",
    min_size=0,
):
    """Abort the run (via error_out) when the file is missing or too small.

    fake_run skips both checks so dry runs can proceed without outputs.
    """
    if not validate_file(the_file) and not pima_data.fake_run:
        error_out(pima_data, " ".join([error_prefix, the_file, presence_suffix]))

    if not validate_file_size(pima_data, the_file, min_size) and not pima_data.fake_run:
        error_out(pima_data, " ".join([error_prefix, the_file, size_suffix]))


def validate_utility(pima_data: PimaData, utility: str, error: str) -> bool:
    """Check that an external executable is on PATH; record and print `error` if not."""
    if shutil.which(utility):
        return True

    pima_data.errors.append(error)
    print_and_log(
        pima_data,
        error,
        pima_data.fail_verbosity,
        pima_data.error_color,
    )
    return False


def print_and_run(pima_data: PimaData, command: str, change_exe_dir: str = None):
    """Log a shell command at command verbosity, then execute it."""
    print_and_log(pima_data, command, pima_data.command_verbosity)
    return run_command(pima_data, command, change_exe_dir)


def run_command(pima_data: PimaData, command: str, change_exe_dir: str = None):
    """Run a shell command and return its stdout split into lines.

    On a non-zero exit the run is aborted via error_out(); when the command
    redirects stderr to a '<something>.stderr' file, that file's contents are
    used as the error message instead of the captured stderr. Under fake_run
    nothing is executed and None is returned.
    """
    if pima_data.fake_run:
        return None

    # cwd=None is equivalent to not passing cwd, so one call covers both cases
    result = subprocess.run(
        command, shell=True, capture_output=True, text=True, cwd=change_exe_dir
    )

    if result.returncode == 0:
        return result.stdout.split("\n")

    if re.search(r"\.stderr$", command):
        sterr_f = [x for x in command.split(" ") if re.search(r"\.stderr$", x)][0]
        message = f"Command {command} failed with the following error. exiting\n{Path(sterr_f).read_text()}"
    else:
        message = f"Command {command} failed with the following error. exiting\n{result.stderr}"
    error_out(pima_data, message)


def error_out(pima_data: PimaData, message: str):
    """Log a fatal error and terminate the process with exit status 1."""
    print_and_log(
        pima_data,
        message,
        pima_data.fail_verbosity,
        pima_data.error_color,
    )
    sys.exit(1)


def print_warning(pima_data: PimaData, warning: str):
    """Emit a warning-colored message at warning verbosity."""
    print_and_log(
        pima_data, warning, pima_data.warning_verbosity, pima_data.warning_color
    )


def add_warning(pima_data: PimaData, warning: str):
    """Print a warning and record it for the final report."""
    print_warning(pima_data, warning)
    pima_data.warnings.append(warning)


def find_checkpoint(pima_data: PimaData, dir: str):
    """Search for the .finish file generated after each analysis step completes.

    Args:
        dir: path to the analysis directory (parameter name kept for
            interface compatibility although it shadows the builtin).

    Returns:
        True if the 'resume' flag was provided and ".finish" was found.
        False otherwise; a partially-completed existing directory is deleted
        so the step can re-run cleanly.
    """
    if not pima_data.resume:
        return False

    if os.path.exists(os.path.join(dir, ".finish")):
        return True

    if os.path.exists(dir):
        shutil.rmtree(dir)
    return False


def std_files(prefix: str):
    """Return the [stdout, stderr] capture paths for a command prefix."""
    return [f"{prefix}.stdout", f"{prefix}.stderr"]


def touch_file(pima_data: PimaData, a_file: str):
    """Touch a file via the shell so the action is logged and honors fake_run."""
    command = " ".join(["touch", a_file])
    print_and_run(pima_data, command)


def make_start_file(pima_data: PimaData, a_dir: str):
    """Drop the .start marker in an analysis directory."""
    touch_file(pima_data, os.path.join(a_dir, ".start"))


def make_finish_file(pima_data: PimaData, a_dir: str):
    """Drop the .finish checkpoint marker in an analysis directory."""
    touch_file(pima_data, os.path.join(a_dir, ".finish"))


def make_report_info_file(pima_data: PimaData, a_dir: str):
    """Drop the .report_info marker and return its path."""
    report_info_file = os.path.join(a_dir, ".report_info")
    touch_file(pima_data, report_info_file)
    return report_info_file


def clean_up(pima_data: PimaData):
    """Finalize the run: copy the final assembly, link the report, remove intermediates."""
    print_and_log(
        pima_data,
        "Cleaning up",
        pima_data.main_process_verbosity,
        pima_data.main_process_color,
    )

    if pima_data.genome_fasta:
        final_fasta = os.path.join(pima_data.output_dir, 'assembly.fasta')
        command = ' '.join(['cp', pima_data.genome_fasta, final_fasta])
        print_and_run(pima_data, command)

    # Create a shortcut to the report in the primary output_dir.
    # Fix: use lexists so a dangling symlink left by a failed earlier run is
    # also removed — os.path.isfile follows links and would miss it, making
    # the subsequent os.symlink raise FileExistsError.
    report_link = os.path.join(pima_data.output_dir, "report.pdf")
    if os.path.lexists(report_link):
        os.remove(report_link)
    os.symlink(os.path.join(pima_data.output_dir, "report", "report.pdf"), report_link)

    if not pima_data.keep_intermediates:
        for file in pima_data.files_to_clean:
            if os.path.isfile(file):
                try:
                    os.remove(file)
                except OSError:
                    # best-effort cleanup: a file we cannot remove is not fatal
                    continue
--------------------------------------------------------------------------------
#!/usr/bin/env bash

$PYTHON -m pip install -vv --no-deps --ignore-installed .

--------------------------------------------------------------------------------
/conda_recipe/environment.yml:
--------------------------------------------------------------------------------
#mirrors meta.yaml but can be used to build a conda environment WITHOUT pima for debugging
name: base

channels:
  - bioconda
  - conda-forge

dependencies:
  - git==2.39.1
  - gawk==5.3
  - bedtools==2.31.1
  - biopython==1.84
  - blast==2.16
  - bwa==0.7.18
  - curl
  - flye==2.9.4
  - raven-assembler==1.8.3
  - mdutils==1.6.0
  - minimap2==2.28
  - multiprocess=0.70.16
  - mummer==3.23
  - kraken2==2.1.3
  - spades==4.0.0
  - pandas==2.2.2
  - pandoc==3.3
  - weasyprint==62.3 # pdf engine for pandoc
  - pango==1.50.14
  - pathos
  - python >=3.9,<3.11 #,<3.9
  - pyfaidx==0.8.1.2
  - python_circos ## will replace with the development version using pip in pima_install.sh script
  - r
  - r-hash #needed for pChunks
  - r-stringr #needed for pChunks
  - r-gridextra #needed for pChunks
  - r-optparse #needed for pChunks
  - samtools==1.18 #needs to be the conda-forge version and NOT the bioconda version
  - varscan==2.4.6
  - medaka >=1.11 #bumped to get access to the inspect model tools
  - pilon==1.24
  - polypolish==0.6
  - quast==5.2.0 #installs perl-circos as a dependency...
  - nextflow==24.04.4 # used for parallelizing multiplex runs
--------------------------------------------------------------------------------
/conda_recipe/environment_open_versions.yml:
--------------------------------------------------------------------------------
name: testpima

channels:
  - bioconda
  - conda-forge

dependencies:
  - git
  - gawk
  - bedtools
  - biopython
  - blast
  - bwa
  - curl
  - flye
  - raven-assembler
  - mdutils
  - minimap2
  - multiprocess
  - mummer
  - kraken2
  - spades
  - pandas
  - pandoc
  - weasyprint
  - pango
  - pathos
  - python
  - pyfaidx
  - python_circos
  - r
  - r-hash
  - r-stringr
  - r-gridextra
  - r-optparse
  - samtools
  - varscan
  - qcat
  - medaka
  - pilon
  - polypolish
  - quast
  - nextflow
--------------------------------------------------------------------------------
/conda_recipe/meta.yaml:
--------------------------------------------------------------------------------
{% set name = "pima" %}
#{% set python = "3.8" %}
{% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %}

package:
  name: "{{ name|lower }}"
  version: "{{ data.get('version') }}"

source:
  # git_url: https://github.com/appliedbinf/MergedPima.git
  # git_tag: development
  path: ../


build:
  number: 1
  skip: True # [win]

requirements:
  host:
    - python >=3.9,<3.11 #,<3.9
    - pip

  run:
    - git==2.39.1
    - gawk==5.3
    - bedtools==2.31.1
    - biopython==1.84
    - blast==2.16
    - bwa==0.7.18
    - curl
    - flye==2.9.4
    - raven-assembler==1.8.3
    - mdutils==1.6.0
    - minimap2==2.28
    - multiprocess=0.70.16
    - mummer==3.23
    - kraken2==2.1.3
    - spades==4.0.0
    - pandas==2.2.2
    - pandoc==3.3
    - weasyprint==62.3 # pdf engine for pandoc
    - pango==1.50.14
    - pathos
    - python >=3.9,<3.11 #,<3.9
    - pyfaidx==0.8.1.2
    - python_circos ## will replace with the development version using pip in pima_install.sh script
    - r
    - r-hash #needed for pChunks
    - r-stringr #needed for pChunks
    - r-gridextra #needed for pChunks
    - r-optparse #needed for pChunks
    - samtools==1.18 #needs to be the conda-forge version and NOT the bioconda version
    - varscan==2.4.6
    - medaka >=1.11 #bumped to get access to the inspect model tools
    - pilon==1.24
    - polypolish==0.6
    - quast==5.2.0 #installs perl-circos as a dependency...
    - nextflow==24.04.4 # used for parallelizing multiplex runs

test:

  commands:
    - minimap2 -h
    - pima -h

about:
  home: https://github.com/appliedbinf/MergedPima
  license: MIT


--------------------------------------------------------------------------------
/conda_recipe/meta_open_versions.yaml:
--------------------------------------------------------------------------------
{% set name = "pima" %}
{% set python = "3.8" %}
{% set data = load_setup_py_data(setup_file='../setup.py', from_recipe_dir=True) %}

package:
  name: "{{ name|lower }}"
  version: "{{ data.get('version') }}"

source:
  # git_url: https://github.com/appliedbinf/MergedPima.git
  # git_tag: development
  path: ../


build:
  number: 7
  skip: True # [win]

requirements:
  host:
    - python
    - pip

  run:
    - git
    - gawk
    - bedtools
    - biopython
    - blast
    - bwa
    - curl
    - flye
    - raven-assembler
    - mdutils
    - minimap2
    - multiprocess
    - mummer
    - kraken2
    - spades
    - pandas
    - pandoc
    - weasyprint
    - pango
    - pathos
    - python
    - pyfaidx
    - python_circos
    - r
    - r-hash
    - r-stringr
    - r-gridextra
    - r-optparse
    - samtools
    - varscan
    - qcat
    - medaka
    - pilon
    - polypolish
    - quast
    - nextflow

test:

  commands:
    - minimap2 -h
    - pima -h

about:
  home: https://github.com/appliedbinf/MergedPima
  license: MIT


--------------------------------------------------------------------------------
/conda_recipe/post-link.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

python -m pip install --no-deps dna_features_viewer si-prefix

# upgrade python_circos from the conda version that controls the dependencies, but lacks features
python -m pip install --force-reinstall --no-deps git+https://github.com/ponnhide/pyCircos.git

--------------------------------------------------------------------------------
/dockerbuild/Dockerfile:
--------------------------------------------------------------------------------
FROM mambaorg/micromamba:1.5.8 AS app

ARG PIMA_VER="2.1.1"

# 'LABEL' instructions tag the image with metadata that might be important to the user
LABEL base.image="mambaorg/micromamba:1.5.8"
LABEL dockerfile.version="1"
LABEL software="pima"
LABEL software.version="${PIMA_VER}"
LABEL description="Plasmid, Integrations, Mutations, and Antibiotic resistance annotation pipeline"
LABEL maintainer="Will Overholt"
LABEL maintainer.email="woverholt@asrtinc.com"

USER root

## Need to build the file from the primary pima dir
# cd ....pima
# docker build -t local/pima:2.1.0 -f dockerbuild/Dockerfile .
# cd ....singularity_images
# apptainer build pima2.1.0.sif docker-daemon://local/pima:2.1.0

# include required pima files
ADD conda_recipe/environment.yml environment.yml
ADD Pima Pima
ADD setup.py setup.py
ADD README.md README.md

# build run environment
RUN apt-get update && apt-get install -y --no-install-recommends \
    locales \
    locales-all \
    libpango-1.0-0 libpangoft2-1.0-0 libharfbuzz-subset0 \
    wget \
    procps \
    ca-certificates && \
    apt-get autoclean && rm -rf /var/lib/apt/lists/*

RUN micromamba install --name base -c conda-forge -c bioconda -f environment.yml && \
    micromamba clean -a -f -y && \
    mkdir /data

ENV PATH="/opt/conda/bin/:$PATH" \
    LC_ALL=C.UTF-8

# install pima
RUN python -m pip install -vv --no-deps --ignore-installed --no-cache-dir . && \
    python -m pip install --no-deps --no-cache-dir dna_features_viewer si-prefix && \
    python -m pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/ponnhide/pyCircos.git

CMD pima --help

WORKDIR /data

# "AS" uppercased to match the FROM casing (silences BuildKit's FromAsCasing lint warning)
FROM app AS test

WORKDIR /test

# test installation
RUN pima --help && \
    pima --version

# prep pima
RUN pima --download
--------------------------------------------------------------------------------
/pima:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import sys
from Pima.pima import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools
import os

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

with open(os.path.join("Pima", "VERSION"), "r", encoding="utf-8") as version_fp:
    VERSION = version_fp.read().strip()

setuptools.setup(
    name="pima",
    version=VERSION,
    author="Applied Bioinformatics Laboratory",
    author_email="woverholt@asrtinc.com",
    description="Genomic characterization pipeline for Bacillus anthracis",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/appliedbinf/pima",
    packages=setuptools.find_packages(),
    python_requires='>=3.8',
    package_data={
        "Pima": [
            "data/**",
            "VERSION",
            "nextflow_parallelization/**",
        ],
    },
    scripts = ['Pima/pima.py', 'Pima/accessory_scripts/building_pycircos_figures.py'],
    zip_safe=False,
    include_package_data=True,
    entry_points={
        # Optional: specify any entry points for your package here
        'console_scripts': [
            'pima = Pima.pima:main',
        ],
    },
)
--------------------------------------------------------------------------------