├── .DS_Store
├── .github
    ├── FUNDING.yml
    ├── ISSUE_TEMPLATE.md
    └── workflows
    │   ├── docs.yaml
    │   ├── pypi.yml
    │   └── testpypi.yml
├── .gitignore
├── Makefile
├── README.md
├── cfutils
    ├── __init__.py
    ├── align.py
    ├── cli.py
    ├── count.py
    ├── parser.py
    ├── run.py
    ├── show.py
    └── utils.py
├── data
    ├── B5-M13R_B07.ab1
    ├── B5-M13R_B07_vs_ref.pdf
    ├── B5-M13R_B07_vs_ref.tsv
    ├── data_file
    ├── matplotlib_example.png
    ├── plot.png
    └── ref.fa
├── docs
    ├── CNAME
    ├── _config.yml
    ├── api.md
    ├── cli.md
    ├── favicon.ico
    ├── features.md
    ├── index.md
    └── installation.md
├── pyproject.toml
├── test
    ├── __init__.py
    ├── __init__.py.py
    ├── test_advance.py
    ├── test_align.py
    ├── test_basic.py
    ├── test_parser.py
    ├── test_run.py
    └── test_show.py
└── uv.lock


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/y9c/cfutils/a34e263d38ccba3821ff5c5b79df9988903c0ba6/.DS_Store


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | github: y9c
4 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | * cfutils version:
 2 | * Python version:
 3 | * Operating System:
 4 | 
 5 | ### Description
 6 | 
 7 | Describe what you were trying to get done.
 8 | Tell us what happened, what went wrong, and what you expected to happen.
 9 | 
10 | ### What I Did
11 | 
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ```
16 | 


--------------------------------------------------------------------------------
/.github/workflows/docs.yaml:
--------------------------------------------------------------------------------
 1 | name: Deploy Jekyll with GitHub Pages dependencies preinstalled
 2 | 
 3 | on:
 4 |   # Runs on pushes targeting the default branch
 5 |   push:
 6 |     branches:
 7 |       - 'main'
 8 |       - 'dev'
 9 |     paths:
10 |       - 'docs/**'
11 | 
12 |   # Allows you to run this workflow manually from the Actions tab
13 |   workflow_dispatch:
14 | 
15 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
16 | permissions:
17 |   contents: read
18 |   pages: write
19 |   id-token: write
20 | 
21 | # Allow one concurrent deployment
22 | concurrency:
23 |   group: 'pages'
24 |   cancel-in-progress: true
25 | 
26 | jobs:
27 |   # Build job: render ./docs with Jekyll and upload the site as a Pages artifact
28 |   build:
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |       - name: Checkout
32 |         uses: actions/checkout@v4
33 |       - name: Setup Pages
34 |         uses: actions/configure-pages@v5
35 |       - name: Build with Jekyll
36 |         uses: actions/jekyll-build-pages@v1
37 |         with:
38 |           source: ./docs
39 |           destination: ./_site
40 |       - name: Upload artifact
41 |         uses: actions/upload-pages-artifact@v3
42 | 
43 |   # Deployment job: publish the artifact uploaded by the build job
44 |   deploy:
45 |     environment:
46 |       name: github-pages
47 |       url: ${{ steps.deployment.outputs.page_url }}
48 |     runs-on: ubuntu-latest
49 |     needs: build
50 |     steps:
51 |       - name: Deploy to GitHub Pages
52 |         id: deployment
53 |         uses: actions/deploy-pages@v4
54 | 


--------------------------------------------------------------------------------
/.github/workflows/pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Python 🐍 distributions 📦 to PyPI
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - 'v[0-9]+.[0-9]+.[0-9]+'
 7 | 
 8 | jobs:
 9 |   build-n-publish:
10 |     name: Build and publish Python 🐍 distributions 📦 to PyPI
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout code
14 |         uses: actions/checkout@v4
15 | 
16 |       - name: Set up Python 3.8
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: '3.8'
20 | 
21 |       # `build` drives whichever build backend pyproject.toml declares
22 |       - name: Install the build frontend
23 |         run: >-
24 |           python -m pip install --upgrade build
25 | 
26 |       - name: Build sdist and wheel into dist/
27 |         run: >-
28 |           python -m build
29 | 
30 |       - name: Publish distribution 📦 to PyPI
31 |         uses: pypa/gh-action-pypi-publish@release/v1
32 |         with:
33 |           password: ${{ secrets.pypi_password }}
34 | 


--------------------------------------------------------------------------------
/.github/workflows/testpypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Python 🐍 distributions 📦 to TestPyPI
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - 'v[0-9]+.[0-9]+.[0-9]+*'
 7 | 
 8 | jobs:
 9 |   build-n-publish:
10 |     name: Build and publish Python 🐍 distributions 📦 to TestPyPI
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout code
14 |         uses: actions/checkout@v4
15 | 
16 |       - name: Set up Python 3.8
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: '3.8'
20 | 
21 |       # `build` drives whichever build backend pyproject.toml declares
22 |       - name: Install the build frontend
23 |         run: >-
24 |           python -m pip install --upgrade build
25 | 
26 |       - name: Build sdist and wheel into dist/
27 |         run: >-
28 |           python -m build
29 | 
30 |       - name: Publish distribution 📦 to Test PyPI
31 |         uses: pypa/gh-action-pypi-publish@release/v1
32 |         with:
33 |           password: ${{ secrets.test_pypi_password }}
34 |           repository-url: https://test.pypi.org/legacy/
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | 
163 | # directory for temp file
164 | temp/
165 | # tool cache
166 | .ruff_cache
167 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for cfutils (dependencies are managed with uv and uv.lock)
 2 | .PHONY: init dev test clean lock
 3 | 
 4 | # Install the locked production dependencies only (no dev group)
 5 | init:
 6 | 	uv sync --no-dev
 7 | 
 8 | # Install all locked dependencies (dev + prod)
 9 | dev:
10 | 	uv sync
11 | 
12 | # Run all tests inside the uv-managed environment
13 | test:
14 | 	uv run python -m unittest discover -s test
15 | 
16 | # Remove Python cache and temp files
17 | clean:
18 | 	rm -rf __pycache__ */__pycache__ *.pyc *.pyo *.pyd temp/* test/__pycache__ cfutils/__pycache__
19 | 
20 | # Update uv.lock from pyproject.toml
21 | lock:
22 | 	uv lock
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Readthedocs](https://readthedocs.org/projects/cfutils/badge/?version=latest)](https://cfutils.readthedocs.io/en/latest/?badge=latest)
  2 | [![Pypi Releases](https://img.shields.io/pypi/v/cfutils.svg)](https://pypi.python.org/pypi/cfutils)
  3 | [![Downloads](https://static.pepy.tech/badge/cfutils)](https://pepy.tech/project/cfutils)
  4 | 
  5 | **Chromatogram File Utils**
  6 | 
  7 | For visualizing, aligning, mutation calling, trimming, etc. of Sanger sequencing data.
  8 | 
  9 | ## Demo
 10 | 
 11 | ![plot chromatogram with mutation](https://raw.githubusercontent.com/y9c/cfutils/master/data/plot.png)
 12 | 
 13 | > command to generate the demo above
 14 | 
 15 | ```bash
 16 | cfutils mut --query ./data/B5-M13R_B07.ab1 --subject ./data/ref.fa --outdir ./data/ --plot
 17 | ```
 18 | 
 19 | ## How to use?
 20 | 
 21 | - You can have mutation detection and visualization in one step using the command line.
 22 | 
 23 | ```bash
 24 | cfutils mut --help
 25 | ```
 26 | 
 27 | - You can also use it as a Python module and integrate the resulting matplotlib figures into your own plots.
 28 | 
 29 | An example:
 30 | 
 31 | ```python
 32 | import matplotlib.pyplot as plt
 33 | import numpy as np
 34 | 
 35 | from cfutils.parser import parse_abi
 36 | from cfutils.show import plot_chromatograph
 37 | 
 38 | seq = parse_abi("./data/B5-M13R_B07.ab1")
 39 | peaks = seq.annotations["peak positions"][100:131]
 40 | 
 41 | fig, axes = plt.subplots(2, 1, figsize=(12, 6), sharex=True)
 42 | plot_chromatograph(
 43 |     seq,
 44 |     region=(100, 130),
 45 |     ax=axes[0],
 46 |     show_bases=True,
 47 |     show_positions=True,
 48 |     color_map=dict(zip("ATGC", ["C0", "C2", "C1", "C4"])),
 49 | )
 50 | axes[1].bar(peaks, np.random.randn(len(peaks)), color="0.66")
 51 | plt.show()
 52 | ```
 53 | 
 54 | ![plot chromatogram in_matplotlib](https://raw.githubusercontent.com/y9c/cfutils/master/data/matplotlib_example.png)
 55 | 
 56 | ## How to install?
 57 | 
 58 | ### from PyPI
 59 | 
 60 | _(use this way ONLY, if you don't know what's going on)_
 61 | 
 62 | ```bash
 63 | pip install --user cfutils
 64 | ```
 65 | 
 66 | ### manipulate the source code
 67 | 
 68 | - clone from github
 69 | 
 70 | ```bash
 71 | git clone git@github.com:y9c/cfutils.git
 72 | ```
 73 | 
 74 | - install the dependencies
 75 | 
 76 | ```bash
 77 | make init
 78 | ```
 79 | 
 80 | - run the unit tests
 81 | 
 82 | ```bash
 83 | make test
 84 | ```
 85 | 
 86 | ## ChangeLog
 87 | 
 88 | - Reverse complement the chromatogram file. (Inspired by SnapGene)
 89 | - build as python package for pypi
 90 | - fix bug that highlighting wrong base
 91 | - replace blastn with a built-in Python aligner
 92 | 
 93 | ## TODO
 94 | 
 95 | - [ ] call mutation by alignment and plot Chromatogram graphic
 96 | - [ ] add a doc
 97 | - [x] change xaxis by peak location
 98 | - [ ] fix bug that chromatogram switch pos after trim
 99 | - [x] wrap as a cli app
100 | - [ ] return quality score in output
101 | - [ ] fix issue that selected base is not in the middle
102 | - [ ] fix plot_chromatograph rendering bug
103 | 
104 | - [ ] add projection feature to make align and assemble possible
105 | 


--------------------------------------------------------------------------------
/cfutils/__init__.py:
--------------------------------------------------------------------------------
1 | # export function in show module?
2 | # from .show import plot_chromatograph
3 | 


--------------------------------------------------------------------------------
/cfutils/align.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Copyright © 2019 yech <yech1990@gmail.com>
  5 | # Distributed under terms of the MIT license.
  6 | #
  7 | # Created: 2019-05-27 20:19
  8 | 
  9 | 
 10 | """align query sequence with ref.
 11 | 
 12 | Use 1-based for all the position
 13 | """
 14 | 
 15 | from dataclasses import dataclass
 16 | from typing import List, Optional, Tuple
 17 | 
 18 | import ssw
 19 | from .parser import SeqRecord
 20 | 
 21 | from .utils import get_logger
 22 | 
 23 | LOGGER = get_logger(__name__)
 24 | 
 25 | 
 26 | @dataclass
 27 | class SitePair:
 28 |     """Object for storing one aligned reference/chromatogram base pair."""
 29 | 
 30 |     ref_pos: int  # position of this column in the reference sequence
 31 |     ref_base: str  # reference base at this column ("-" is treated as a gap by run_align)
 32 |     cf_pos: int  # position of this column in the chromatogram (query) sequence
 33 |     cf_base: str  # chromatogram base at this column ("-" is treated as a gap by run_align)
 34 |     qual_site: Optional[int] = None  # quality at cf_pos; set by align_chromatograph
 35 |     qual_local: Optional[int] = None  # mean quality around cf_pos; set by align_chromatograph
 36 | 
 37 |     def __repr__(self):
 38 |         return f"{self.ref_base}({self.ref_pos})->{self.cf_base}({self.cf_pos})"
 39 | 
 40 | 
 41 | def run_align(reference: str, query: str) -> List[SitePair]:
 42 |     """Align query against reference with ssw and pair up the aligned columns.
 43 | 
 44 |     Args:
 45 |         reference (str): The reference sequence.
 46 |         query (str): The query sequence.
 47 | 
 48 |     Returns:
 49 |         List[SitePair]: One SitePair per alignment column (qualities unset).
 50 |     """
 51 |     aligner = ssw.Aligner()
 52 |     alignment = aligner.align(reference=reference, query=query)
 53 |     results = []
 54 |     query_pos = alignment.query_begin  # NOTE(review): confirm this is 1-based as the module docstring states
 55 |     ref_pos = alignment.reference_begin  # NOTE(review): confirm this is 1-based as well
 56 |     for query_base, _, ref_base in zip(*alignment.alignment):  # NOTE(review): confirm ssw returns (query, match, reference) in this order
 57 |         results.append(
 58 |             SitePair(
 59 |                 ref_pos=ref_pos,
 60 |                 ref_base=ref_base,
 61 |                 cf_pos=query_pos,
 62 |                 cf_base=query_base,
 63 |             )
 64 |         )
 65 |         if query_base != "-":  # a gap does not consume a query position
 66 |             query_pos += 1
 67 |         if ref_base != "-":  # a gap does not consume a reference position
 68 |             ref_pos += 1
 69 |     return results
 70 | 
 71 | 
 72 | def get_quality(pos: int, query_record: SeqRecord, flank_base_num=0) -> Tuple[int, int]:
 73 |     """get quality of site and local region.
 74 | 
 75 |     change flank_base_num to number gt 0 to get mean qual within region
 76 |     """
 77 |     qual = query_record.letter_annotations["phred_quality"]
 78 |     qual_site = qual[pos - 1]
 79 |     qual_flank = qual[
 80 |         max(0, pos - 1 - flank_base_num) : min(len(qual), pos + flank_base_num)
 81 |     ]
 82 |     qual_local = int(sum(qual_flank) / len(qual_flank))
 83 |     return qual_site, qual_local
 84 | 
 85 | 
 86 | def align_chromatograph(
 87 |     query_record: SeqRecord, subject_record: SeqRecord
 88 | ) -> List[SitePair]:
 89 |     """run align.
 90 | 
 91 |     @return: list of SitePair about all sites
 92 |     """
 93 |     sitepairs = run_align(
 94 |         reference=str(subject_record.seq), query=str(query_record.seq)
 95 |     )
 96 |     LOGGER.info(f"{query_record.name}: Total aligned number: {len(sitepairs)}")
 97 |     for site in sitepairs:
 98 |         site.qual_site, site.qual_local = get_quality(
 99 |             site.cf_pos, query_record, flank_base_num=5
100 |         )
101 |         LOGGER.debug(f"{site}\tlocal:{site.qual_local}\tsite:{site.qual_site}")
102 |     return sitepairs
103 | 
104 | 
105 | def call_mutations(
106 |     query_record: SeqRecord,
107 |     subject_record: SeqRecord,
108 |     report_all_sites: bool = False,
109 | ) -> List[SitePair]:
110 |     """run align and call mutations.
111 | 
112 |     @return: list of SitePair about mutation sites
113 |     """
114 |     sitepairs = align_chromatograph(query_record, subject_record)
115 |     mutations = []
116 |     for site in sitepairs:
117 |         if report_all_sites:
118 |             mutations.append(site)
119 |             LOGGER.debug(f"Site ({site}) is reported!")
120 |         else:
121 |             if site.ref_base != site.cf_base:
122 |                 mutations.append(site)
123 |                 LOGGER.debug(f"Site ({site}) is with mutation!")
124 |     if not report_all_sites:
125 |         LOGGER.info(f"{query_record.name}: Total mutation number: {len(mutations)}")
126 |     return mutations
127 | 


--------------------------------------------------------------------------------
/cfutils/cli.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | #
 4 | # Copyright © 2019 yech <yech1990@gmail.com>
 5 | #
 6 | # Distributed under terms of the MIT license.
 7 | 
 8 | """Chromatogram File Utils.
 9 | 
10 | wrap cfutils into cli app
11 | - update in 20190405
12 | """
13 | 
14 | import click
15 | from cfutils.run import report_mutation
16 | 
17 | 
18 | @click.group()
19 | @click.option("--debug/--no-debug", default=False)
20 | def cli(debug):
21 |     """Chromatogram File Utils."""
22 |     if debug:  # the flag only prints this notice here; nothing else reads it in this function
23 |         click.echo("Debug mode is on")
24 | 
25 | 
26 | # `mut` subcommand: call mutations in an abi file against a fasta reference
27 | @cli.command()
28 | @click.option("--query", prompt="QUERY (abi file): ", help="Query file in abi format")
29 | @click.option(
30 |     "--subject",
31 |     prompt="SUBJECT (fasta file): ",
32 |     help="Subject file in fasta format as ref",
33 | )
34 | @click.option("--outdir", default=None, required=False, help="Output directory")
35 | @click.option("--outbase", default=None, required=False, help="Output basename")
36 | @click.option(
37 |     "--aligned/--mutated",  # --aligned gives True, --mutated gives False
38 |     default=False,
39 |     help="Report all aligned sites or mutation sites only",
40 | )
41 | @click.option(
42 |     "--plot/--no-plot",
43 |     default=False,
44 |     help="Generate figure of mutation in chromatogram.",
45 | )
46 | def mut(query, subject, outdir, outbase, aligned, plot):
47 |     """do mutation calling, then report in tsv and pdf."""
48 |     report_mutation(  # each CLI option is forwarded under report_mutation's keyword name
49 |         query_ab1_file=query,
50 |         subject_fasta_file=subject,
51 |         output_dir=outdir,
52 |         file_basename=outbase,
53 |         report_all_sites=aligned,
54 |         report_mut_plot=plot,
55 |     )
56 | 


--------------------------------------------------------------------------------
/cfutils/count.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Copyright © 2019 yech <yech1990@gmail.com>
  5 | #
  6 | # Distributed under terms of the MIT license.
  7 | 
  8 | """Chromatogram File Utils.
  9 | 
 10 | - update in 20190405
 11 | """
 12 | 
 13 | import re
 14 | import sys
 15 | from datetime import date
 16 | 
 17 | 
 18 | class CountMutations:
 19 |     def __init__(self):
 20 |         pass
 21 | 
 22 |     def _to_int(self, str):
 23 |         """Convert str to int if it can otherwise 0."""
 24 |         tmp = 1
 25 |         try:
 26 |             tmp = int(str)
 27 |         except ValueError:
 28 |             pass
 29 | 
 30 |         return tmp
 31 | 
 32 |     def _get_date(self, seq_def):
 33 |         """Parses a sequence definition and extracts the date."""
 34 |         parts = seq_def.split("|")
 35 | 
 36 |         # Probably no date if there is no | in the line
 37 |         if len(parts) == 1:
 38 |             return date(1800, 1, 1)
 39 | 
 40 |         try:
 41 |             y = int(parts[-4])
 42 |         except ValueError:
 43 |             print("Failed on this definition line")
 44 |             sys.exit(-1)
 45 |             m = self._to_int(parts[-3])
 46 |             d = self._to_int(parts[-2])
 47 |             dt = date(y, m, d)
 48 |             return dt
 49 | 
 50 |     def _get_gi(self, seq_def):
 51 |         """Parses genbank id out."""
 52 |         parts = seq_def.split("|")
 53 |         if len(parts) > 1:
 54 |             return parts[1]
 55 |         else:
 56 |             return seq_def
 57 | 
 58 |     def _get_name(self, seq_def):
 59 |         """Parses a sequence definition and extracts the name Assumed to be in
 60 |         the 2 column after split( '|' ) and between ()"""
 61 |         p = re.compile(r"\((.*)\)")
 62 |         parts = seq_def.split("|")
 63 |         if len(parts) > 1:
 64 |             m = p.search(parts[2])
 65 |             return m.group(1)
 66 |         else:
 67 |             return seq_def
 68 | 
 69 |     def parse(self, mutations_file, cutoff_date):
 70 |         """Parse a mutations file generated by mutalign.py."""
 71 |         fh = open(mutations_file)
 72 |         chart = []
 73 |         mut_count = 0
 74 |         last_date = None
 75 |         inc = 1
 76 |         for line in fh:
 77 |             # Start new date and set counter to 0
 78 |             if not line.startswith("Q: "):
 79 |                 # Set mutation count for last date
 80 |                 if last_date:
 81 |                     chart[-1][3] = mut_count
 82 | 
 83 |                 # Get the new date and set it as last_date
 84 |                 last_date = self._get_date(line)
 85 | 
 86 |                 if last_date < cutoff_date:
 87 |                     inc = -1
 88 |                 else:
 89 |                     inc = 1
 90 |                 name = self._get_name(line)
 91 |                 gi = self._get_gi(line)
 92 |                 chart.append([gi, last_date, name, 0])
 93 |                 mut_count = 0
 94 |             # Else Count mutations
 95 |             mut_count += inc
 96 | 
 97 |         fh.close()
 98 | 
 99 |         chart[-1][3] = mut_count
100 | 
101 |         return chart
102 | 
103 |     def get_chart(self, mutations_file, cutoff_date):
104 |         p = self.parse(mutations_file, cutoff_date)
105 |         for gi, dt, name, num in p:
106 |             print("%s,%s,%s,%s" % (gi, name, dt, num))
107 | 
108 | 
109 | def parse_date(dte):
110 |     if "-" in dte:
111 |         p = dte.split("-")
112 |     elif "/" in dte:
113 |         p = dte.split("/")
114 | 
115 |     if len(p) == 3:
116 |         return date(int(p[0]), int(p[1]), int(p[2]))
117 | 


--------------------------------------------------------------------------------
/cfutils/parser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Copyright © 2019 yech <yech1990@gmail.com>
  5 | # Distributed under terms of the MIT license.
  6 | #
  7 | # Copyright 2011 by Wibowo Arindrarto (w.arindrarto@gmail.com)
  8 | # Revisions copyright 2011 by Peter Cock.
  9 | # This code is part of the Biopython distribution and governed by its
 10 | # license. Please see the LICENSE file that should have been included
 11 | # as part of this package.
 12 | #
 13 | # Created: 2019-08-30 13:51
 14 | 
 15 | 
 16 | """parser for the ABI format.
 17 | 
 18 | Learned from Bio.SeqIO 
 19 | 
 20 | ABI is the format used by Applied Biosystem's sequencing machines to store
 21 | sequencing results.
 22 | 
 23 | For more details on the format specification, visit:
 24 | http://www.appliedbiosystem.com/support/software_community/ABIF_File_Format.pdf
 25 | """
 26 | 
 27 | import datetime
 28 | import struct
 29 | from pathlib import Path
 30 | 
 31 | # dictionary for determining which tags goes into SeqRecord annotation
 32 | # each key is tag_name + tag_number
 33 | # if a tag entry needs to be added, just add its key and its key
 34 | # for the annotations dictionary as the value
 35 | _EXTRACT = {
 36 |     "TUBE1": "sample_well",
 37 |     "DySN1": "dye",
 38 |     "GTyp1": "polymer",
 39 |     "MODL1": "machine_model",
 40 | }
 41 | # list of tags that require preprocessing before use in creating
 42 | # seqrecords
 43 | _SPCTAGS = [
 44 |     "PBAS2",  # base-called sequence
 45 |     "PCON2",  # quality values of base-called sequence
 46 |     "SMPL1",  # sample id inputted before sequencing run
 47 |     "RUND1",  # run start date
 48 |     "RUND2",  # run finish date
 49 |     "RUNT1",  # run start time
 50 |     "RUNT2",  # run finish time
 51 |     # NOTE: The following are used for trace data
 52 |     "PLOC2",  # position of peaks
 53 |     "DATA1",  # channel1 raw data
 54 |     "DATA2",  # channel2 raw data
 55 |     "DATA3",  # channel3 raw data
 56 |     "DATA4",  # channel4 raw data
 57 |     "DATA9",  # channel1 analyzed data
 58 |     "DATA10",  # channel2 analyzed data
 59 |     "DATA11",  # channel3 analyzed data
 60 |     "DATA12",  # channel4 analyzed data
 61 |     "FWO_1",  # base order for channels
 62 | ]
 63 | # dictionary for data unpacking format
 64 | _BYTEFMT = {
 65 |     1: "b",  # byte
 66 |     2: "s",  # char
 67 |     3: "H",  # word
 68 |     4: "h",  # short
 69 |     5: "i",  # long
 70 |     6: "2i",  # rational, legacy unsupported
 71 |     7: "f",  # float
 72 |     8: "d",  # double
 73 |     10: "h2B",  # date
 74 |     11: "4B",  # time
 75 |     12: "2i2b",  # thumb
 76 |     13: "B",  # bool
 77 |     14: "2h",  # point, legacy unsupported
 78 |     15: "4h",  # rect, legacy unsupported
 79 |     16: "2i",  # vPoint, legacy unsupported
 80 |     17: "4i",  # vRect, legacy unsupported
 81 |     18: "s",  # pString
 82 |     19: "s",  # cString
 83 |     20: "2i",  # tag, legacy unsupported
 84 | }
 85 | # header data structure (excluding the 4 byte ABIF marker)
 86 | _HEADFMT = ">H4sI2H3I"
 87 | # directory data structure
 88 | _DIRFMT = ">4sI2H4I"
 89 | 
 90 | 
 91 | class SeqRecord:
 92 |     def __init__(self, seq, id="", name="", description="", annotations=None, letter_annotations=None):
 93 |         self.seq = str(seq)  # Ensure sequence is stored as a string
 94 |         self.id = id
 95 |         self.name = name
 96 |         self.description = description
 97 |         self.annotations = annotations if annotations is not None else {}
 98 |         self.letter_annotations = letter_annotations if letter_annotations is not None else {}
 99 | 
100 |     def __getitem__(self, key):
101 |         new_seq = self.seq[key]
102 |         new_annotations = self.annotations.copy()
103 |         new_letter_annotations = {k: v[key] for k, v in self.letter_annotations.items()}
104 |         return SeqRecord(new_seq, self.id, self.name, self.description, new_annotations, new_letter_annotations)
105 | 
106 |     def __len__(self):
107 |         return len(self.seq)
108 | 
109 |     def __str__(self):
110 |         return self.seq  # Return the sequence string for display
111 | 
112 | 
113 | def abi_iterator(handle):
114 |     """Iterator for the Abi file format."""
115 | 
116 |     # raise exception if handle mode is not 'rb'
117 |     if hasattr(handle, "mode"):
118 |         if set("rb") != set(handle.mode.lower()):
119 |             raise ValueError("ABI files has to be opened in 'rb' mode.")
120 | 
121 |     # check if input file is a valid Abi file
122 |     handle.seek(0)
123 |     marker = handle.read(4)
124 |     if not marker:
125 |         # handle empty file gracefully
126 |         raise StopIteration
127 |     if marker != b"ABIF":
128 |         raise IOError("File should start ABIF, not %r" % marker)
129 | 
130 |     # dirty hack for handling time information
131 |     times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}
132 | 
133 |     # initialize annotations
134 |     annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))
135 | 
136 |     # parse header and extract data from directories
137 |     header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))
138 | 
139 |     for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
140 |         # stop iteration if all desired tags have been extracted
141 |         # 4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
142 |         # and seq, qual, id
143 |         # todo
144 | 
145 |         key = tag_name + str(tag_number)
146 | 
147 |         # PBAS2 is base-called sequence
148 |         if key == "PBAS2":
149 |             seq = tag_data
150 |         # PCON2 is quality values of base-called sequence
151 |         elif key == "PCON2":
152 |             qual = [ord(val) for val in tag_data]
153 |         # PLOC2 is the location of peaks
154 |         elif key == "PLOC2":
155 |             peakamps = [float(val) for val in tag_data]
156 |             annot["peak positions"] = peakamps
157 |         # DATA1-DATA4 is raw channel 1-4 output, DATA9-12 the analyzed one
158 |         elif key in ["DATA9", "DATA10", "DATA11", "DATA12"]:
159 |             rawch = [float(val) for val in tag_data]
160 |             annot["channel " + str(int(key[4:]) - 8)] = rawch
161 |         # FWO_1 is the order of channels in bases
162 |         elif key == "FWO_1":
163 |             channelorders = tag_data
164 |             annot["channels"] = channelorders
165 |         # SMPL1 is sample id entered before sequencing run
166 |         elif key == "SMPL1":
167 |             sample_id = tag_data
168 |         elif key in times:
169 |             times[key] = tag_data
170 |         else:
171 |             # extract sequence annotation as defined in _EXTRACT
172 |             if key in _EXTRACT:
173 |                 annot[_EXTRACT[key]] = tag_data
174 | 
175 |     # set time annotations
176 |     annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
177 |     annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])
178 | 
179 |     # use the file name as SeqRecord.name if available
180 |     #  try:
181 |     file_name = Path(handle.name).stem
182 |     #  except:
183 |     #  file_name = ""
184 | 
185 |     record = SeqRecord(
186 |         seq,  # Use the sequence string directly
187 |         id=sample_id,
188 |         name=file_name,
189 |         description="",
190 |         annotations=annot,
191 |         letter_annotations={"phred_quality": qual},
192 |     )
193 | 
194 |     yield record
195 | 
196 | 
197 | def _abi_parse_header(header, handle):
198 |     """Generator that returns directory contents."""
199 |     # header structure (after ABIF marker):
200 |     # file version, tag name, tag number,
201 |     # element type code, element size, number of elements
202 |     # data size, data offset, handle (not file handle)
203 |     head_elem_size = header[4]
204 |     head_elem_num = header[5]
205 |     head_offset = header[7]
206 |     index = 0
207 | 
208 |     while index < head_elem_num:
209 |         start = head_offset + index * head_elem_size
210 |         # add directory offset to tuple
211 |         # to handle directories with data size <= 4 bytes
212 |         handle.seek(start)
213 |         dir_entry = struct.unpack(_DIRFMT, handle.read(struct.calcsize(_DIRFMT))) + (
214 |             start,
215 |         )
216 |         index += 1
217 |         # only parse desired dirs
218 |         key = dir_entry[0].decode()
219 |         key += str(dir_entry[1])
220 |         if key in list(_EXTRACT.keys()) + _SPCTAGS:
221 |             tag_name = dir_entry[0].decode()
222 |             tag_number = dir_entry[1]
223 |             elem_code = dir_entry[2]
224 |             elem_num = dir_entry[4]
225 |             data_size = dir_entry[5]
226 |             data_offset = dir_entry[6]
227 |             tag_offset = dir_entry[8]
228 |             # if data size <= 4 bytes, data is stored inside tag
229 |             # so offset needs to be changed
230 |             if data_size <= 4:
231 |                 data_offset = tag_offset + 20
232 |             handle.seek(data_offset)
233 |             data = handle.read(data_size)
234 |             yield tag_name, tag_number, _parse_tag_data(elem_code, elem_num, data)
235 | 
236 | 
237 | def _abi_trim(seq_record: SeqRecord) -> SeqRecord:
238 |     """Trims the sequence using Richard Mott's modified trimming algorithm.
239 | 
240 |     seq_record - SeqRecord object to be trimmed.
241 | 
242 |     Trimmed bases are determined from their segment score, which is a
243 |     cumulative sum of each base's score. Base scores are calculated from
244 |     their quality values.
245 | 
246 |     More about the trimming algorithm:
247 |     http://www.phrap.org/phredphrap/phred.html
248 |     http://www.clcbio.com/manual/genomics/Quality_abif_trimming.html
249 |     """
250 | 
251 |     start = False  # flag for starting position of trimmed sequence
252 |     segment = 20  # minimum sequence length
253 |     trim_start = 0  # init start index
254 |     cutoff = 0.05  # default cutoff value for calculating base score
255 | 
256 |     if len(seq_record) <= segment:
257 |         return seq_record
258 |     else:
259 |         # calculate base score
260 |         score_list = [
261 |             cutoff - (10 ** (qual / -10.0))
262 |             for qual in seq_record.letter_annotations["phred_quality"]
263 |         ]
264 | 
265 |         # calculate cummulative score
266 |         # if cummulative value < 0, set it to 0
267 |         # first value is set to 0, because of the assumption that
268 |         # the first base will always be trimmed out
269 |         cummul_score = [0]
270 |         for i in range(1, len(score_list)):
271 |             score = cummul_score[-1] + score_list[i]
272 |             if score < 0:
273 |                 cummul_score.append(0)
274 |             else:
275 |                 cummul_score.append(score)
276 |                 if not start:
277 |                     # trim_start = value when cummulative score is first > 0
278 |                     trim_start = i
279 |                     start = True
280 | 
281 |         # trim_finish = index of highest cummulative score,
282 |         # marking the end of sequence segment with highest cummulative score
283 |         trim_finish = cummul_score.index(max(cummul_score))
284 |         new_record = seq_record[trim_start:trim_finish]
285 |         new_record.annotations = seq_record.annotations.copy()
286 | 
287 |         return new_record
288 | 
289 | 
290 | def _parse_tag_data(elem_code, elem_num, raw_data):
291 |     """Returns single data value.
292 | 
293 |     elem_code - What kind of data
294 |     elem_num - How many data points
295 |     raw_data - abi file object from which the tags would be unpacked
296 |     """
297 |     if elem_code in _BYTEFMT:
298 |         # because '>1s' unpack differently from '>s'
299 |         if elem_num == 1:
300 |             num = ""
301 |         else:
302 |             num = str(elem_num)
303 |         fmt = ">" + num + _BYTEFMT[elem_code]
304 | 
305 |         assert len(raw_data) == struct.calcsize(fmt)
306 |         data = struct.unpack(fmt, raw_data)
307 | 
308 |         # no need to use tuple if len(data) == 1
309 |         # also if data is date / time
310 |         if elem_code not in [10, 11] and len(data) == 1:
311 |             data = data[0]
312 | 
313 |         # account for different data types
314 |         if elem_code == 2:
315 |             return data.decode()
316 |         if elem_code == 10:
317 |             return str(datetime.date(*data))
318 |         if elem_code == 11:
319 |             return str(datetime.time(*data[:3]))
320 |         if elem_code == 13:
321 |             return bool(data)
322 |         if elem_code == 18:
323 |             return data[1:].decode()
324 |         if elem_code == 19:
325 |             return data[:-1].decode()
326 |         return data
327 |     else:
328 |         return None
329 | 
330 | 
331 | def trim_and_rescale_trace(seq):
332 |     """Trim traces to peak positions, shift to start from zero, and rescale."""
333 | 
334 |     traces = [seq.annotations["channel " + str(i)] for i in range(1, 5)]
335 |     peaks = seq.annotations["peak positions"]
336 |     n = len(peaks)
337 |     step = 1.0 * (peaks[-1] - peaks[0]) / n
338 | 
339 |     traces = [
340 |         [t for (i, t) in enumerate(trace) if peaks[0] <= i < peaks[-1]]
341 |         for trace in traces
342 |     ]
343 |     peaks = [(p - peaks[0]) / step for p in peaks]
344 | 
345 |     x = [1.0 * i / step for i in range(len(traces[0]))]
346 | 
347 |     seq.annotations["peak positions"] = peaks
348 |     for i, trace in enumerate(traces, 1):
349 |         seq.annotations["channel " + str(i)] = trace
350 |     seq.annotations["trace_x"] = x
351 |     return seq
352 | 
353 | 
354 | def rescale_trace(seq: SeqRecord) -> SeqRecord:
355 |     traces = [seq.annotations["channel " + str(i)] for i in range(1, 5)]
356 |     peaks = seq.annotations["peak positions"]
357 |     n = len(peaks)
358 |     step = 1.0 * (peaks[-1] - peaks[0]) / n
359 |     traces = [
360 |         [t for (i, t) in enumerate(trace) if peaks[0] <= i < peaks[-1]]
361 |         for trace in traces
362 |     ]
363 |     #  peaks = [(p - peaks[0]) / step for p in peaks]
364 |     peaks = [p / step for p in peaks]
365 | 
366 |     x = [1.0 * i / step for i in range(len(traces[0]))]
367 | 
368 |     seq.annotations["peak positions"] = peaks
369 |     for i, trace in enumerate(traces, 1):
370 |         seq.annotations["channel " + str(i)] = trace
371 |     seq.annotations["trace_x"] = x
372 |     return seq
373 | 
374 | 
375 | def parse_abi(filename: str) -> SeqRecord:
376 |     """Parse an ABI file from Sanger sequencing."""
377 |     with open(filename, "rb") as abifile:
378 |         seq = list(abi_iterator(abifile))[0]
379 | 
380 |     seq = rescale_trace(seq)
381 |     return seq
382 | 
383 | 
384 | def parse_fasta(filename: str) -> SeqRecord:
385 |     """parse_fasta may support other type of file in the future.
386 | 
387 |     :param filename:
388 |     :type filename: str
389 |     :rtype: SeqRecord
390 |     """
391 |     with open(filename, "r") as file:
392 |         lines = file.readlines()
393 |         id_line = lines[0].strip()
394 |         sequence = ''.join(line.strip() for line in lines[1:])
395 |         seq_id = id_line[1:] if id_line.startswith('>') else ''
396 |         return SeqRecord(sequence, id=seq_id)
397 | 


--------------------------------------------------------------------------------
/cfutils/run.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Copyright © 2019 yech <yech1990@gmail.com>
  5 | #
  6 | # Distributed under terms of the MIT license.
  7 | 
  8 | """Chromatogram File Utils.
  9 | 
 10 | do some wrap functions
 11 | """
 12 | 
 13 | import os
 14 | from datetime import datetime
 15 | from pathlib import Path
 16 | from typing import List
 17 | 
 18 | import matplotlib as mpl
 19 | import matplotlib.pyplot as plt
 20 | 
 21 | from cfutils.align import call_mutations
 22 | from cfutils.parser import parse_abi, parse_fasta, SeqRecord
 23 | from cfutils.show import annotate_mutation, highlight_base, plot_chromatograph
 24 | 
 25 | from .utils import get_logger
 26 | 
 27 | mpl.use("Agg", force=True)
 28 | 
 29 | LOGGER = get_logger(__name__)
 30 | 
 31 | 
 32 | def do_mutation_showing(query_record: SeqRecord, mutations: List, output_fig_file: str) -> None:
 33 |     """report mutations in pdf format."""
 34 |     min_base_qual = 50
 35 |     min_local_qual = 20
 36 | 
 37 |     mutations = sorted(mutations, key=lambda x: x.cf_pos)
 38 |     flanking_size = 6
 39 |     windows_size = 30
 40 |     mutation_windows = []
 41 |     start_pos = max(1, mutations[0].cf_pos - flanking_size)
 42 |     mutation_region = []
 43 |     for idx, mut in enumerate(mutations):
 44 |         if mut.cf_pos + flanking_size <= start_pos + windows_size:
 45 |             mutation_region.append(mut)
 46 |         else:
 47 |             mutation_windows.append(mutation_region)
 48 |             start_pos = max(1, mutations[idx].cf_pos - flanking_size)
 49 |             mutation_region = [mut]
 50 |     mutation_windows.append(mutation_region)
 51 | 
 52 |     fig, axes = plt.subplots(
 53 |         len(mutation_windows), figsize=(20, 5 * len(mutation_windows))
 54 |     )
 55 |     for idx, mutation_region in enumerate(mutation_windows):
 56 |         if len(mutation_windows) == 1:
 57 |             ax = axes
 58 |         else:
 59 |             ax = axes[idx]
 60 |         region_start = max(1, mutation_region[0].cf_pos - flanking_size)
 61 |         plot_chromatograph(
 62 |             query_record,
 63 |             region=(region_start, region_start + windows_size),
 64 |             ax=ax,
 65 |         )
 66 |         for mut in mutation_region:
 67 |             base_passed = (
 68 |                 mut.qual_site is not None
 69 |                 and mut.qual_site >= min_base_qual
 70 |                 and mut.qual_local is not None
 71 |                 and mut.qual_local >= min_local_qual
 72 |             )
 73 |             highlight_base(mut.cf_pos, query_record, ax, passed_filter=base_passed)
 74 |             annotate_mutation(mut, query_record, ax)
 75 |     fig.savefig(output_fig_file, bbox_inches="tight")
 76 | 
 77 | 
 78 | def report_mutation(
 79 |     query_ab1_file,
 80 |     subject_fasta_file,
 81 |     output_dir=None,
 82 |     file_basename=None,
 83 |     report_all_sites=False,
 84 |     report_mut_plot=False,
 85 | ):
 86 |     """reprot mutation within region."""
 87 |     if output_dir is None:
 88 |         output_dir = os.path.join(
 89 |             os.getcwd(),
 90 |             "CFresult_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
 91 |         )
 92 |     os.makedirs(output_dir, exist_ok=True)
 93 | 
 94 |     if file_basename is None:
 95 |         file_basename = (
 96 |             Path(query_ab1_file).stem + "_vs_" + Path(subject_fasta_file).stem
 97 |         )
 98 | 
 99 |     query_record = parse_abi(query_ab1_file)
100 |     subject_record = parse_fasta(subject_fasta_file)
101 | 
102 |     sites = call_mutations(
103 |         query_record,
104 |         subject_record,
105 |         report_all_sites=report_all_sites,
106 |     )
107 |     # save mutation / alignment to tsv file
108 |     with open(os.path.join(output_dir, file_basename + ".tsv"), "w") as f_mut:
109 |         header = [
110 |             "RefLocation",
111 |             "RefBase",
112 |             "CfLocation",
113 |             "CfBase",
114 |             "SiteQual",
115 |             "LocalQual",
116 |         ]
117 |         f_mut.write("\t".join(header) + "\n")
118 |         for site in sites:
119 |             f_mut.write(
120 |                 f"{site.ref_pos}\t{site.ref_base}\t{site.cf_pos}\t{site.cf_base}\t{site.qual_site}\t{site.qual_local}\n"
121 |             )
122 | 
123 |     # do forget to filter mutation for plot
124 |     if report_all_sites:
125 |         mutations = [s for s in sites if s.ref_base != s.cf_base]
126 |         LOGGER.info(f"{query_record.name}: Mutation number for plot: {len(mutations)}")
127 |     else:
128 |         mutations = sites
129 | 
130 |     # show mutation in pdf file
131 |     if mutations and report_mut_plot:
132 |         output_fig_file = os.path.join(output_dir, file_basename + ".pdf")
133 |         do_mutation_showing(query_record, mutations, output_fig_file)
134 | 


--------------------------------------------------------------------------------
/cfutils/show.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Copyright © 2019 yech <yech1990@gmail.com>
  5 | #
  6 | # Distributed under terms of the MIT license.
  7 | 
  8 | """
  9 | Chromatogram File Utils.
 10 | 
 11 | show alignment with matplotlib
 12 | 
 13 | author:     Fabio Zanini
 14 | date:       09/12/13
 15 | content:    Plot functions for Sanger chromatographs.
 16 | modified:   By Ye Chang in 2018-05-14
 17 | """
 18 | 
 19 | from collections import defaultdict
 20 | from typing import Optional, Tuple
 21 | 
 22 | import matplotlib as mpl
 23 | import matplotlib.pyplot as plt
 24 | from matplotlib.axes import Axes
 25 | 
 26 | from .align import SitePair, align_chromatograph
 27 | from .parser import SeqRecord  # Import the custom SeqRecord class
 28 | from .utils import get_logger, reverse_complement
 29 | 
 30 | LOGGER = get_logger(__name__)
 31 | 
 32 | 
 33 | def plot_chromatograph(
 34 |     seq: SeqRecord,
 35 |     region: Optional[Tuple[int, int]] = None,
 36 |     ax: Optional[Axes] = None,
 37 |     color_map: Optional[dict] = None,
 38 |     show_bases: bool = True,
 39 |     show_positions: bool = True,
 40 |     show_rc: bool = False,
 41 | ) -> Axes:
 42 |     """
 43 |     Plot Sanger chromatograph.
 44 | 
 45 |     region: include both start and end (1-based)
 46 |     """
 47 |     if ax is None:
 48 |         ax = plt.gca()
 49 |         # _, ax = plt.subplots(1, 1, figsize=(16, 6))
 50 | 
 51 |     if seq is None:
 52 |         return ax
 53 | 
 54 |     if region is None:
 55 |         # turn into 0 based for better indexing
 56 |         region_start, region_end = 0, len(seq)
 57 |     else:
 58 |         region_start = max(region[0], 0)
 59 |         region_end = min(region[1], len(seq) - 1)
 60 | 
 61 |     _colors = defaultdict(lambda: "purple", {"A": "g", "C": "b", "G": "k", "T": "r"})
 62 |     if color_map is not None:
 63 |         _colors.update(color_map)
 64 | 
 65 |     # Get signals
 66 |     peaks = seq.annotations["peak positions"]
 67 |     trace_x = seq.annotations["trace_x"]
 68 |     traces_y = [seq.annotations["channel " + str(i)] for i in range(1, 5)]
 69 |     bases = seq.annotations["channels"]
 70 | 
 71 |     xlim_left, xlim_right = peaks[region_start] - 1, peaks[region_end] + 0.5
 72 | 
 73 |     # Ensure seq is treated as a string
 74 |     sequence_str = seq.seq
 75 | 
 76 |     # subset peak and sequence
 77 |     # TODO: this might fix the bug
 78 |     peak_start = peaks[0]
 79 |     peak_zip = [
 80 |         (p, s)
 81 |         for i, (p, s) in enumerate(zip(peaks, sequence_str))
 82 |         if region_start <= i <= region_end
 83 |     ]
 84 |     peaks, sequence_str = list(zip(*peak_zip))
 85 | 
 86 |     # subset trace_x and traces_y together
 87 |     trace_zip = [
 88 |         (x + peak_start, *ys)
 89 |         for x, *ys in zip(trace_x, *traces_y)
 90 |         if xlim_left <= x <= xlim_right
 91 |     ]
 92 |     if not trace_zip:
 93 |         return ax
 94 |     trace_x, *traces_y = list(zip(*trace_zip))
 95 | 
 96 |     # Plot traces
 97 |     trmax = max(map(max, traces_y))
 98 |     for base in bases:
 99 |         chanel_index = bases.index(base)
100 |         trace_y = [1.0 * ci / trmax for ci in traces_y[chanel_index]]
101 |         if show_rc:
102 |             base = reverse_complement(base)
103 |         ax.plot(trace_x, trace_y, color=_colors[base], lw=2, label=base)
104 |         ax.fill_between(trace_x, 0, trace_y, facecolor=_colors[base], alpha=0.125)
105 | 
106 |     # Plot bases at peak positions
107 |     if show_bases:
108 |         for i, peak in enumerate(peaks):
109 |             b = reverse_complement(sequence_str[i]) if show_rc else sequence_str[i]
110 |             ax.text(
111 |                 peak,
112 |                 -0.11,
113 |                 b,
114 |                 color=_colors[b],
115 |                 va="center",
116 |                 ha="center",
117 |                 alpha=0.66,
118 |                 fontsize="x-large",
119 |                 fontweight="bold",
120 |             )
121 |         ax.set_ylim(bottom=-0.15, top=1.05)
122 |     else:
123 |         ax.set_ylim(bottom=-0.05, top=1.05)
124 | 
125 |     #  peaks[0] - max(2, 0.02 * (peaks[-1] - peaks[0])),
126 |     #  right=peaks[-1] + max(2, 0.02 * (peaks[-1] - peaks[0])),
127 |     ax.set_xlim(xlim_left + 0.5, xlim_right)
128 | 
129 |     if show_positions:
130 |         ax.set_xticks(peaks)
131 |         ax.set_xticklabels(list(range(region_start + 1, region_end + 2)))
132 |     else:
133 |         ax.set_xticks([])
134 | 
135 |     if show_rc:
136 |         ax.invert_xaxis()
137 | 
138 |     # hide y axis
139 |     ax.set_yticklabels([])
140 |     ax.get_yaxis().set_visible(False)
141 |     # hide border
142 |     ax.spines["left"].set_visible(False)
143 |     ax.spines["right"].set_visible(False)
144 |     ax.spines["top"].set_visible(False)
145 |     # hide grid
146 |     ax.grid(False)
147 |     # set legend
148 |     ax.legend(loc="upper left", bbox_to_anchor=(0.95, 0.99))
149 |     return ax
150 | 
151 | 
152 | def show_reference(
153 |     query_record: SeqRecord,
154 |     subject_record: SeqRecord,
155 |     ax: Axes,
156 |     ref_central: Optional[int] = None,
157 | ) -> Axes:
158 |     """
159 |     show the reference of the chromatograph.
160 | 
161 |     design: if location is not proviode, do the alignment first
162 |     @param seq: input SeqRecord of ref
163 |     """
164 | 
165 |     sitepairs = align_chromatograph(query_record, subject_record)
166 |     sitepairs_indexing = {s.cf_pos: s for s in sitepairs}
167 |     cf_sites = [int(i.get_text()) for i in ax.get_xticklabels()]
168 |     matched_sitepairs = [sitepairs_indexing[pos] for pos in cf_sites]
169 |     for i, peak in enumerate(ax.get_xticks()):
170 |         ax.text(
171 |             peak,
172 |             1.05,
173 |             matched_sitepairs[i].ref_base,
174 |             color="dimgrey",
175 |             va="bottom",
176 |             ha="center",
177 |             alpha=0.85,
178 |             fontsize="xx-large",
179 |             fontweight="bold",
180 |             clip_on=False,
181 |         )
182 |         if ref_central is not None:
183 |             ref_pos = matched_sitepairs[i].ref_pos - ref_central
184 |         else:
185 |             ref_pos = matched_sitepairs[i].ref_pos
186 |         ax.text(
187 |             peak,
188 |             1.12,
189 |             ref_pos,
190 |             color="dimgrey",
191 |             va="bottom",
192 |             ha="center",
193 |             alpha=0.85,
194 |             fontsize="medium",
195 |             fontweight="normal",
196 |             clip_on=False,
197 |         )
198 |     return ax
199 | 
200 | 
201 | def highlight_base(
202 |     pos_highlight: int, seq: SeqRecord, ax: Axes, passed_filter=True
203 | ) -> Axes:
204 |     """
205 |     Highlight the area around a peak with a rectangle.
206 |     """
207 | 
208 |     peaks = seq.annotations["peak positions"]
209 |     peak = peaks[pos_highlight - 1]
210 | 
211 |     xmin, xmax = ax.get_xlim()
212 |     if not xmin <= peak < xmax:
213 |         raise ValueError("peak not within plot bounds")
214 | 
215 |     if pos_highlight == 1:
216 |         xmin = -0.5
217 |     else:
218 |         xmin = 0.5 * (peaks[pos_highlight - 1] + peaks[pos_highlight - 2])
219 | 
220 |     if pos_highlight == len(peaks):
221 |         xmax = -0.5
222 |     else:
223 |         xmax = 0.5 * (peaks[pos_highlight - 1] + peaks[pos_highlight])
224 |     ymin, ymax = ax.get_ylim()
225 | 
226 |     if passed_filter:
227 |         fcolor = "yellow"
228 |     else:
229 |         fcolor = "grey"
230 |     rec = mpl.patches.Rectangle(
231 |         (xmin, ymin),
232 |         (xmax - xmin),
233 |         (ymax - ymin),
234 |         edgecolor="none",
235 |         facecolor=fcolor,
236 |         alpha=0.3,
237 |     )
238 |     ax.add_patch(rec)
239 |     return ax
240 | 
241 | 
242 | def annotate_mutation(mut: SitePair, seq: SeqRecord, ax) -> Axes:
243 |     """
244 |     Annotate mutation pattern chromatograph position.
245 |     """
246 |     peaks = seq.annotations["peak positions"]
247 |     peak = peaks[mut.cf_pos - 1]
248 |     ax.text(
249 |         peak,
250 |         0.99,
251 |         f"{mut.ref_base}{mut.ref_pos}{mut.cf_base}",
252 |         color="c",
253 |         fontsize="large",
254 |         fontweight="bold",
255 |         rotation=45,
256 |         ha="center",
257 |         va="center",
258 |     )
259 |     return ax
260 | 


--------------------------------------------------------------------------------
/cfutils/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | #
 4 | # Copyright © 2019 yech <yech1990@gmail.com>
 5 | # Distributed under terms of the MIT license.
 6 | #
 7 | # Created: 2019-05-27 21:19
 8 | 
 9 | """shared function for package."""
10 | 
11 | import logging
12 | import sys
13 | 
14 | try:
15 |     assert sys.version_info > (3, 6)
16 | except AssertionError:
17 |     raise RuntimeError("cfutils requires Python 3.6+!")
18 | 
19 | 
20 | def get_logger(name: str) -> logging.Logger:
21 |     """global logging."""
22 |     logger: logging.Logger = logging.getLogger(name)
23 |     if not logger.handlers:
24 |         handler: logging.StreamHandler = logging.StreamHandler()
25 |         formatter: logging.Formatter = logging.Formatter(
26 |             "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
27 |         )
28 |         handler.setFormatter(formatter)
29 |         logger.addHandler(handler)
30 |         #  logger.setLevel(logging.DEBUG)
31 |         logger.setLevel(logging.INFO)
32 |     return logger
33 | 
34 | 
35 | LOGGER: logging.Logger = get_logger(__name__)
36 | 
37 | 
38 | def evenchunks(string, chunksize=10):
39 |     out = []
40 |     for i in range(0, len(string), chunksize):
41 |         end = i + chunksize
42 |         out.append(string[i:end])
43 |     return out
44 | 
45 | 
46 | def chunked_lines(string, chunksize=10, chunks_per_line=5, spacer=" "):
47 |     chunks = evenchunks(string, chunksize)
48 |     lines = []
49 |     while chunks:
50 |         lines.append(spacer.join(chunks[:chunks_per_line]))
51 |         del chunks[:chunks_per_line]
52 |     return lines
53 | 
54 | 
55 | def reverse_complement(dna):
56 |     "Return the reverse complement of a DNA sequence."
57 |     return dna.translate(str.maketrans("ATCG", "TAGC"))[::-1]
58 | 


--------------------------------------------------------------------------------
/data/B5-M13R_B07.ab1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/y9c/cfutils/a34e263d38ccba3821ff5c5b79df9988903c0ba6/data/B5-M13R_B07.ab1


--------------------------------------------------------------------------------
/data/B5-M13R_B07_vs_ref.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/y9c/cfutils/a34e263d38ccba3821ff5c5b79df9988903c0ba6/data/B5-M13R_B07_vs_ref.pdf


--------------------------------------------------------------------------------
/data/B5-M13R_B07_vs_ref.tsv:
--------------------------------------------------------------------------------
 1 | RefLocation	RefBase	CfLocation	CfBase	SiteQual	LocalQual
 2 | 20	A	66	N	52	48
 3 | 21	C	67	N	52	47
 4 | 22	G	68	N	52	45
 5 | 23	G	69	N	45	45
 6 | 24	A	70	N	35	45
 7 | 25	G	71	N	35	45
 8 | 26	A	72	N	35	43
 9 | 27	C	73	N	35	42
10 | 28	C	74	N	52	41
11 | 29	G	75	N	52	40
12 | 30	A	76	N	52	40
13 | 31	A	77	N	35	40
14 | 32	G	78	N	44	41
15 | 33	G	79	N	35	42
16 | 34	A	80	a	35	41
17 | 35	G	81	g	35	41
18 | 36	A	82	a	40	41
19 | 60	A	106	T	55	53
20 | 61	C	107	G	55	54
21 | 169	A	215	G	55	54
22 | 170	A	216	G	55	54
23 | 172	A	218	G	55	55
24 | 177	T	223	C	55	55
25 | 391	G	437	A	55	55
26 | 592	A	638	C	55	53
27 | 1037	G	1083	-	30	24
28 | 1044	C	1089	-	38	25
29 | 1068	A	1112	-	30	27
30 | 1079	G	1122	-	30	22
31 | 1089	C	1131	-	16	15
32 | 


--------------------------------------------------------------------------------
/data/data_file:
--------------------------------------------------------------------------------
1 | some data
2 | 
3 | by yc
4 | 


--------------------------------------------------------------------------------
/data/matplotlib_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/y9c/cfutils/a34e263d38ccba3821ff5c5b79df9988903c0ba6/data/matplotlib_example.png


--------------------------------------------------------------------------------
/data/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/y9c/cfutils/a34e263d38ccba3821ff5c5b79df9988903c0ba6/data/plot.png


--------------------------------------------------------------------------------
/data/ref.fa:
--------------------------------------------------------------------------------
1 | >3k
2 | CGCCTCCCTCGCGCCATCAGNNNNNNNNNNNNNNagaAGCTTACTAACCAGCCAACTAGCTGGCTAGCAGGTAAACCTGCCAGCCTGCCGGCTCAGGTGAGCCAGTTAGTAGGCAAGTAAGCTCACCTGTAGGGGCTTTGGAGCAGGTATTGGAGTACAGGTGTAGGTTGGAGTTAGCCAGTAGGTTCACCTGATTACCCTGTTATCCCTACAGGTGAGCAGGCTAGCAAGTAGGTTCCAATGCCGGCTGGTAAGCATACCAACTCCAAAGTTCACCTGCAGGTGTAGGTACCTAGGCACCTGCACCTGGGCATAGGTGCTCCTAAGCTAGCAAACCGGTACCTATACTCAGGTGAGCTAGCAAGCTCAGGTGTAGGGATAACAGGGTAATAGCTAACCTACTAGTTGGCTAACCCCAACCAATACTTAGGAGCTGGCAGGCTAGTTTACTAGCTCAGGTGCAGGTGAGTAAGTACACCTGTGCCAGTAAGCACCTAAGCCAACCAGCCCAGGTGAGCCAACTTGCTGGCAAACCTACTGGTATACCATTACCCTGTTATCCCTAAGCTGGTAAGCTTACCCCTATACTCACCTGTGCCAGCCCAGGTGAGCAAGTTGGTATACCCACCTGCAGGTGAGTAGGCTAGTAAGCTAGCTAGTATGCTAGCTGGTTAGTTTGCCGGCTGGCTCCAAAACTAGTTGGTTGGCTCAGGTGTGCCGGTTTAGGGATAACAGGGTAATTGCTCCTACAGGTGAGTAGGCTTACCAGCTCAGGTGAGCAAGCTTGCTCCAATAGGTAGGTTGGAGCATGCCAGTTAGCTTTGGAGCTCAGGTGAGTTTGCCAGTAGGTAAACTAGTATACTTGCTAGCTGGCAAGCCGGTTAGTAGGCTCCTAATTACCCTGTTATCCCTACCAAAACCTGCCCCTAAGCTAGTATAGGAGCCGGTTAGCCAACCAGTACCAACCTAAGCACACCTGAGCTAGCAAACTAGTACCTATACTTGCCAGCAGGCTAGCTTACCAGTAAGTAGGCACAGGTGTGCCCCTAAGCCAGCTGGCAAGCTTAGGGATAACAGGGTAATGGCTGGCTTGCCAGCAGGTTTACCAACTAACCTAGGAACCAACTAACTTGCTCCAAAGCAAGCAAACTCACCTGGGCATGCCCCTAAGCTAGTAAACCCAGGTGAGCAGGTAGGTAAGTTTACCAGCCAACTTACCCAGGTGAACCAGTTCACCTGATTACCCTGTTATCCCTATGCTAGCATACTTGCTTGCCGGCATGCTTGCTAGTACCAAAACTAGCTGGTTGGCACAGGTGGGCTTGCTTAGGCACCTGAGCAGGCAGGCTAGTACCTAAGCCAACCGGCAAGTAAGTTAGTAGGCTCCAAAGTTCAGGTGTTGGAGTTAACTTAGGGATAACAGGGTAATAGTAGGTAGGTTAGCTGGTTAGTAAGCTTGCCTTGGAGCTTGCTAGTTTGCTAGTTTACCAACTAACCGGCAAGTTAACTTTGGCACCTGTTGGTAGGCCTAAGCTTGCCAGCCCACCTGAACCTGCCCAGGTGGGCACACCTGAGTATGCCTTGGATTACCCTGTTATCCCTAAGCACACCTGAGCAAGCTAGTACAGGTGCACCTGCAGGTGCCTACACCTGGGTAGGCTAACTCACCTGTGCCTGCCTGCTGGCACACCTGAACTGGTTGGCACCTATGCCAGCTTGCCAACCGGCTTAGGTAGGTACCAGCCGGTATACTAGCTAACTAACCTAGGGATAACAGGGTAATCACCTGAGTAAACCCCTAGGTAAGTACAGGTGTACCAGCTGGTTGGTTCCAACCTAAGCTTTGGTTGGTGCCGGCTGGTTTACCGGTATACTCCAACACCTGAGCTGGTACCTAGGCTTACTCACCTGCAGGTGGGCTGGTACCTATGCCAACCAACCATTACCCTGTTATCCCTACACCTGTTGGAGCTTTGGCACCTGAGCACACCTGGGC
TGGCATGCTTAGGCACCTGGGTAGGCTTAGGCAGGTGAGCAGGCTAGCTGGTAGGTTAGCCGGTACACCTGAGTTTACTCAGGTGCCTAAGCTGGTTTAGGAGCTGGTATAGGGGCATTGGAGCATAGGGATAACAGGGTAATGGCTGGCAGGTTAACCAACTAACCAACTCCTAAGCCGGTAGGCTAGCTAGCATACCTGCTAGCCCCAACACCTGTACCAGCAGGCAAGCTGGCTCCTAAACTAGTACAGGTGAACCTGCCGGCTAGCTAGCTTAGGGGCTAGCCAGTAGGTTATTACCCTGTTATCCCTAAGCTAGCCTGCCAGCTCCTATGCTAGTTAGCAAGCTGGTAGGCTGGCTAGCCTGCCTACTTACCGGTTGGTAGGTAAACCCACCTGAGCATGCCGGTATGCCTAGGGGCTTGCCTGCCAGCCAACCTAGGTGCTGGCACCTATGCCTACTTAGGGATAACAGGGTAATAACTGGCTCCAACACCTGTACTAGCAAGCTTGCCAGCAAGTATAGGCACCTGAGCTAACTAGCTTAGGAACCCACCTGGGCATAGGAACCAGCTAGTTAGCTCCAAAGCTAACCCCTAGGTTGGTTTGCCAGCACACCTGTACTTACCCACCTGTACTATTACCCTGTTATCCCTAAGTTAACTCCTAAGCCCACCTGTACCAACCAGTAGGCATTGGAGTTGGCTGGTACCTAGGCTGGCTAGCCAGCTGGTAAGCAAGCAAGTTTACCCAGGTGGGCTCCTACAGGTGAGCTCCTAAGCTCACCTGGGTACCAAGGCTGGCAAGCAAGCCTAGGGATAACAGGGTAATAGCTGGCTAGTTGGTAGGCTAGCTTAGGGGCTGGCTAACCAGCAGGTAAGTAAGCACCAAAGCAGGTTGGTAAACCTTGGCAGGTGAGTTGGCTAGCTTTGGAACTAGCCAGTTTACCTAGGAACTAGTTCCTAAGCTAGTAGGTTAGTAtctacacaaggaacaaacactggatgtcactttcagttcaaattgtaacgctaatcactccgaacaggtcac
3 | 


--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | cfutils.yech.science


--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
 1 | remote_theme: just-the-docs/just-the-docs
 2 | ga_tracking: G-EFHY8WFMHK
 3 | ga_tracking_anonymize_ip: true
 4 | color_scheme: light
 5 | favicon_ico: '/favicon.ico'
 6 | footer_content: 'Copyright &copy; 2021-2024, Chang Ye'
 7 | 
 8 | callouts:
 9 |   warning:
10 |     title: Warning
11 |     color: red
12 |   note:
13 |     title: Note
14 |     color: green
15 | 
16 | plugins:
17 |   - jekyll-spaceship
18 | 


--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
 1 | # API Reference
 2 | 
 3 | ## cfutils.run.report_mutation
 4 | 
 5 | ```python
 6 | def report_mutation(
 7 |     query_ab1_file: str,
 8 |     subject_fasta_file: str,
 9 |     output_dir: Optional[str] = None,
10 |     file_basename: Optional[str] = None,
11 |     report_all_sites: bool = False,
12 |     report_mut_plot: bool = False,
13 | ) -> None:
14 |     """Report mutation within region, optionally generate plot and TSV output."""
15 | ```
16 | 
17 | - **query_ab1_file**: Path to the ABI file (Sanger sequencing data)
18 | - **subject_fasta_file**: Path to the reference FASTA file
19 | - **output_dir**: Output directory for results (optional)
20 | - **file_basename**: Output file basename (optional)
21 | - **report_all_sites**: If True, report all aligned sites; otherwise, only mutation sites
22 | - **report_mut_plot**: If True, generate a PDF plot of the mutation region
23 | 
24 | ---
25 | 
26 | ## cfutils.parser.parse_abi
27 | 
28 | ```python
29 | def parse_abi(path: str) -> SeqRecord:
30 |     """Parse an ABI file and return a SeqRecord object."""
31 | ```
32 | 
33 | ---
34 | 
35 | ## cfutils.show.plot_chromatograph
36 | 
37 | ```python
38 | def plot_chromatograph(
39 |     seq: SeqRecord,
40 |     region: Optional[Tuple[int, int]] = None,
41 |     ax: Optional[matplotlib.axes.Axes] = None,
42 |     show_bases: bool = True,
43 |     show_positions: bool = True,
44 |     color_map: Optional[dict] = None,
45 | ) -> matplotlib.axes.Axes:
46 |     """Plot a chromatogram for a given sequence region."""
47 | ```
48 | 
49 | ---
50 | 
51 | For more details, see the source code and docstrings in each module.
52 | 


--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
 1 | # Command Line Interface (CLI)
 2 | 
 3 | ## Detect and visualize mutations in one step
 4 | 
 5 | ```bash
 6 | cfutils mut --query ./data/B5-M13R_B07.ab1 --subject ./data/ref.fa --outdir ./data/ --plot
 7 | ```
 8 | 
 9 | ## For help on CLI options
10 | 
11 | ```bash
12 | cfutils mut --help
13 | ```
14 | 
15 | ## Running Tests
16 | 
17 | To run all tests:
18 | 
19 | ```bash
20 | make test
21 | ```
22 | 
23 | To clean up build and cache files:
24 | 
25 | ```bash
26 | make clean
27 | ```
28 | 


--------------------------------------------------------------------------------
/docs/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/y9c/cfutils/a34e263d38ccba3821ff5c5b79df9988903c0ba6/docs/favicon.ico


--------------------------------------------------------------------------------
/docs/features.md:
--------------------------------------------------------------------------------
1 | # Features
2 | 
3 | - Visualize Sanger sequencing chromatograms
4 | - Call and report mutations by alignment
5 | - Trim and rescale traces
6 | - CLI and Python API support
7 | - Export results as TSV and PDF
8 | - Highlight and annotate bases and mutations
9 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Home
 3 | layout: home
 4 | nav_order: 1
 5 | description: "Chromatogram File Utils: Sanger sequencing visualization, alignment, mutation calling, and more."
 6 | permalink: /
 7 | ---
 8 | 
 9 | # Chromatogram File Utils
10 | {: .fs-9 }
11 | 
12 | A toolkit for Sanger sequencing data: visualize chromatograms, call and report mutations, trim and rescale traces, and more.
13 | {: .fs-6 .fw-300 }
14 | 
15 | [Get Started](installation.md){: .btn .btn-primary .fs-5 .mb-4 .mb-md-0 .mr-2 }
16 | [View on GitHub](https://github.com/y9c/cfutils){: .btn .fs-5 .mb-4 .mb-md-0 }
17 | 
18 | ---
19 | 
20 | {: .warning }
21 | > This documentation is for the latest development version of cfutils. For stable releases, see the [GitHub Releases](https://github.com/y9c/cfutils/releases).
22 | 
23 | cfutils is a Python toolkit and CLI for Sanger sequencing data analysis. It supports:
24 | 
25 | - Chromatogram visualization
26 | - Mutation calling and reporting
27 | - CLI and Python API
28 | - Export to PDF/TSV
29 | - Easy integration with bioinformatics workflows
30 | 
31 | ## Quick links
32 | 
33 | - [Installation](installation.md)
34 | - [CLI Usage](cli.md)
35 | - [API Reference](api.md)
36 | 
37 | {: .note }
38 | > cfutils is open source and welcomes contributions! See the [GitHub repo](https://github.com/y9c/cfutils) for details.
39 | 
40 | ---
41 | 
42 | ## About the project
43 | 
44 | cfutils is &copy; 2019-{{ "now" | date: "%Y" }} by [Chang Ye](https://github.com/y9c).
45 | 
46 | ### License
47 | 
48 | cfutils is distributed under the [MIT license](https://github.com/y9c/cfutils/blob/master/LICENSE).
49 | 


--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | ## From PyPI
 4 | 
 5 | ```bash
 6 | pip install --user cfutils
 7 | ```
 8 | 
 9 | ## From source
10 | 
11 | ```bash
12 | git clone git@github.com:y9c/cfutils.git
13 | cd cfutils
14 | make init
15 | ```
16 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "cfutils"
 3 | version = "0.0.0.dev62"
 4 | description = "Chromatogram File Utils"
 5 | authors = [{ name = "Ye Chang", email = "yech1990@gmail.com" }]
 6 | requires-python = "~=3.9"
 7 | readme = "README.md"
 8 | license = "MIT"
 9 | keywords = ["DNA", "mutation", "chromatogram", "biology"]
10 | dependencies = [
11 |     "click>=8.0.0,<9",
12 |     "ssw>=0.4.1,<0.5",
13 |     "matplotlib>=3.9.2,<4",
14 |     "numpy>=2.0.2",
15 | ]
16 | 
17 | [project.urls]
18 | Repository = "https://github.com/y9c/cfutils"
19 | Documentation = "https://cfutils.yech.science/"
20 | 
21 | [project.scripts]
22 | cfutils = "cfutils.cli:cli"
23 | 
24 | [build-system]
25 | requires = ["hatchling"]
26 | build-backend = "hatchling.build"
27 | 
28 | [tool.hatch.build.targets.wheel]
29 | packages = ["cfutils"]
30 | 
31 | [tool.black]
32 | line-length = 79
33 | 
34 | [dependency-groups]
35 | dev = [
36 |     "pytest>=8.3.5",
37 | ]
38 | 


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/y9c/cfutils/a34e263d38ccba3821ff5c5b79df9988903c0ba6/test/__init__.py


--------------------------------------------------------------------------------
/test/__init__.py.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/y9c/cfutils/a34e263d38ccba3821ff5c5b79df9988903c0ba6/test/__init__.py.py


--------------------------------------------------------------------------------
/test/test_advance.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Unit tests for cfutils advanced mutation reporting."""
 3 | 
 4 | import unittest
 5 | import tempfile
 6 | import os
 7 | from cfutils.run import report_mutation
 8 | 
 9 | class TestFunc(unittest.TestCase):
10 |     """Test advanced mutation reporting in cfutils."""
11 | 
12 |     def test_plot_mutation(self) -> None:
13 |         """Test report_mutation with plot output enabled and check output files."""
14 |         with tempfile.TemporaryDirectory() as tmpdir:
15 |             try:
16 |                 report_mutation(
17 |                     query_ab1_file="./data/B5-M13R_B07.ab1",
18 |                     subject_fasta_file="./data/ref.fa",
19 |                     output_dir=tmpdir,
20 |                     file_basename="test",
21 |                     report_all_sites=True,
22 |                     report_mut_plot=True,
23 |                 )
24 |                 pdf_path = os.path.join(tmpdir, "test.pdf")
25 |                 tsv_path = os.path.join(tmpdir, "test.tsv")
26 |                 if not os.path.exists(tsv_path):
27 |                     print(f"Temp dir contents: {os.listdir(tmpdir)}")
28 |                 self.assertTrue(os.path.exists(tsv_path), f"Missing TSV: {tsv_path}")
29 |                 if os.path.exists(tsv_path):
30 |                     with open(tsv_path) as f:
31 |                         lines = f.readlines()
32 |                     # If there are mutations, PDF should exist
33 |                     if len(lines) > 1:
34 |                         self.assertTrue(os.path.exists(pdf_path), f"Missing PDF: {pdf_path} (but TSV has mutations)")
35 |                     else:
36 |                         if not os.path.exists(pdf_path):
37 |                             print(f"Warning: No mutations found, so PDF was not generated. TSV content: {lines}")
38 |             except Exception as e:
39 |                 self.fail(f"report_mutation raised an exception: {e}")
40 | 
41 | if __name__ == "__main__":
42 |     unittest.main()
43 | 


--------------------------------------------------------------------------------
/test/test_align.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Unit tests for cfutils alignment functions."""
 3 | 
 4 | import unittest
 5 | from cfutils.align import align_chromatograph, call_mutations
 6 | from cfutils.parser import parse_abi, parse_fasta
 7 | 
 8 | 
 9 | class TestAlignFunc(unittest.TestCase):
10 |     """Test alignment and mutation calling in cfutils.align."""
11 | 
12 |     def setUp(self) -> None:
13 |         """Load test data for alignment tests."""
14 |         self.query_record = parse_abi("./data/B5-M13R_B07.ab1")
15 |         self.subject_record = parse_fasta("./data/ref.fa")
16 | 
17 |     def test_align_chromatograph(self) -> None:
18 |         """Test align_chromatograph returns a list of site pairs."""
19 |         sitepairs = align_chromatograph(self.query_record, self.subject_record)
20 |         self.assertIsInstance(sitepairs, list)
21 |         self.assertGreater(len(sitepairs), 0, "No site pairs found.")
22 | 
23 |     def test_call_mutations(self) -> None:
24 |         """Test call_mutations returns a list of mutations."""
25 |         mutations = call_mutations(self.query_record, self.subject_record)
26 |         self.assertIsInstance(mutations, list)
27 |         # Optionally check mutation structure if known
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     unittest.main()
32 | 


--------------------------------------------------------------------------------
/test/test_basic.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Basic environment and sanity tests for cfutils."""
 3 | 
 4 | import sys
 5 | import unittest
 6 | 
 7 | 
 8 | class TestBasicFunc(unittest.TestCase):
 9 |     """Test basic environment requirements for cfutils."""
10 | 
11 |     def test_python_version(self) -> None:
12 |         """Ensure Python version is >= 3.6."""
13 |         self.assertGreaterEqual(sys.version_info, (3, 6), "Python 3.6+ is required.")
14 | 
15 | 
16 | if __name__ == "__main__":
17 |     unittest.main()
18 | 


--------------------------------------------------------------------------------
/test/test_parser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Unit tests for cfutils parser functions."""
 3 | 
 4 | import unittest
 5 | from cfutils.parser import parse_abi, parse_fasta
 6 | 
 7 | class TestParserFunc(unittest.TestCase):
 8 |     """Test parsing functions in cfutils.parser."""
 9 | 
10 |     def test_parse_abi(self) -> None:
11 |         """Test parse_abi returns a SeqRecord with expected attributes."""
12 |         record = parse_abi("./data/B5-M13R_B07.ab1")
13 |         self.assertIsNotNone(record)
14 |         self.assertTrue(hasattr(record, "seq"), "SeqRecord missing 'seq' attribute.")
15 | 
16 |     def test_parse_fasta(self) -> None:
17 |         """Test parse_fasta returns a SeqRecord with expected attributes."""
18 |         record = parse_fasta("./data/ref.fa")
19 |         self.assertIsNotNone(record)
20 |         self.assertTrue(hasattr(record, "seq"), "SeqRecord missing 'seq' attribute.")
21 | 
22 | if __name__ == "__main__":
23 |     unittest.main()


--------------------------------------------------------------------------------
/test/test_run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Unit tests for cfutils run module."""
 3 | 
 4 | import unittest
 5 | import tempfile
 6 | import os
 7 | from cfutils.run import report_mutation
 8 | 
 9 | class TestRunFunc(unittest.TestCase):
10 |     """Test report_mutation in cfutils.run."""
11 | 
12 |     def test_report_mutation(self) -> None:
13 |         """Test report_mutation creates output files without error."""
14 |         with tempfile.TemporaryDirectory() as tmpdir:
15 |             try:
16 |                 report_mutation(
17 |                     query_ab1_file="./data/B5-M13R_B07.ab1",
18 |                     subject_fasta_file="./data/ref.fa",
19 |                     output_dir=tmpdir,
20 |                     file_basename="test",
21 |                     report_all_sites=True,
22 |                     report_mut_plot=False,
23 |                 )
24 |                 tsv_path = os.path.join(tmpdir, "test.tsv")
25 |                 if not os.path.exists(tsv_path):
26 |                     print(f"Temp dir contents: {os.listdir(tmpdir)}")
27 |                 self.assertTrue(os.path.exists(tsv_path), f"Missing TSV: {tsv_path}")
28 |             except Exception as e:
29 |                 self.fail(f"report_mutation raised an exception: {e}")
30 | 
31 | if __name__ == "__main__":
32 |     unittest.main()


--------------------------------------------------------------------------------
/test/test_show.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Unit tests for cfutils show module."""
 3 | 
 4 | import unittest
 5 | import matplotlib.pyplot as plt
 6 | from cfutils.parser import parse_abi, parse_fasta
 7 | from cfutils.show import highlight_base, plot_chromatograph, annotate_mutation
 8 | from cfutils.align import SitePair
 9 | 
10 | class TestShowFunc(unittest.TestCase):
11 |     """Test visualization functions in cfutils.show."""
12 | 
13 |     def setUp(self) -> None:
14 |         """Set up test data and figure for plotting tests."""
15 |         self.query_record = parse_abi("./data/B5-M13R_B07.ab1")
16 |         self.subject_record = parse_fasta("./data/ref.fa")
17 |         self.fig, self.ax = plt.subplots(1, 1, figsize=(15, 6))
18 | 
19 |     def test_plot_chromatograph(self) -> None:
20 |         """Test plot_chromatograph function runs without error."""
21 |         plot_chromatograph(self.query_record, region=(10, 30), ax=self.ax)
22 |         self.assertTrue(True)
23 | 
24 |     def test_highlight_base(self) -> None:
25 |         """Test highlight_base overlays highlight on chromatograph."""
26 |         plot_chromatograph(self.query_record, region=(10, 20), ax=self.ax)
27 |         highlight_base(14, self.query_record, self.ax)
28 |         self.assertTrue(True)
29 | 
30 |     def test_annotate_mutation(self) -> None:
31 |         """Test annotate_mutation overlays mutation annotation."""
32 |         mutation = SitePair(ref_pos=10, ref_base='A', cf_pos=14, cf_base='T')
33 |         annotate_mutation(mutation, self.query_record, self.ax)
34 |         self.assertTrue(True)
35 | 
36 |     def tearDown(self) -> None:
37 |         """Close the matplotlib figure after each test."""
38 |         plt.close(self.fig)
39 | 
40 | if __name__ == "__main__":
41 |     unittest.main()
42 | 


--------------------------------------------------------------------------------