├── .github
    └── workflows
    │   └── publish.yaml
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── cemba_data
    ├── __init__.py
    ├── __main__.py
    ├── _yap_internal_cli_.py
    ├── bulk
    │   ├── Snakefile_template
    │   │   ├── __init__.py
    │   │   └── mc_bulk.Snakefile
    │   ├── __init__.py
    │   ├── atac_bulk.py
    │   ├── mc_bulk.py
    │   ├── mc_bulk_multigroup
    │   │   ├── __init__.py
    │   │   ├── mc_bulk_multigroup.py
    │   │   └── mc_bulk_multigroup_template.py
    │   └── mct_bulk.py
    ├── demultiplex
    │   ├── __init__.py
    │   ├── demultiplex.py
    │   ├── fastq_dataframe.py
    │   └── plateinfo_and_samplesheet.py
    ├── dmr
    │   ├── __init__.py
    │   ├── dmrseq
    │   │   ├── DMRseq.ipynb
    │   │   └── __init__.py
    │   └── dss
    │   │   ├── DSS.MultiGroup.SingleRegionDML.ipynb
    │   │   ├── DSS.TwoGroup.SingleRegionDML.ipynb
    │   │   ├── MultiGroup.py
    │   │   ├── TwoGroup.py
    │   │   └── __init__.py
    ├── files
    │   ├── V1_i7_i5_index.tsv
    │   ├── V2_i7_i5_index.tsv
    │   ├── __init__.py
    │   ├── default_config
    │   │   ├── __init__.py
    │   │   ├── mapping_config_4m.ini
    │   │   ├── mapping_config_m3c.ini
    │   │   ├── mapping_config_mc.ini
    │   │   ├── mapping_config_mct-nome.ini
    │   │   ├── mapping_config_mct.ini
    │   │   └── mapping_config_nome.ini
    │   ├── mapping_summary_template
    │   │   ├── 4m_template.ipynb
    │   │   ├── __init__.py
    │   │   ├── m3c_template.ipynb
    │   │   ├── mc_template.ipynb
    │   │   └── mct_template.ipynb
    │   ├── plate_info_template_v1.txt
    │   ├── plate_info_template_v2.txt
    │   ├── random_index_v1.fa
    │   ├── random_index_v2
    │   │   ├── __init__.py
    │   │   ├── random_index_v2.fa
    │   │   ├── random_index_v2.multiplex_group_1.fa
    │   │   ├── random_index_v2.multiplex_group_2.fa
    │   │   ├── random_index_v2.multiplex_group_3.fa
    │   │   ├── random_index_v2.multiplex_group_4.fa
    │   │   ├── random_index_v2.multiplex_group_5.fa
    │   │   └── random_index_v2.multiplex_group_6.fa
    │   ├── sample_sheet_header.txt
    │   ├── sbatch_template_schicluster.txt
    │   └── sbatch_template_yap.txt
    ├── hisat3n
    │   ├── __init__.py
    │   ├── cli.py
    │   ├── config
    │   │   ├── __init__.py
    │   │   ├── gcp.md
    │   │   ├── hisat-3n-build.sh
    │   │   ├── hisat3n_mapping_env.yaml
    │   │   └── vm_init.sh
    │   ├── hisat3n_general.py
    │   ├── hisat3n_m3c.py
    │   ├── hisat3n_mct.py
    │   ├── snakefile
    │   │   ├── __init__.py
    │   │   ├── m3c.smk
    │   │   ├── mc-multi.smk
    │   │   ├── mc-multi_sort_input.smk
    │   │   ├── mc.smk
    │   │   ├── mct-multi.smk
    │   │   └── mct.smk
    │   ├── stats_col_names.py
    │   ├── stats_parser.py
    │   ├── summary.py
    │   └── utilities.py
    ├── mapping
    │   ├── Snakefile_template
    │   │   ├── 4m.Snakefile
    │   │   ├── __init__.py
    │   │   ├── m3c.Snakefile
    │   │   ├── mc.Snakefile
    │   │   └── mct.Snakefile
    │   ├── __init__.py
    │   ├── config.py
    │   ├── m3c
    │   │   └── __init__.py
    │   ├── mct
    │   │   ├── __init__.py
    │   │   ├── mct_bismark_bam_filter.py
    │   │   └── mct_star_bam_filter.py
    │   ├── pipelines
    │   │   ├── _4m.py
    │   │   ├── __init__.py
    │   │   ├── m3c.py
    │   │   ├── mc.py
    │   │   └── mct.py
    │   ├── stats
    │   │   ├── _4m.py
    │   │   ├── __init__.py
    │   │   ├── m3c.py
    │   │   ├── mc.py
    │   │   ├── mct.py
    │   │   ├── plate_info.py
    │   │   ├── plot.py
    │   │   └── utilities.py
    │   └── test_environment.py
    ├── qsub.py
    ├── sbatch.py
    ├── snm3C
    │   ├── __init__.py
    │   ├── prepare_dataset.py
    │   └── prepare_impute.py
    └── utilities.py
├── doc
    ├── Makefile
    ├── Mapping.ipynb
    ├── MappingSummary.ipynb
    ├── PipelineInput.ipynb
    ├── PlateInfoAndSampleSheet.ipynb
    ├── TODO_GenerateMCDS.ipynb
    ├── TODO_overview.ipynb
    ├── TechBasic.ipynb
    ├── archive
    │   └── MakeFastqDataframe.ipynb
    ├── conf.py
    ├── demultiplex.ipynb
    ├── files
    │   ├── MappingPipeline.png
    │   ├── molecularsteps.png
    │   ├── primerstructure.png
    │   ├── v1barcode.png
    │   └── v2barcode.png
    ├── index.rst
    ├── installation.ipynb
    └── make.bat
├── env.yaml
├── hisat3n_env.yml
├── pyproject.toml
├── requirements.txt
└── setup.py


/.github/workflows/publish.yaml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on:
12 |     push:
13 |         # Sequence of patterns matched against refs/tags
14 |         tags:
15 |             - "v*" # Push events to matching v*, i.e. v1.0, v20.15.10
16 | 
17 | permissions:
18 |     contents: read
19 | 
20 | jobs:
21 |     deploy:
22 |         runs-on: ubuntu-latest
23 | 
24 |         steps:
25 |             # build python package and deploy to pypi
26 |             - uses: actions/checkout@v3
27 |             - name: Set up Python
28 |               uses: actions/setup-python@v3
29 |               with:
30 |                   python-version: "3.8"
31 |             - name: Install dependencies
32 |               run: |
33 |                   python -m pip install --upgrade pip wheel twine build
34 |                   pip install build
35 |             - name: Build package
36 |               run: python -m build
37 |             - name: Publish package
38 |               uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
39 |               with:
40 |                   user: __token__
41 |                   password: ${{ secrets.PYPI_API_TOKEN_CEMBA_DATA }}
42 | 
43 |         # # build docker image and push to GCR
44 |         # - uses: actions/checkout@v3
45 |         # - uses: google-github-actions/setup-gcloud@v0
46 |         # - name: Get the version
47 |         #   id: get_tag_name
48 |         #   run: echo ::set-output name=GIT_TAG_NAME::${GITHUB_REF/refs\/tags\//}
49 |         # - uses: RafikFarhad/push-to-gcr-github-action@v4
50 |         #   with:
51 |         #     gcloud_service_key: ${{ secrets.GCLOUD_SERVICE_KEY }}
52 |         #     registry: gcr.io
53 |         #     project_id: prod-635e
54 |         #     image_name: wmb
55 |         #     image_tag: ${{ steps.get_tag_name.outputs.GIT_TAG_NAME}},latest
56 |         #     dockerfile: ./Dockerfile
57 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | .idea/
  6 | .DS_Store
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache/
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | db.sqlite3
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # Environments
 86 | .env
 87 | .venv
 88 | env/
 89 | venv/
 90 | ENV/
 91 | env.bak/
 92 | venv.bak/
 93 | 
 94 | # Spyder project settings
 95 | .spyderproject
 96 | .spyproject
 97 | 
 98 | # Rope project settings
 99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/
106 | cemba_data/_version.py
107 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 | - '3.7'
 4 | script: true
 5 | deploy:
 6 |   provider: pypi
 7 |   username: __token__
 8 |   on:
 9 |     tags: true
10 |   password:
11 |     secure: KGUWzdUpJgJGR4wOz8w3o16zscEFlRWF5Kdm3PSLTDuDY6OpNK0L4lywyMuhh178hCRZ72/5FmzXYoBXI1g2IODEvpWxmvbFe3kF8FPPD3BfgYsIsF0i4pNHPpdmIxZeuBaymf+SctVNY4o81mup7n3T05P9l8mATDOnSgP+5WLoHAk+ie7D9/H386xueGxfcKuUmzyZRlqUsjs7COXgDiG9VoyZi4KvUwlZz8+jriYjs9qL/t1rN2Mg0ZCDCzGghNDo36tnvRAX+TqGACj4xURXydCJGPx6hUPTJkbydIhGlvaVblCO8FYDsLuedUIblU5SMAUklkhh48VoR1k5+l2mxCkAOLCPYodZ2AS+wNhF5yMXbOhd4zmabw0uxfpfEVZOjcDi08YzbsRFyz5f8BuFkXwjWeaUpiNG8oj/6xZBpWzGNg5cQ+ZzqHXuavf5mzgrt+K0TxBGLfQ4san0EgbBYESkUaVWRaYt0LEhmkk58Wx27Um+C7lrl2Wxs6C0rnNXzho8jiAe2ZTHva8EhG1fJuUiLZ6YA2xobZVmZlFj/J/eEoZYRvLN1dEGhWwhcgenc/1rY1NW1mllGkGVzfvB/YqZEbk9Mo9PvNej5KLg63aoYJ0/tgL/fTdBE1S1LlisZPgFHdZ2RwkB6NxazXY2qWZQkLPqJ02aEuSDb1k=
12 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM mambaorg/micromamba:0.23.0
 2 | COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml
 3 | RUN micromamba install -y -f /tmp/env.yaml && \
 4 |     micromamba clean --all --yes
 5 | 
 6 | ARG MAMBA_DOCKERFILE_ACTIVATE=1
 7 | 
 8 | RUN yap --version
 9 | RUN allcools --version
10 | 
11 | USER root
12 | # default argument when not provided in the --build-arg
13 | # to build the image with gcp, use
14 | # docker build --build-arg gcp=true -t mapping-gcp:tag .
15 | ARG gcp
16 | RUN if [ "$gcp" = "true" ] ; then \
17 |         apt-get update && \
18 |         apt-get install -y curl gnupg && \
19 |         echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | \
20 |         tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
21 |         curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \
22 |         apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && \
23 |         apt-get update -y && \
24 |         apt-get install google-cloud-sdk -y;  \
25 |       else echo 'no gcp install';  \
26 |     fi
27 | 
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 - 2020 Hanqing Liu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include cemba_data *.ini
2 | include LICENSE.txt
3 | recursive-exclude * __pycache__
4 | recursive-include cemba_data *.txt *.tsv *.csv *.fa *Snakefile *ipynb
5 | exclude doc
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [](http://www.network-science.de/ascii/)
 2 | <pre>
 3 |  **    **     **        *******
 4 | //**  **     ****      /**////**
 5 |  //****     **//**     /**   /**
 6 |   //**     **  //**    /*******
 7 |    /**    **********   /**////
 8 |    /**   /**//////**   /**
 9 |    /**   /**     /**   /**
10 |    //    //      //    //
11 | </pre>
12 | 
13 | # YAP (Yet Another Pipeline)
14 | Pipeline(s) for mapping and cluster-level aggregation of single nucleus methylome and multi-omic datasets.
15 | Technologies supported:
16 | - snmC-seq(1/2/3)
17 | - snmCT-seq (mC + RNA)
18 | - snmC2T-seq (mC + RNA + Chromatin Accessibility)
19 | - snm3C-seq (mC + Chromatin Conformation)
20 | - any NOMe treated version of the above
21 | 
22 | [See Documentation](https://hq-1.gitbook.io/mc/)
23 | 


--------------------------------------------------------------------------------
/cemba_data/__init__.py:
--------------------------------------------------------------------------------
1 | from ._version import version as __version__
2 | 


--------------------------------------------------------------------------------
/cemba_data/bulk/Snakefile_template/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/bulk/Snakefile_template/__init__.py


--------------------------------------------------------------------------------
/cemba_data/bulk/Snakefile_template/mc_bulk.Snakefile:
--------------------------------------------------------------------------------
  1 | 
  2 | # Example (required) parameters
  3 | # merge_allc_cpu = 10
  4 | # mcg_context = 'CGN'
  5 | # mch_context = 'CHN'
  6 | # bigwig_mch_bin_size = 50
  7 | # bigwig_mcg_bin_size = 1
  8 | # chrom_size_path = 'PATH_TO_CHROM_SIZE_FILE'
  9 | # group = 'GROUP_NAME'
 10 | 
 11 | # the main rule is the final target
 12 | rule main:
 13 |     input:
 14 |         f"{group}.{mcg_context}-both.frac.bw",
 15 |         f"{group}.{mcg_context}-both.cov.bw",
 16 |         f"{group}.{mch_context}-both.frac.bw",
 17 |         f"{group}.{mch_context}-both.cov.bw",
 18 |         f"{group}.{mcg_context}-Merge.allc.tsv.gz"
 19 | 
 20 | 
 21 | # Merge ALLC
 22 | rule merge_allc:
 23 |     input:
 24 |         f"{group}.allc_paths.txt"
 25 |     output:
 26 |         allc=f"{group}.allc.tsv.gz",
 27 |         tbi=f"{group}.allc.tsv.gz.tbi"
 28 |     threads:
 29 |         max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1)))
 30 |     resources:
 31 |         mem_mb=merge_allc_cpu * 5000
 32 |     shell:
 33 |         "allcools merge-allc "
 34 |         "--allc_paths {input} "
 35 |         "--output_path {output.allc} "
 36 |         "--chrom_size_path {chrom_size_path} "
 37 |         "--cpu {threads}"
 38 | 
 39 | 
 40 | # Extract mCG ALLC for DMR calling
 41 | rule extract_allc_mcg:
 42 |     input:
 43 |         f"{group}.allc.tsv.gz"
 44 |     output:
 45 |         allc_cg=f"{group}.{mcg_context}-Merge.allc.tsv.gz",
 46 |         allc_cg_tbi=f"{group}.{mcg_context}-Merge.allc.tsv.gz.tbi"
 47 |     threads:
 48 |         1
 49 |     resources:
 50 |         mem_mb=100
 51 |     shell:
 52 |         "allcools extract-allc "
 53 |         "--allc_path {input} "
 54 |         "--output_prefix {group} "
 55 |         "--mc_contexts {mcg_context} "
 56 |         "--chrom_size_path {chrom_size_path} "
 57 |         "--strandness merge "
 58 |         "--output_format allc "
 59 |         "--cpu {threads}"
 60 | 
 61 | 
 62 | # Generate mCH BigWig files
 63 | rule bigwig_ch:
 64 |     input:
 65 |         f"{group}.allc.tsv.gz"
 66 |     output:
 67 |         f"{group}.{mch_context}-both.cov.bw",
 68 |         f"{group}.{mch_context}-both.frac.bw"
 69 |     threads:
 70 |         1
 71 |     resources:
 72 |         mem_mb=100
 73 |     shell:
 74 |         "allcools allc-to-bigwig "
 75 |         "--allc_path {input} "
 76 |         "--output_prefix {group} "
 77 |         "--bin_size {bigwig_mch_bin_size} "
 78 |         "--mc_contexts {mch_context} "
 79 |         "--chrom_size_path {chrom_size_path}"
 80 | 
 81 | 
 82 | # Generate mCG BigWig files
 83 | rule bigwig_cg:
 84 |     input:
 85 |         f"{group}.allc.tsv.gz"
 86 |     output:
 87 |         f"{group}.{mcg_context}-both.cov.bw",
 88 |         f"{group}.{mcg_context}-both.frac.bw"
 89 |     threads:
 90 |         1
 91 |     resources:
 92 |         mem_mb=100
 93 |     shell:
 94 |         "allcools allc-to-bigwig "
 95 |         "--allc_path {input} "
 96 |         "--output_prefix {group} "
 97 |         "--bin_size {bigwig_mcg_bin_size} "
 98 |         "--mc_contexts {mcg_context} "
 99 |         "--chrom_size_path {chrom_size_path}"
100 | 


--------------------------------------------------------------------------------
/cemba_data/bulk/__init__.py:
--------------------------------------------------------------------------------
1 | from .mc_bulk import prepare_mc_bulk
2 | from .atac_bulk import prepare_atac_bulk
3 | from .mct_bulk import merge_mct_cluster_bam
4 | from .mc_bulk_multigroup.mc_bulk_multigroup import merge_bulk_multigroup
5 | 


--------------------------------------------------------------------------------
/cemba_data/bulk/mc_bulk.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import pathlib
  3 | import cemba_data
  4 | 
  5 | PACKAGE_DIR = pathlib.Path(cemba_data.__path__[0])
  6 | 
  7 | 
  8 | def prepare_mc_bulk(allc_table,
  9 |                     output_dir,
 10 |                     chrom_size_path,
 11 |                     mch_context='CHN',
 12 |                     mcg_context='CGN',
 13 |                     bigwig_mch_bin_size=50,
 14 |                     bigwig_mcg_bin_size=1,
 15 |                     cpu_per_job=12,
 16 |                     total_cpu=60):
 17 |     """
 18 |     Prepare the snakefile for merging single-cell ALLC files into pseudo-bulk
 19 | 
 20 |     Parameters
 21 |     ----------
 22 |     allc_table
 23 |         Path of the allc table. The allc table is a two column tsv file.
 24 |         The first columns is the absolute ALLC file paths;
 25 |         the second column is the group name of each file.
 26 |     output_dir
 27 |         Path of the output directory, will be created if not exist.
 28 |     mch_context
 29 |         mCH contexts for generating the bigwig tracks
 30 |     mcg_context
 31 |         mCG contexts for generating the bigwig tracks and merge strand
 32 |     chrom_size_path
 33 |         Path of the chromosome size file path
 34 |     bigwig_mch_bin_size
 35 |         Bin size used to generate mCH bigwig
 36 |     bigwig_mcg_bin_size
 37 |         Bin size used to generate mCG bigwig
 38 |     cpu_per_job
 39 |         Number of CPUs to use in individual merge-allc job
 40 |     total_cpu
 41 |         Number of CPUs to use in total
 42 | 
 43 |     Returns
 44 |     -------
 45 | 
 46 |     """
 47 |     snakemake_template_path = PACKAGE_DIR / 'bulk/Snakefile_template/mc_bulk.Snakefile'
 48 |     output_dir = pathlib.Path(output_dir).absolute()
 49 |     output_dir.mkdir(exist_ok=True)
 50 | 
 51 |     merge_allc_cpu = int(cpu_per_job / 1.1)
 52 |     total_mem_mb = cpu_per_job * 5000
 53 | 
 54 |     # prepare ALLC path dict
 55 |     # allc_path to group
 56 |     if str(allc_table).endswith('csv'):
 57 |         allc_path = pd.read_csv(allc_table, index_col=0, header=None, squeeze=True)
 58 |     else:
 59 |         allc_path = pd.read_csv(allc_table, sep='\t', index_col=0, header=None, squeeze=True)
 60 |     file_not_exist = allc_path[allc_path.index.map(lambda i: not pathlib.Path(i).exists())]
 61 |     if file_not_exist.size != 0:
 62 |         path_str = "\n".join(file_not_exist.index.tolist())
 63 |         raise FileNotFoundError(f'{file_not_exist.size} files do not exist:'
 64 |                                 f'\n{path_str}')
 65 |     allc_dict = {group: paths.index.tolist() for group, paths in allc_path.groupby(allc_path)}
 66 | 
 67 |     # Prepare Snakefile
 68 |     snakemake_cmds = []
 69 |     for group, paths in allc_dict.items():
 70 |         # each group has a separate snakemake file
 71 |         group_dir = output_dir / group
 72 |         group_dir.mkdir(exist_ok=True)
 73 |         allc_list_path = group_dir / f'{group}.allc_paths.txt'
 74 |         with open(allc_list_path, 'w') as f:
 75 |             f.write('\n'.join(paths))
 76 |         snakemake_parameters = f"""
 77 | merge_allc_cpu = {merge_allc_cpu}
 78 | mch_context = '{mch_context}'
 79 | mcg_context = '{mcg_context}'
 80 | bigwig_mch_bin_size = {bigwig_mch_bin_size}
 81 | bigwig_mcg_bin_size = {bigwig_mcg_bin_size}
 82 | chrom_size_path = '{chrom_size_path}'
 83 | group = '{group}'
 84 | 
 85 | """
 86 |         with open(snakemake_template_path) as f:
 87 |             snakemake_template = f.read()
 88 |         snakemake_str = snakemake_parameters + snakemake_template
 89 |         with open(group_dir / f'Snakefile', 'w') as f:
 90 |             f.write(snakemake_str)
 91 |         snakemake_cmd = f'snakemake ' \
 92 |                         f'-d {group_dir.absolute()} ' \
 93 |                         f'--snakefile {group_dir.absolute()}/Snakefile ' \
 94 |                         f'-j {cpu_per_job} ' \
 95 |                         f'--default-resources mem_mb=100 ' \
 96 |                         f'--resources mem_mb={total_mem_mb} ' \
 97 |                         f'--rerun-incomplete'
 98 |         snakemake_cmds.append(snakemake_cmd)
 99 | 
100 |     qsub_dir = output_dir / 'qsub'
101 |     qsub_dir.mkdir(exist_ok=True)
102 |     with open(qsub_dir / 'snakemake_cmds.txt', 'w') as f:
103 |         f.write('\n'.join(snakemake_cmds))
104 |     with open(qsub_dir / 'qsub.sh', 'w') as f:
105 |         qsub_str = f"""
106 | yap qsub \
107 | --command_file_path {qsub_dir / 'snakemake_cmds.txt'} \
108 | --working_dir {qsub_dir} \
109 | --project_name merge \
110 | --total_cpu {total_cpu} \
111 | --qsub_global_parms "-pe smp={cpu_per_job};-l h_vmem=5G"
112 | """
113 |         f.write(qsub_str)
114 |         print(f'Execute this command to start pipeline:\nnohup sh {qsub_dir / "qsub.sh"} &')
115 |     return
116 | 


--------------------------------------------------------------------------------
/cemba_data/bulk/mc_bulk_multigroup/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/bulk/mc_bulk_multigroup/__init__.py


--------------------------------------------------------------------------------
/cemba_data/bulk/mc_bulk_multigroup/mc_bulk_multigroup.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | from pathlib import Path
  3 | import shutil
  4 | from .mc_bulk_multigroup_template import MERGE_TEMPLATE, MERGE_EXTRACT_TEMPLATE
  5 | 
  6 | 
  7 | '''
  8 | the group_file is in csv format with the first column as allcpath and different group the others.
  9 | the group_file needs a header.
 10 | '''
 11 | 
 12 | def merge_bulk_multigroup(group_path, output_path, chrom_size_path, 
 13 |                           n_cpu=10, elem_snakegroup_num = 50,
 14 |                           cate_snakegroup_num = 10, ):
 15 |     
 16 | 
 17 |     outdir = Path(output_path)
 18 |     outdir.mkdir(parents=True, exist_ok=True)
 19 |     shutil.copyfile(group_path, outdir/'GROUP.csv')
 20 | 
 21 |     df = pd.read_csv(group_path)
 22 |     df = df.rename(columns={df.columns[0]:'_path'})
 23 |     sample_cates = df.columns[1:]
 24 | 
 25 |     df['_elem'] = pd.factorize(df[sample_cates].astype(str).apply('-'.join, axis=1))[0]
 26 |     countdict = df['_elem'].value_counts().to_dict()
 27 |     df['_elem'] = df['_elem'].apply(lambda x: f'{x}_{countdict[x]}')
 28 | 
 29 |     elem_grp_df = df.groupby('_elem')['_path'].apply(lambda x: x.unique()).to_frame()
 30 |     elem_grp_df.index.name = '_sample'
 31 |     elem_grp_df['_cate'] = '_elem'
 32 |     elem_grp_df = elem_grp_df.reset_index()[['_cate','_sample','_path']]
 33 | 
 34 |     df = df[df.columns[1:]].drop_duplicates()
 35 | 
 36 |     df['_path'] = output_path+'/_elem/'+df['_elem']+'.allc.tsv.gz'
 37 | 
 38 |     cate_grp_df = []
 39 |     for cate in sample_cates:
 40 |         catedf = df[[cate,'_path']].groupby(cate)['_path'].apply(lambda x: x.unique()).to_frame()
 41 |         catedf['_cate'] = cate
 42 |         catedf.index.name = '_sample'
 43 |         catedf = catedf.reset_index()
 44 |         cate_grp_df.append(catedf)
 45 |     cate_grp_df = pd.concat(cate_grp_df).reset_index(drop=True)[['_cate','_sample','_path']]   
 46 | 
 47 | 
 48 |     def prepare_snakefiles(grp_df, output_path, tag, n_per_snake=None, template=MERGE_TEMPLATE):
 49 |         outdir = Path(output_path)
 50 |         snkdir = outdir/'snakefiles'
 51 |         snkdir.mkdir(exist_ok=True)
 52 | 
 53 |         for cate in grp_df['_cate'].unique():
 54 |             catedir = outdir/cate
 55 |             catedir.mkdir(exist_ok=True)
 56 | 
 57 |         for _,(cate,sample,paths) in grp_df.iterrows():
 58 |             catedir = outdir/cate
 59 |             with open(catedir/f'{sample}.pathlist','w') as f:
 60 |                 f.write('\n'.join(paths))
 61 | 
 62 |         if n_per_snake is None:
 63 |             n_per_snake = len(grp_df)
 64 | 
 65 |         snk_ids = []
 66 |         for i, snkdf in grp_df.groupby(grp_df.index%n_per_snake):
 67 |             snk_id = f'{tag}_{i}'
 68 | 
 69 |             tocp_df = snkdf[snkdf['_path'].apply(len)==1]
 70 |             tomg_df = snkdf[snkdf['_path'].apply(len)>1]
 71 | 
 72 |             with open(snkdir/f'{snk_id}.snakefile', 'w') as f:
 73 |                 f.write(
 74 | f'''merge_allc_cpu = {n_cpu}
 75 | mcg_context = 'CGN'
 76 | chrom_size_path = '{chrom_size_path}'
 77 | merge_sample_prefixes = [{','.join("'"+tomg_df['_cate']+'/'+tomg_df['_sample']+"'")}]
 78 | copy_sample_prefixes = [{','.join("'"+tocp_df['_cate']+'/'+tocp_df['_sample']+"'")}]
 79 | group = "{snk_id}"
 80 | '''
 81 |                 )
 82 |                 f.write(template)
 83 |             snk_ids.append(snk_id)
 84 | 
 85 |         return snk_ids
 86 | 
 87 |     elem_snk_ids = prepare_snakefiles(elem_grp_df, output_path, 'elem',elem_snakegroup_num, template=MERGE_TEMPLATE)
 88 |     cate_snk_ids = prepare_snakefiles(cate_grp_df, output_path, 'cate',cate_snakegroup_num, template=MERGE_EXTRACT_TEMPLATE)
 89 | 
 90 |     def prepare_commands(snake_ids):
 91 |         cmds = [f'snakemake  -d {outdir.resolve()} --snakefile {outdir.resolve()}/snakefiles/{snkid}.snakefile '
 92 |                 f'-j {n_cpu} --default-resources mem_mb=100 --resources mem_mb=1000 --rerun-incomplete' \
 93 |                 for snkid in snake_ids]
 94 |         return cmds
 95 | 
 96 |     
 97 |     
 98 |     with open(outdir/'run_snakemake_cmds_1.txt', 'w') as f:
 99 |         f.write('\n'.join(prepare_commands(elem_snk_ids)))
100 |     with open(outdir/'run_snakemake_cmds_2.txt', 'w') as f:
101 |         f.write('\n'.join(prepare_commands(cate_snk_ids)))    
102 | 


--------------------------------------------------------------------------------
/cemba_data/bulk/mc_bulk_multigroup/mc_bulk_multigroup_template.py:
--------------------------------------------------------------------------------
  1 | MERGE_TEMPLATE = '''
  2 | # Example (required) parameters
  3 | # merge_allc_cpu = 10
  4 | # chrom_size_path = 'PATH_TO_CHROM_SIZE_FILE'
  5 | # merge_sample_prefixes = '[]'
  6 | # copy_sample_prefixes = '[]'
  7 | # group = 'GROUP_NAME'
  8 | sample_prefixes = merge_sample_prefixes + copy_sample_prefixes
  9 | 
 10 | # the main rule is the final target
 11 | rule main:
 12 |     input:
 13 |         expand("{sample}.allc.tsv.gz", sample=sample_prefixes),
 14 |         expand("{sample}.allc.tsv.gz.tbi", sample=sample_prefixes),
 15 | #    output:
 16 | #        f"{group}.finished"
 17 | #    shell:
 18 | #        "date > {output}"
 19 |         
 20 | 
 21 | 
 22 | # Merge ALLC
 23 | rule merge_allc:
 24 |     input:
 25 |         "{sample}.pathlist",
 26 |     output:
 27 |         allc="{sample}.allc.tsv.gz",
 28 |         tbi="{sample}.allc.tsv.gz.tbi"
 29 |     threads:
 30 |         max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1)))
 31 |     resources:
 32 |         mem_mb=merge_allc_cpu * 5000
 33 |     run:
 34 |         if wildcards.sample in merge_sample_prefixes:
 35 |             shell("allcools merge-allc "
 36 |                   "--allc_paths {input} "
 37 |                   "--output_path {output.allc} "
 38 |                   "--chrom_size_path {chrom_size_path} "
 39 |                   "--cpu {threads}")
 40 |         else:
 41 |             shell("cp $(cat {input}) {output.allc} ;"
 42 |                   "cp $(cat {input}).tbi {output.tbi} ;")
 43 | 
 44 | '''
 45 | 
 46 | MERGE_EXTRACT_TEMPLATE = '''
 47 | # Example (required) parameters
 48 | # merge_allc_cpu = 10
 49 | # mcg_context = 'CGN'
 50 | # chrom_size_path = 'PATH_TO_CHROM_SIZE_FILE'
 51 | # merge_sample_prefixes = '[]'
 52 | # copy_sample_prefixes = '[]'
 53 | # group = 'GROUP_NAME'
 54 | sample_prefixes = merge_sample_prefixes + copy_sample_prefixes
 55 | 
 56 | # the main rule is the final target
 57 | rule main:
 58 |     input:
 59 |         expand("{sample}.{mcg_context}-Merge.allc.tsv.gz", sample=sample_prefixes, mcg_context=[mcg_context]),
 60 |         expand("{sample}.{mcg_context}-Merge.allc.tsv.gz.tbi", sample=sample_prefixes, mcg_context=[mcg_context]),
 61 | #    output:
 62 | #        f"{group}.finished"
 63 | #    shell:
 64 | #        "date > {output}"
 65 |         
 66 | 
 67 | # Merge ALLC
 68 | rule merge_allc:
 69 |     input:
 70 |         "{sample}.pathlist",
 71 |     output:
 72 |         allc="{sample}.allc.tsv.gz",
 73 |         tbi="{sample}.allc.tsv.gz.tbi"
 74 |     threads:
 75 |         max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1)))
 76 |     resources:
 77 |         mem_mb=merge_allc_cpu * 5000
 78 |     run:
 79 |         if wildcards.sample in merge_sample_prefixes:
 80 |             shell("allcools merge-allc "
 81 |                   "--allc_paths {input} "
 82 |                   "--output_path {output.allc} "
 83 |                   "--chrom_size_path {chrom_size_path} "
 84 |                   "--cpu {threads}")
 85 |         else:
 86 |             shell("cp $(cat {input}) {output.allc} ;"
 87 |                   "cp $(cat {input}).tbi {output.tbi} ;")
 88 | 
 89 | # Extract mCG ALLC for DMR calling
 90 | rule extract_allc_mcg:
 91 |     input:
 92 |         "{sample}.allc.tsv.gz"
 93 |     output:
 94 |         allc_cg="{sample}.{mcg_context}-Merge.allc.tsv.gz",
 95 |         allc_cg_tbi="{sample}.{mcg_context}-Merge.allc.tsv.gz.tbi"
 96 |     threads:
 97 |         1
 98 |     resources:
 99 |         mem_mb=100
100 |     shell:
101 |         "allcools extract-allc "
102 |         "--allc_path {input} "
103 |         "--output_prefix {wildcards.sample} "
104 |         "--mc_contexts {mcg_context} "
105 |         "--chrom_size_path {chrom_size_path} "
106 |         "--strandness merge "
107 |         "--output_format allc "
108 |         "--cpu {threads}"
109 | '''
110 | 


--------------------------------------------------------------------------------
/cemba_data/bulk/mct_bulk.py:
--------------------------------------------------------------------------------
  1 | import pysam
  2 | import pandas as pd
  3 | import glob
  4 | import subprocess
  5 | from concurrent.futures import ProcessPoolExecutor, as_completed
  6 | import os
  7 | 
  8 | 
  9 | def merge_single_bam(bam_path, cell_id_to_cluster, output_prefix, header_dict):
 10 |     header = pysam.AlignmentHeader.from_dict(header_dict)
 11 |     clusters = set(cell_id_to_cluster.values())
 12 |     cluster_read_counts = {c: 0 for c in clusters}
 13 | 
 14 |     # write reads by cluster
 15 |     with pysam.AlignmentFile(bam_path, "rb") as bam_file:
 16 |         # open BAM handles for each cluster
 17 |         cluster_handles = {}
 18 |         for cluster in clusters:
 19 |             cluster_handles[cluster] = pysam.AlignmentFile(
 20 |                 f'{output_prefix}_{cluster}.bam', "wb", header=header)
 21 | 
 22 |         for read in bam_file:
 23 |             cell_id = read.get_tag('RG')
 24 |             try:
 25 |                 cluster = cell_id_to_cluster[cell_id]
 26 |                 # this removes RG tag
 27 |                 read.set_tag('RG', None)
 28 |                 cluster_handles[cluster].write(read)
 29 |                 cluster_read_counts[cluster] += 1
 30 |             except KeyError:
 31 |                 continue
 32 | 
 33 |     # close handles
 34 |     for handle in cluster_handles.values():
 35 |         handle.close()
 36 | 
 37 |     # delete empty out_bam
 38 |     for cluster, count in cluster_read_counts.items():
 39 |         bam_path = f'{output_prefix}_{cluster}.bam'
 40 |         if count == 0:
 41 |             subprocess.run(['rm', '-rf', bam_path])
 42 |     return cluster_read_counts
 43 | 
 44 | 
 45 | def merge_mct_cluster_bam(cell_id_to_cluster_path,
 46 |                           bam_list_path,
 47 |                           output_prefix,
 48 |                           cpu=10):
 49 |     cell_id_to_cluster = pd.read_csv(
 50 |         cell_id_to_cluster_path,
 51 |         index_col=0,
 52 |         header=None,
 53 |         squeeze=True).to_dict()
 54 |     bam_paths = pd.read_csv(bam_list_path, header=None, squeeze=True).tolist()
 55 | 
 56 |     # get header
 57 |     with pysam.AlignmentFile(bam_paths[0]) as bam:
 58 |         header_dict = bam.header.as_dict()
 59 |         # remove cell specific info
 60 |         keys_to_delete = ['PG', 'RG', 'CO']
 61 |         for k in keys_to_delete:
 62 |             if k in header_dict:
 63 |                 del header_dict[k]
 64 | 
 65 |     clusters = set(cell_id_to_cluster.values())
 66 |     total_cluster_read_counts = {c: 0 for c in clusters}
 67 | 
 68 |     # merge single bam files
 69 |     with ProcessPoolExecutor(cpu) as exe:
 70 |         futures = {}
 71 |         for i, path in enumerate(bam_paths):
 72 |             f = exe.submit(merge_single_bam,
 73 |                            bam_path=path,
 74 |                            cell_id_to_cluster=cell_id_to_cluster,
 75 |                            output_prefix=f'{output_prefix}{i:06d}',
 76 |                            header_dict=header_dict)
 77 |             futures[f] = path
 78 | 
 79 |         for f in as_completed(futures):
 80 |             cluster_read_counts = f.result()
 81 |             for k, v in cluster_read_counts.items():
 82 |                 total_cluster_read_counts[k] += v
 83 | 
 84 |     # merge cluster bam files
 85 |     with ProcessPoolExecutor(cpu) as exe:
 86 |         futures = {}
 87 |         for cluster in clusters:
 88 |             chunk_paths = list(glob.glob(f'{output_prefix}*_{cluster}.bam'))
 89 |             if len(chunk_paths) == 0:
 90 |                 continue
 91 |             merge_cmd = f'samtools merge --no-PG -c -o {output_prefix}_{cluster}.bam ' \
 92 |                         f'{output_prefix}*_{cluster}.bam && ' \
 93 |                         f'samtools index {output_prefix}_{cluster}.bam'
 94 |             f = exe.submit(subprocess.run,
 95 |                            merge_cmd,
 96 |                            shell=True,
 97 |                            check=True)
 98 |             futures[f] = chunk_paths
 99 | 
100 |         for f in as_completed(futures):
101 |             chunk_paths = futures[f]
102 |             f.result()
103 |             for path in chunk_paths:
104 |                 os.unlink(path)
105 |     return
106 | 


--------------------------------------------------------------------------------
/cemba_data/demultiplex/__init__.py:
--------------------------------------------------------------------------------
1 | from .plateinfo_and_samplesheet import print_plate_info, make_sample_sheet
2 | from .demultiplex import demultiplex_pipeline, update_snakemake
3 | 


--------------------------------------------------------------------------------
/cemba_data/demultiplex/fastq_dataframe.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Generate raw FASTQ dataframe based on fixed name pattern
  3 | name pattern is based on samplesheet generated in plateinfo_and_samplesheet.py
  4 | """
  5 | 
  6 | import glob
  7 | import logging
  8 | import pathlib
  9 | 
 10 | import pandas as pd
 11 | 
 12 | # logger
 13 | log = logging.getLogger()
 14 | 
 15 | 
 16 | def _parse_v1_fastq_path(path):
 17 |     """
 18 |     UID pattern of V1 {sample_id_prefix}-{plate1}-{plate2}-{plate_pos}
 19 |     FASTQ name pattern of V1:
 20 |     {sample_id_prefix}-{plate1}-{plate2}-{plate_pos}_{internal_info}_{lane}_{read_type}_{internal_info}.fastq.gz
 21 |     """
 22 |     path = pathlib.Path(path)
 23 |     try:
 24 |         *_, plate1, plate2, multi_field = path.name.split('-')
 25 |         plate_pos, _, lane, read_type, _ = multi_field.split('_')
 26 |         try:
 27 |             assert plate_pos[0] in 'ABCDEFGH'
 28 |             assert int(plate_pos[1:]) in list(range(1, 13))
 29 |             assert lane in {'L001', 'L002', 'L003', 'L004'}
 30 |             assert read_type in {'R1', 'R2'}
 31 |             assert plate1 != plate2
 32 |         except AssertionError:
 33 |             raise ValueError
 34 |     except ValueError:
 35 |         raise ValueError(f'Found unknown name pattern in path {path}')
 36 |     name_dict = dict(plate1=plate1,
 37 |                      plate2=plate2,
 38 |                      plate_pos=plate_pos,
 39 |                      lane=lane,
 40 |                      read_type=read_type,
 41 |                      fastq_path=path,
 42 |                      uid=f'{plate1}-{plate2}-{plate_pos}')
 43 |     name_series = pd.Series(name_dict)
 44 |     return name_series
 45 | 
 46 | 
 47 | def _parse_v2_fastq_path(path):
 48 |     """
 49 |     UID pattern of V2 {sample_id_prefix}-{plate}-{multiplex_group}-{barcode_name}
 50 |     FASTQ name pattern of V1:
 51 |     {sample_id_prefix}-{plate}-{multiplex_group}-{barcode_name}_{internal_info}_{lane}_{read_type}_{internal_info}.fastq.gz
 52 |     """
 53 |     path = pathlib.Path(path)
 54 |     try:
 55 |         *_, plate, multiplex_group, multi_field = path.name.split('-')
 56 |         primer_name, _, lane, read_type, _ = multi_field.split('_')
 57 |         try:
 58 |             assert primer_name[0] in 'ABCDEFGHIJKLMNOP'
 59 |             assert int(primer_name[1:]) in list(range(1, 25))
 60 |             assert int(multiplex_group) in list(range(1, 7))
 61 |             assert lane in {'L001', 'L002', 'L003', 'L004'}
 62 |             assert read_type in {'R1', 'R2'}
 63 |         except AssertionError:
 64 |             raise ValueError
 65 |     except ValueError:
 66 |         raise ValueError(f'Found unknown name pattern in path {path}')
 67 |     name_dict = dict(plate=plate,
 68 |                      multiplex_group=multiplex_group,
 69 |                      primer_name=primer_name,
 70 |                      lane=lane,
 71 |                      read_type=read_type,
 72 |                      fastq_path=path,
 73 |                      uid=f'{plate}-{multiplex_group}-{primer_name}')
 74 |     name_series = pd.Series(name_dict)
 75 |     return name_series
 76 | 
 77 | 
 78 | def make_fastq_dataframe(file_path, barcode_version, output_path=None):
 79 |     """
 80 |     Generate fastq_dataframe for pipeline input.
 81 | 
 82 |     Parameters
 83 |     ----------
 84 |     file_path
 85 |         Accept 1. path pattern contain wildcard, 2. path list, 3. path of one file contain all the paths.
 86 |     barcode_version
 87 |         Only accept two options: 1) V1 for 8 random index; 2) V2 for 384 random index.
 88 |     output_path
 89 |         output path of the fastq dataframe
 90 |     Returns
 91 |     -------
 92 |         fastq_dataframe for pipeline input.
 93 |     """
 94 |     barcode_version = barcode_version.upper()
 95 |     if barcode_version == 'V1':
 96 |         parser = _parse_v1_fastq_path
 97 |     elif barcode_version == 'V2':
 98 |         parser = _parse_v2_fastq_path
 99 |     else:
100 |         raise ValueError(f'Primer Version can only be V1 or V2, got {barcode_version}.')
101 | 
102 |     if isinstance(file_path, str) and ('*' in file_path):
103 |         file_path = [str(pathlib.Path(p).absolute()) for p in glob.glob(file_path)]
104 |     elif isinstance(file_path, list):
105 |         pass
106 |     else:
107 |         with open(file_path) as f:
108 |             file_path = [line.strip() for line in f]
109 |     log.info(f'{len(file_path)} FASTQ file paths in input')
110 | 
111 |     fastq_data = []
112 |     for path in file_path:
113 |         name_series = parser(path)
114 |         fastq_data.append(name_series)
115 |     fastq_df = pd.DataFrame(fastq_data)
116 |     log.info(f'{fastq_df.shape[0]} valid fastq names.')
117 |     if fastq_df.shape[0] == 0:
118 |         log.info('No fastq name remained, check if the name pattern is correct.')
119 |         return None
120 | 
121 |     # make sure UID is unique
122 |     for _, df in fastq_df.groupby(['lane', 'read_type']):
123 |         if df['uid'].unique().size != df['uid'].size:
124 |             raise ValueError(f'UID column is not unique.')
125 |     if output_path is not None:
126 |         fastq_df.to_csv(output_path, index=False)
127 |     return fastq_df
128 | 


--------------------------------------------------------------------------------
/cemba_data/dmr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/dmr/__init__.py


--------------------------------------------------------------------------------
/cemba_data/dmr/dss/__init__.py:
--------------------------------------------------------------------------------
1 | from .TwoGroup import run_dss_two_group
2 | from .MultiGroup import run_dss_multi_group


--------------------------------------------------------------------------------
/cemba_data/files/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/__init__.py


--------------------------------------------------------------------------------
/cemba_data/files/default_config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/default_config/__init__.py


--------------------------------------------------------------------------------
/cemba_data/files/default_config/mapping_config_4m.ini:
--------------------------------------------------------------------------------
  1 | ; Mapping configurations
  2 | ;
  3 | ; INI format
  4 | ; [Section1]
  5 | ; KEY1 = VALUE1
  6 | ; KEY2 = VALUE2
  7 | ;
  8 | ; [Section2]
  9 | ; KEY1 = VALUE1
 10 | ; KEY2 = VALUE2
 11 | ;
 12 | ; lines start with ";" is comment.
 13 | ;
 14 | ; NOTE: Don't change any section or key names.
 15 | ; Custom keys won't work, only change value when adjust parameters.
 16 | ;
 17 | [mode]
 18 | mode = 4m
 19 | 
 20 | 
 21 | [multiplexIndex]
 22 | ; This section is for demultiplex step
 23 | ; V1: 8 random index version
 24 | ; V2: 384 random index version
 25 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE
 26 | 
 27 | 
 28 | [fastqTrim]
 29 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC
 30 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA
 31 | ; Universal illumina adapter
 32 | 
 33 | overlap = 6
 34 | ; least overlap of base and illumina adapter
 35 | 
 36 | r1_left_cut = 10
 37 | ; constant length to trim at 5 prime end, apply before quality trim.
 38 | ; Aim to cut random primer part, determined by random primer length.
 39 | ; Random primer can impact results https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/
 40 | 
 41 | r1_right_cut = 10
 42 | ; constant length to trim at 3 prime end, apply before quality trim.
 43 | 
 44 | r2_left_cut = 10
 45 | ; constant length to trim at 5 prime end, apply before quality trim.
 46 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain.
 47 | 
 48 | r2_right_cut = 10
 49 | ; constant length to trim at 3 prime end, apply before quality trim.
 50 | 
 51 | quality_threshold = 20
 52 | ; reads quality score threshold for trimming.
 53 | 
 54 | length_threshold = 30
 55 | ; reads length threshold after all trim steps.
 56 | 
 57 | total_read_pairs_min = 1
 58 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps.
 59 | 
 60 | total_read_pairs_max = 6000000
 61 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps.
 62 | 
 63 | 
 64 | [mapping reference]
 65 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR
 66 | ; reference directory of bismark
 67 | 
 68 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE
 69 | ; reference prefix for the HISAT-3N DNA mapping
 70 | 
 71 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE
 72 | ; reference prefix for the HISAT-3N RNA mapping
 73 | 
 74 | hisat3n_repeat_index_type = no-repeat
 75 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used.
 76 | ; if "no-repeat", will run hisat-3n in the normal mode.
 77 | 
 78 | 
 79 | [readSplit]
 80 | trim_on_both_end = 5
 81 | ; whether trim the unmapped reads before split.
 82 | 
 83 | split_left_size = 40
 84 | ; length of the left part of the split
 85 | 
 86 | split_right_size = 40
 87 | ; length of the right part of the split
 88 | 
 89 | split_middle_min_size = 30
 90 | ; minimum length of the middle part after the split, middle part shorter than this will not be used.
 91 | 
 92 | split_min_read_length = 30
 93 | ; minimum length of the read to perform split, read shorter than this will not be used.
 94 | 
 95 | 
 96 | [star]
 97 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR
 98 | ; reference directory of STAR
 99 | 
100 | 
101 | [bamFilter]
102 | mapq_threshold = 10
103 | ; reads MAPQ threshold
104 | 
105 | 
106 | [DNAReadsFilter]
107 | mc_rate_max_threshold = 0.5
108 | ; if read CH ratio >= mc_rate_max_threshold, skip this read
109 | 
110 | dna_cov_min_threshold = 3
111 | ; if read CH sites <= cov_min_threshold, skip this read
112 | 
113 | 
114 | [RNAReadsFilter]
115 | mc_rate_min_threshold = 0.9
116 | ; if read CH ratio <= mc_rate_min_threshold, skip this read
117 | 
118 | rna_cov_min_threshold = 3
119 | ; if read CH sites <= cov_min_threshold, skip this read
120 | 
121 | nome_flag_str = --nome
122 | ; if '--nome', will exclude GpC sites from the read-level methylation fraction calculation
123 | 
124 | 
125 | [callMethylation]
126 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA
127 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from
128 | 
129 | num_upstr_bases = 1
130 | ; number of base to include before mC
131 | ; change this to 1 for NOMe treatment to get GpCNN
132 | 
133 | num_downstr_bases = 2
134 | ; number of base to include after mC
135 | 
136 | compress_level = 5
137 | ; ALLC file compress level
138 | 
139 | mc_stat_feature = HCHN HCYN HCGN HCCC GCYN GCHN
140 | ; mC patterns to check when calculate ALLC summary
141 | 
142 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH
143 | ; alias for the above mC patterns in the summary table
144 | 
145 | 
146 | [featureCount]
147 | gtf_path = CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF
148 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference.
149 | 
150 | feature_type = gene
151 | ; type of feature to count, pass to featureCount -t parameter
152 | 
153 | id_type = gene_id
154 | ; type of feature id to use in the output file, pass to featureCount -g parameter
155 | 
156 | 
157 | [contact]
158 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH
159 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling
160 | ; chrom size file has two tab-separated columns and not header
161 | ; 1) chrom name, the same as ref fasta; 2) chrom size.
162 | 
163 | min_gap = 2500
164 | ; minimum gap distance for a read pair being considered as cis-long


--------------------------------------------------------------------------------
/cemba_data/files/default_config/mapping_config_m3c.ini:
--------------------------------------------------------------------------------
  1 | ; Mapping configurations
  2 | ;
  3 | ; INI format
  4 | ; [Section1]
  5 | ; KEY1 = VALUE1
  6 | ; KEY2 = VALUE2
  7 | ;
  8 | ; [Section2]
  9 | ; KEY1 = VALUE1
 10 | ; KEY2 = VALUE2
 11 | ;
 12 | ; lines start with ";" is comment.
 13 | ;
 14 | ; NOTE: Don't change any section or key names.
 15 | ; Custom keys won't work, only change value when adjust parameters.
 16 | ;
 17 | 
 18 | [mode]
 19 | mode = m3c
 20 | 
 21 | 
 22 | [multiplexIndex]
 23 | ; This section is for demultiplex step
 24 | ; V1: 8 random index version
 25 | ; V2: 384 random index version
 26 | ; put V1 or V2 here
 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE
 28 | 
 29 | 
 30 | [fastqTrim]
 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC
 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA
 33 | ; Universal illumina adapter
 34 | 
 35 | overlap = 6
 36 | ; least overlap of base and illumina adapter
 37 | 
 38 | r1_left_cut = 10
 39 | ; constant length to trim at 5 prime end, apply before quality trim.
 40 | ; Aim to cut random primer part, determined by random primer length.
 41 | ; Random primer can impact results, see bellow:
 42 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/
 43 | 
 44 | r1_right_cut = 10
 45 | ; constant length to trim at 3 prime end, apply before quality trim.
 46 | 
 47 | r2_left_cut = 10
 48 | ; constant length to trim at 5 prime end, apply before quality trim.
 49 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain.
 50 | 
 51 | r2_right_cut = 10
 52 | ; constant length to trim at 3 prime end, apply before quality trim.
 53 | 
 54 | quality_threshold = 20
 55 | ; reads quality score threshold for trimming.
 56 | 
 57 | length_threshold = 30
 58 | ; reads length threshold after all trim steps.
 59 | 
 60 | total_read_pairs_min = 1
 61 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps.
 62 | 
 63 | total_read_pairs_max = 6000000
 64 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps.
 65 | 
 66 | 
 67 | [mapping reference]
 68 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR
 69 | ; reference directory of bismark
 70 | 
 71 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE
 72 | ; reference prefix for the HISAT-3N DNA mapping
 73 | 
 74 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE
 75 | ; reference prefix for the HISAT-3N RNA mapping
 76 | 
 77 | hisat3n_repeat_index_type = no-repeat
 78 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used.
 79 | ; if "no-repeat", will run hisat-3n in the normal mode.
 80 | 
 81 | 
 82 | [readSplit]
 83 | trim_on_both_end = 5
 84 | ; whether trim the unmapped reads before split.
 85 | 
 86 | split_left_size = 40
 87 | ; length of the left part of the split
 88 | 
 89 | split_right_size = 40
 90 | ; length of the right part of the split
 91 | 
 92 | split_middle_min_size = 30
 93 | ; minimum length of the middle part after the split, middle part shorter than this will not be used.
 94 | 
 95 | split_min_read_length = 30
 96 | ; minimum length of the read to perform split, read shorter than this will not be used.
 97 | 
 98 | 
 99 | [bamFilter]
100 | mapq_threshold = 10
101 | ; reads MAPQ threshold
102 | 
103 | 
104 | [callMethylation]
105 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA
106 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from
107 | 
108 | num_upstr_bases = 0
109 | ; number of base to include before mC, use 0 for normal snmC, use 1 for NOMe treatment
110 | 
111 | num_downstr_bases = 2
112 | ; number of base to include after mC
113 | 
114 | compress_level = 5
115 | ; ALLC file compress level
116 | 
117 | mc_stat_feature = CHN CGN CCC
118 | ; this is based on the num_upstr_bases and num_downstr_bases
119 | ; mC patterns to check when calculate ALLC summary, separated by space
120 | 
121 | mc_stat_alias = mCH mCG mCCC
122 | ; alias for the above mC patterns in the summary table,
123 | ; separated by space and follow the same order as mc_stat_feature
124 | 
125 | 
126 | [contact]
127 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH
128 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling
129 | ; chrom size file has two tab-separated columns and not header
130 | ; 1) chrom name, the same as ref fasta; 2) chrom size.
131 | 
132 | min_gap = 2500
133 | ; minimum gap distance for a read pair being considered as cis-long


--------------------------------------------------------------------------------
/cemba_data/files/default_config/mapping_config_mc.ini:
--------------------------------------------------------------------------------
  1 | ; Mapping configurations
  2 | ;
  3 | ; INI format
  4 | ; [Section1]
  5 | ; KEY1 = VALUE1
  6 | ; KEY2 = VALUE2
  7 | ;
  8 | ; [Section2]
  9 | ; KEY1 = VALUE1
 10 | ; KEY2 = VALUE2
 11 | ;
 12 | ; lines start with ";" is comment.
 13 | ;
 14 | ; NOTE: Don't change any section or key names.
 15 | ; Custom keys won't work, only change value when adjust parameters.
 16 | ;
 17 | 
 18 | [mode]
 19 | mode = mc
 20 | 
 21 | 
 22 | [multiplexIndex]
 23 | ; This section is for demultiplex step
 24 | ; V1: 8 random index version
 25 | ; V2: 384 random index version
 26 | ; put V1 or V2 here
 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE
 28 | 
 29 | 
 30 | [fastqTrim]
 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC
 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA
 33 | ; Universal illumina adapter
 34 | 
 35 | overlap = 6
 36 | ; least overlap of base and illumina adapter
 37 | 
 38 | r1_left_cut = 10
 39 | ; constant length to trim at 5 prime end, apply before quality trim.
 40 | ; Aim to cut random primer part, determined by random primer length.
 41 | ; Random primer can impact results, see bellow:
 42 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/
 43 | 
 44 | r1_right_cut = 10
 45 | ; constant length to trim at 3 prime end, apply before quality trim.
 46 | 
 47 | r2_left_cut = 10
 48 | ; constant length to trim at 5 prime end, apply before quality trim.
 49 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain.
 50 | 
 51 | r2_right_cut = 10
 52 | ; constant length to trim at 3 prime end, apply before quality trim.
 53 | 
 54 | quality_threshold = 20
 55 | ; reads quality score threshold for trimming.
 56 | 
 57 | length_threshold = 30
 58 | ; reads length threshold after all trim steps.
 59 | 
 60 | total_read_pairs_min = 1
 61 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps.
 62 | 
 63 | total_read_pairs_max = 6000000
 64 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps.
 65 | 
 66 | 
 67 | [mapping reference]
 68 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR
 69 | ; reference directory of bismark
 70 | 
 71 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE
 72 | ; reference prefix for the HISAT-3N DNA mapping
 73 | 
 74 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE
 75 | ; reference prefix for the HISAT-3N RNA mapping
 76 | 
 77 | hisat3n_repeat_index_type = no-repeat
 78 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used.
 79 | ; if "no-repeat", will run hisat-3n in the normal mode.
 80 | 
 81 | unmapped_fastq = False
 82 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose.
 83 | 
 84 | [bamFilter]
 85 | mapq_threshold = 10
 86 | ; reads MAPQ threshold
 87 | 
 88 | 
 89 | [callMethylation]
 90 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA
 91 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from
 92 | 
 93 | num_upstr_bases = 0
 94 | ; number of base to include before mC, use 0 for normal snmC, use 1 for NOMe treatment
 95 | 
 96 | num_downstr_bases = 2
 97 | ; number of base to include after mC
 98 | 
 99 | compress_level = 5
100 | ; ALLC file compress level
101 | 
102 | mc_stat_feature = CHN CGN CCC
103 | ; this is based on the num_upstr_bases and num_downstr_bases
104 | ; mC patterns to check when calculate ALLC summary, separated by space
105 | 
106 | mc_stat_alias = mCH mCG mCCC
107 | ; alias for the above mC patterns in the summary table,
108 | ; separated by space and follow the same order as mc_stat_feature
109 | 
110 | [allcPostprocessing]
111 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH
112 | ; This file is needed when extract mCG sites from ALLC file.
113 | ; The UCSC chrom sizes file contain two tab separated columns
114 | ; the 1st column is the names of chromosomes, the names should be the same as your reference_fasta
115 | ; the 2nd column is the length of chromosomes.
116 | 


--------------------------------------------------------------------------------
/cemba_data/files/default_config/mapping_config_mct-nome.ini:
--------------------------------------------------------------------------------
  1 | ; Mapping configurations
  2 | ;
  3 | ; INI format
  4 | ; [Section1]
  5 | ; KEY1 = VALUE1
  6 | ; KEY2 = VALUE2
  7 | ;
  8 | ; [Section2]
  9 | ; KEY1 = VALUE1
 10 | ; KEY2 = VALUE2
 11 | ;
 12 | ; lines start with ";" is comment.
 13 | ;
 14 | ; NOTE: Don't change any section or key names.
 15 | ; Custom keys won't work, only change value when adjust parameters.
 16 | ;
 17 | [mode]
 18 | # for mCAT, we still using mCT mode for simplicity,
 19 | # the two differences specifically changed in this file for NOMe treatment are:
 20 | # 1. [callMethylation] num_upstr_bases = 1
 21 | # 2. [callMethylation] mc_stat_feature and mc_stat_alias changed
 22 | mode = mct
 23 | 
 24 | 
 25 | [multiplexIndex]
 26 | ; This section is for demultiplex step
 27 | ; V1: 8 random index version
 28 | ; V2: 384 random index version
 29 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE
 30 | 
 31 | 
 32 | [fastqTrim]
 33 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC
 34 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA
 35 | ; Universal illumina adapter
 36 | 
 37 | overlap = 6
 38 | ; least overlap of base and illumina adapter
 39 | 
 40 | r1_left_cut = 10
 41 | ; constant length to trim at 5 prime end, apply before quality trim.
 42 | ; Aim to cut random primer part, determined by random primer length.
 43 | ; Random primer can impact results, see bellow
 44 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/
 45 | 
 46 | r1_right_cut = 10
 47 | ; constant length to trim at 3 prime end, apply before quality trim.
 48 | 
 49 | r2_left_cut = 10
 50 | ; constant length to trim at 5 prime end, apply before quality trim.
 51 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain.
 52 | 
 53 | r2_right_cut = 10
 54 | ; constant length to trim at 3 prime end, apply before quality trim.
 55 | 
 56 | quality_threshold = 20
 57 | ; reads quality score threshold for trimming.
 58 | 
 59 | length_threshold = 30
 60 | ; reads length threshold after all trim steps.
 61 | 
 62 | total_read_pairs_min = 1
 63 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps.
 64 | 
 65 | total_read_pairs_max = 6000000
 66 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps.
 67 | 
 68 | 
 69 | [mapping reference]
 70 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR
 71 | ; reference directory of bismark
 72 | 
 73 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE
 74 | ; reference prefix for the HISAT-3N DNA mapping
 75 | 
 76 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE
 77 | ; reference prefix for the HISAT-3N RNA mapping
 78 | 
 79 | hisat3n_repeat_index_type = no-repeat
 80 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used.
 81 | ; if "no-repeat", will run hisat-3n in the normal mode.
 82 | 
 83 | unmapped_fastq = False
 84 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose.
 85 | 
 86 | 
 87 | [star]
 88 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR
 89 | ; reference directory of STAR
 90 | 
 91 | 
 92 | [bamFilter]
 93 | mapq_threshold = 10
 94 | ; reads MAPQ threshold
 95 | 
 96 | 
 97 | [DNAReadsFilter]
 98 | mc_rate_max_threshold = 0.5
 99 | ; if read CH ratio >= mc_rate_max_threshold, skip this read
100 | 
101 | dna_cov_min_threshold = 3
102 | ; if read CH sites <= cov_min_threshold, skip this read
103 | 
104 | [RNAReadsFilter]
105 | mc_rate_min_threshold = 0.9
106 | ; if read CH ratio <= mc_rate_min_threshold, skip this read
107 | 
108 | rna_cov_min_threshold = 3
109 | ; if read CH sites <= cov_min_threshold, skip this read
110 | 
111 | nome_flag_str =
112 | ; if '--nome', will exclude GpC sites from the read-level methylation fraction calculation
113 | 
114 | 
115 | [callMethylation]
116 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA
117 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from
118 | 
119 | num_upstr_bases = 1
120 | ; number of base to include before mC
121 | ; change this to 1 for NOMe treatment to get GpCNN
122 | 
123 | num_downstr_bases = 2
124 | ; number of base to include after mC
125 | 
126 | compress_level = 5
127 | ; ALLC file compress level
128 | 
129 | mc_stat_feature = HCHN HCYN HCGN HCCC GCYN GCHN
130 | ; mC patterns to check when calculate ALLC summary
131 | 
132 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH
133 | ; alias for the above mC patterns in the summary table
134 | 
135 | [featureCount]
136 | gtf_path = CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF
137 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference.
138 | 
139 | feature_type = gene
140 | ; type of feature to count, pass to featureCount -t parameter
141 | 
142 | id_type = gene_id
143 | ; type of feature id to use in the output file, pass to featureCount -g parameter
144 | 
145 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH
146 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling
147 | ; chrom size file has two tab-separated columns and not header
148 | ; 1) chrom name, the same as ref fasta; 2) chrom size.
149 | 


--------------------------------------------------------------------------------
/cemba_data/files/default_config/mapping_config_mct.ini:
--------------------------------------------------------------------------------
  1 | ; Mapping configurations
  2 | ;
  3 | ; INI format
  4 | ; [Section1]
  5 | ; KEY1 = VALUE1
  6 | ; KEY2 = VALUE2
  7 | ;
  8 | ; [Section2]
  9 | ; KEY1 = VALUE1
 10 | ; KEY2 = VALUE2
 11 | ;
 12 | ; lines start with ";" is comment.
 13 | ;
 14 | ; NOTE: Don't change any section or key names.
 15 | ; Custom keys won't work, only change value when adjust parameters.
 16 | ;
 17 | 
 18 | [mode]
 19 | mode = mct
 20 | 
 21 | 
 22 | [multiplexIndex]
 23 | ; This section is for demultiplex step
 24 | ; V1: 8 random index version
 25 | ; V2: 384 random index version
 26 | ; put V1 or V2 here
 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE
 28 | 
 29 | 
 30 | [fastqTrim]
 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC
 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA
 33 | ; Universal illumina adapter
 34 | 
 35 | overlap = 6
 36 | ; least overlap of base and illumina adapter
 37 | 
 38 | r1_left_cut = 10
 39 | ; constant length to trim at 5 prime end, apply before quality trim.
 40 | ; Aim to cut random primer part, determined by random primer length.
 41 | ; Random primer can impact results https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/
 42 | 
 43 | r1_right_cut = 10
 44 | ; constant length to trim at 3 prime end, apply before quality trim.
 45 | 
 46 | r2_left_cut = 10
 47 | ; constant length to trim at 5 prime end, apply before quality trim.
 48 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain.
 49 | 
 50 | r2_right_cut = 10
 51 | ; constant length to trim at 3 prime end, apply before quality trim.
 52 | 
 53 | quality_threshold = 20
 54 | ; reads quality score threshold for trimming.
 55 | 
 56 | length_threshold = 30
 57 | ; reads length threshold after all trim steps.
 58 | 
 59 | total_read_pairs_min = 1
 60 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps.
 61 | 
 62 | total_read_pairs_max = 6000000
 63 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps.
 64 | 
 65 | 
 66 | [mapping reference]
 67 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR
 68 | ; reference directory of bismark
 69 | 
 70 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE
 71 | ; reference prefix for the HISAT-3N DNA mapping
 72 | 
 73 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE
 74 | ; reference prefix for the HISAT-3N RNA mapping
 75 | 
 76 | hisat3n_repeat_index_type = no-repeat
 77 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used.
 78 | ; if "no-repeat", will run hisat-3n in the normal mode.
 79 | 
 80 | 
 81 | unmapped_fastq = False
 82 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose.
 83 | 
 84 | 
 85 | [star]
 86 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR
 87 | ; reference directory of STAR
 88 | 
 89 | 
 90 | [bamFilter]
 91 | mapq_threshold = 10
 92 | ; reads MAPQ threshold
 93 | 
 94 | 
 95 | [DNAReadsFilter]
 96 | mc_rate_max_threshold = 0.5
 97 | ; if read CH ratio >= mc_rate_max_threshold, skip this read
 98 | 
 99 | dna_cov_min_threshold = 3
100 | ; if read CH sites <= cov_min_threshold, skip this read
101 | 
102 | 
103 | [RNAReadsFilter]
104 | mc_rate_min_threshold = 0.9
105 | ; if read CH ratio <= mc_rate_min_threshold, skip this read
106 | 
107 | rna_cov_min_threshold = 3
108 | ; if read CH sites <= cov_min_threshold, skip this read
109 | 
110 | nome_flag_str =
111 | 
112 | [callMethylation]
113 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA
114 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from
115 | 
116 | num_upstr_bases = 0
117 | ; number of base to include before mC
118 | 
119 | num_downstr_bases = 2
120 | ; number of base to include after mC
121 | 
122 | compress_level = 5
123 | ; ALLC file compress level
124 | 
125 | mc_stat_feature = CHN CGN CCC
126 | ; mC patterns to check when calculate ALLC summary
127 | 
128 | mc_stat_alias = mCH mCG mCCC
129 | ; alias for the above mC patterns in the summary table
130 | 
131 | [featureCount]
132 | gtf_path = CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF
133 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference.
134 | 
135 | feature_type = gene
136 | ; type of feature to count, pass to featureCount -t parameter
137 | 
138 | id_type = gene_id
139 | ; type of feature id to use in the output file, pass to featureCount -g parameter
140 | 
141 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH
142 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling
143 | ; chrom size file has two tab-separated columns and not header
144 | ; 1) chrom name, the same as ref fasta; 2) chrom size.
145 | 


--------------------------------------------------------------------------------
/cemba_data/files/default_config/mapping_config_nome.ini:
--------------------------------------------------------------------------------
  1 | ; Mapping configurations
  2 | ;
  3 | ; INI format
  4 | ; [Section1]
  5 | ; KEY1 = VALUE1
  6 | ; KEY2 = VALUE2
  7 | ;
  8 | ; [Section2]
  9 | ; KEY1 = VALUE1
 10 | ; KEY2 = VALUE2
 11 | ;
 12 | ; lines start with ";" is comment.
 13 | ;
 14 | ; NOTE: Don't change any section or key names.
 15 | ; Custom keys won't work, only change value when adjust parameters.
 16 | ;
 17 | [mode]
 18 | # for NOMe treated snmC, we still using mc mode for simplicity,
 19 | # the two differences specifically changed in this file for NOMe treatment are:
 20 | # 1. [callMethylation] num_upstr_bases = 1
 21 | # 2. [callMethylation] mc_stat_feature and mc_stat_alias changed
 22 | mode = mc
 23 | 
 24 | 
 25 | [multiplexIndex]
 26 | ; This section is for demultiplex step
 27 | ; V1: 8 random index version
 28 | ; V2: 384 random index version
 29 | ; put V1 or V2 here
 30 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE
 31 | 
 32 | 
 33 | [fastqTrim]
 34 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC
 35 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA
 36 | ; Universal illumina adapter
 37 | 
 38 | overlap = 6
 39 | ; least overlap of base and illumina adapter
 40 | 
 41 | r1_left_cut = 10
 42 | ; constant length to trim at 5 prime end, apply before quality trim.
 43 | ; Aim to cut random primer part, determined by random primer length.
 44 | ; Random primer can impact results, see bellow:
 45 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/
 46 | 
 47 | r1_right_cut = 10
 48 | ; constant length to trim at 3 prime end, apply before quality trim.
 49 | 
 50 | r2_left_cut = 10
 51 | ; constant length to trim at 5 prime end, apply before quality trim.
 52 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain.
 53 | 
 54 | r2_right_cut = 10
 55 | ; constant length to trim at 3 prime end, apply before quality trim.
 56 | 
 57 | quality_threshold = 20
 58 | ; reads quality score threshold for trimming.
 59 | 
 60 | length_threshold = 30
 61 | ; reads length threshold after all trim steps.
 62 | 
 63 | total_read_pairs_min = 1
 64 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps.
 65 | 
 66 | total_read_pairs_max = 6000000
 67 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps.
 68 | 
 69 | [mapping reference]
 70 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR
 71 | ; reference directory of bismark
 72 | 
 73 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE
 74 | ; reference prefix for the HISAT-3N DNA mapping
 75 | 
 76 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE
 77 | ; reference prefix for the HISAT-3N RNA mapping
 78 | 
 79 | hisat3n_repeat_index_type = no-repeat
 80 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used.
 81 | ; if "no-repeat", will run hisat-3n in the normal mode.
 82 | 
 83 | unmapped_fastq = False
 84 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose.
 85 | 
 86 | [bamFilter]
 87 | mapq_threshold = 10
 88 | ; reads MAPQ threshold
 89 | 
 90 | 
 91 | [callMethylation]
 92 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA
 93 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from
 94 | 
 95 | num_upstr_bases = 1
 96 | ; number of base to include before mC
 97 | 
 98 | num_downstr_bases = 2
 99 | ; number of base to include after mC
100 | 
101 | compress_level = 5
102 | ; ALLC file compress level
103 | 
104 | mc_stat_feature = HCHN HCYN HCGN HCCC GCYN GCHN
105 | ; mC patterns to check when calculate ALLC summary
106 | 
107 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH
108 | ; alias for the above mC patterns in the summary table
109 | 
110 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH
111 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling
112 | ; chrom size file has two tab-separated columns and not header
113 | ; 1) chrom name, the same as ref fasta; 2) chrom size.
114 | 


--------------------------------------------------------------------------------
/cemba_data/files/mapping_summary_template/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/mapping_summary_template/__init__.py


--------------------------------------------------------------------------------
/cemba_data/files/plate_info_template_v1.txt:
--------------------------------------------------------------------------------
  1 | #                               .__
  2 | #   ___________    _____ ______ |  |   ____
  3 | #  /  ___/\__  \  /     \\____ \|  | _/ __ \
  4 | #  \___ \  / __ \|  Y Y  \  |_> >  |_\  ___/
  5 | # /____  >(____  /__|_|  /   __/|____/\___  >
  6 | #      \/      \/      \/|__|             \/
  7 | #        .__                   __  ._.
  8 | #   _____|  |__   ____   _____/  |_| |
  9 | #  /  ___/  |  \_/ __ \_/ __ \   __\ |
 10 | #  \___ \|   Y  \  ___/\  ___/|  |  \|
 11 | # /____  >___|  /\___  >\___  >__|  __
 12 | #      \/     \/     \/     \/      \/
 13 | #
 14 | # ____   ________
 15 | # \   \ /   /_   |
 16 | #  \   Y   / |   |
 17 | #   \     /  |   |
 18 | #    \___/   |___|
 19 | #
 20 | #
 21 | # PlateInfo template of single cell sequencing demultiplex
 22 | #
 23 | # This file template contain 3 sections.
 24 | #
 25 | # [CriticalInfo]
 26 | # [LibraryInfo]
 27 | # [PlateInfo]
 28 | #
 29 | # The final sample id will be values of each part concatenated by "-" in the following order
 30 | # [Values in LibraryInfo] + [Additional values in PlateInfo] + [Sample UID determined by library strategy]
 31 | #
 32 | # Empty lines and line start with "#" will be ignored. You can remove these if you understand the template.
 33 | #
 34 | 
 35 | 
 36 | # =====================================================================================================
 37 | 
 38 | [CriticalInfo]
 39 | 
 40 | # =====================================================================================================
 41 | 
 42 | # Explain:
 43 | # Every key=value pairs are required. key name can not be change.
 44 | # Some values have limited options, they are:
 45 | # n_random_index choice: 8 (V1), if your n_random_index=384, use V2 template!
 46 | # input_plate_size choice: 384
 47 | #
 48 | # Example:
 49 | # n_random_index=8
 50 | # input_plate_size=384
 51 | # pool_id=Pool_NN
 52 | # tube_label=Pool_NN_MM_AA_BB  # often times 2 libraries are pooled together on Nova-Seq, but there is no rule on this.
 53 | # email=your-email@salk.edu
 54 | #
 55 | 
 56 | # if your n_random_index=384, use V2 template!
 57 | n_random_index=8
 58 | input_plate_size=384
 59 | pool_id=
 60 | tube_label=
 61 | email=
 62 | 
 63 | 
 64 | # =====================================================================================================
 65 | 
 66 | [LibraryInfo]
 67 | 
 68 | # =====================================================================================================
 69 | #
 70 | # Explain:
 71 | # library metadata that applies to all plates
 72 | # this whole part is optional, may contain any "key=value" pairs necessary to describe the library.
 73 | # All the values will be concatenate by "-" into the sample id and present in file name. Use UNIX path safe characters.
 74 | # Any character does not belong to [a-zA-Z0-9] will be replaced by "_"
 75 | # Here are the recommended information to include, you can define your own based on your needs,
 76 | # non of these information is actually used in demultiplex or mapping:
 77 | # these keys are ALL optional, but better be consistent throughout the project.
 78 | #
 79 | # Example:
 80 | # lib_comp_date=180101
 81 | # project=CEMBA
 82 | # organism=mm
 83 | # dev_stage_age=P56
 84 | # tissue_cell_type=1A
 85 | # exp_cond=1
 86 | # bio_rep=1
 87 | # tech_rep=1
 88 | # lib_type=snmC-seq2
 89 | # sequencer=NovaSeq
 90 | # se_pe=pe
 91 | # read_length=150
 92 | #
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | # =====================================================================================================
 99 | 
100 | [PlateInfo]
101 | 
102 | # =====================================================================================================
103 | 
104 | # Explain:
105 | # Plate metadata that specific to certain plates, a tab separated table
106 | # First row must be header start with: plate_id	primer_quarter
107 | # First 2 columns are required and must be in the order of: plate_id	primer_quarter
108 | # You can add more plate specific info into additional columns, those info will be appended to LibraryInfo as part of sample_id.
109 | # All the values will be concatenate by "-" into the sample id and present in file name.
110 | # So better not to include "-" in value and use UNIX path safe characters.
111 | #
112 | # If your experiment design contain sup-plate difference (e.g. some rows come from 1 sample, some rows come from another),
113 | # you should maintain your own metadata about this and added into the mapping summary table later after mapping by yourself
114 | # Because here the plate info is just for barcode demultiplexing, so that we can get single cell data AND the plate position of each cell
115 | # with the plate position, it should be very convenient for you to add any custom information you designed in your experiment.
116 | #
117 | # primer_quarter valid values are:
118 | # Set1_Q1, Set1_Q2, Set1_Q3, Set1_Q4
119 | # SetB_Q1, SetB_Q2, SetB_Q3, SetB_Q4
120 | #
121 | # Example:
122 | # plate_id	primer_quarter
123 | # CEMBA190530_9C_1	SetB_Q1
124 | # CEMBA190530_9C_2	SetB_Q1
125 | # CEMBA190530_9C_3	SetB_Q2
126 | # CEMBA190530_9C_4	SetB_Q2
127 | # CEMBA190620_9C_1	SetB_Q3
128 | # CEMBA190620_9C_2	SetB_Q3
129 | # CEMBA190620_9C_3	SetB_Q4
130 | # CEMBA190620_9C_4	SetB_Q4
131 | #
132 | # Remember the columns MUST be separate by tab not space
133 | #
134 | 
135 | 
136 | # =====================================================================================================
137 | # if your n_random_index=384, use V2 template!
138 | # =====================================================================================================
139 | 
140 | plate_id	primer_quarter
141 | 
142 | 
143 | 
144 | 


--------------------------------------------------------------------------------
/cemba_data/files/plate_info_template_v2.txt:
--------------------------------------------------------------------------------
  1 | #                               .__
  2 | #   ___________    _____ ______ |  |   ____
  3 | #  /  ___/\__  \  /     \\____ \|  | _/ __ \
  4 | #  \___ \  / __ \|  Y Y  \  |_> >  |_\  ___/
  5 | # /____  >(____  /__|_|  /   __/|____/\___  >
  6 | #      \/      \/      \/|__|             \/
  7 | #        .__                   __  ._.
  8 | #   _____|  |__   ____   _____/  |_| |
  9 | #  /  ___/  |  \_/ __ \_/ __ \   __\ |
 10 | #  \___ \|   Y  \  ___/\  ___/|  |  \|
 11 | # /____  >___|  /\___  >\___  >__|  __
 12 | #      \/     \/     \/     \/      \/
 13 | #
 14 | # ____   ____________
 15 | # \   \ /   /\_____  \
 16 | #  \   Y   /  /  ____/
 17 | #   \     /  /       \
 18 | #    \___/   \_______ \
 19 | #                    \/
 20 | #
 21 | # PlateInfo template of single cell sequencing demultiplex
 22 | #
 23 | # This file template contain 3 sections.
 24 | #
 25 | # [CriticalInfo]
 26 | # [LibraryInfo]
 27 | # [PlateInfo]
 28 | #
 29 | # The final sample id will be values of each part concatenated by "-" in the following order
 30 | # [Values in LibraryInfo] + [Additional values in PlateInfo] + [Sample UID determined by library strategy]
 31 | #
 32 | # Empty lines and line start with "#" will be ignored. You can remove these if you understand the template.
 33 | #
 34 | 
 35 | 
 36 | # =====================================================================================================
 37 | 
 38 | [CriticalInfo]
 39 | 
 40 | # =====================================================================================================
 41 | 
 42 | # Explain:
 43 | # Every key=value pairs are required. key name can not be change.
 44 | # Some values have limited options, they are:
 45 | # n_random_index choice: 384 (V2), if your n_random_index=8, use V1 template!
 46 | # input_plate_size choice: 384
 47 | #
 48 | #
 49 | # Example:
 50 | # n_random_index=8
 51 | # input_plate_size=384
 52 | # pool_id=Pool_73
 53 | # tube_label=Pool_72_73_9A_10C  # often times 2 library are pooled together on Nova-Seq
 54 | # email=your-email@salk.edu
 55 | #
 56 | 
 57 | # if your n_random_index=8, use V1 template!
 58 | n_random_index=384
 59 | input_plate_size=384
 60 | pool_id=
 61 | tube_label=
 62 | email=
 63 | 
 64 | 
 65 | # =====================================================================================================
 66 | 
 67 | [LibraryInfo]
 68 | 
 69 | # =====================================================================================================
 70 | #
 71 | # Explain:
 72 | # library metadata that applies to all plates
 73 | # this whole part is optional, may contain any "key=value" pairs necessary to describe the library.
 74 | # All the values will be concatenate by "-" into the sample id and present in file name. Use UNIX path safe characters.
 75 | # Any character does not belong to [a-zA-Z0-9] will be replaced by "_"
 76 | # Here are the recommended information to include, you can define your own based on your needs,
 77 | # non of these information is actually used in demultiplex or mapping:
 78 | # these keys are ALL optional, but better be consistent throughout the project.
 79 | #
 80 | # Example:
 81 | # lib_comp_date=180101
 82 | # project=CEMBA
 83 | # organism=mm
 84 | # dev_stage_age=P56
 85 | # tissue_cell_type=1A
 86 | # exp_cond=1
 87 | # bio_rep=1
 88 | # tech_rep=1
 89 | # lib_type=snmC-seq2
 90 | # sequencer=NovaSeq
 91 | # se_pe=pe
 92 | # read_length=150
 93 | #
 94 | #
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | # =====================================================================================================
101 | 
102 | [PlateInfo]
103 | 
104 | # =====================================================================================================
105 | 
106 | # Explain:
107 | # Plate metadata that specific to certain plates, a tab separated table
108 | # First row must be header start with: plate_id	primer_quarter
109 | # First 3 columns are required and must be in the order of: plate_id	multiplex_group	primer_name
110 | # You can add more plate specific info into additional columns, those info will be appended to LibraryInfo as part of sample_id.
111 | # All the values will be concatenate by "-" into the sample id and present in file name.
112 | # So better not to include "-" in value and use UNIX path safe characters.
113 | #
114 | # If your experiment design contain sup-plate difference (e.g. some rows come from 1 sample, some rows come from another),
115 | # you should maintain your own metadata about this and added into the mapping summary table later after mapping by yourself
116 | # Because here the plate info is just for barcode demultiplexing, so that we can get single cell data AND the plate position of each cell
117 | # with the plate position, it should be very convenient for you to add any custom information you designed in your experiment.
118 | #
119 | # primer_name valid values are:
120 | # [A-P][1-24]
121 | #
122 | # Example:
123 | # plate_id	multiplex_group	primer_name
124 | # Plate_1	1	B1
125 | # Plate_1	2	B3
126 | # Plate_1	3	B5
127 | # Plate_1	4	B7
128 | # Plate_1	5	B9
129 | # Plate_1	6	B11
130 | #
131 | # Remember the columns MUST be separate by tab, not space or comma
132 | #
133 | 
134 | 
135 | # =====================================================================================================
136 | # if your n_random_index=8, use V1 template!
137 | # =====================================================================================================
138 | 
139 | plate_id	multiplex_group	primer_name
140 | 
141 | 
142 | 
143 | 


--------------------------------------------------------------------------------
/cemba_data/files/random_index_v1.fa:
--------------------------------------------------------------------------------
 1 | >AD001
 2 | ^ATCACG
 3 | >AD002
 4 | ^CGATGT
 5 | >AD004
 6 | ^TGACCA
 7 | >AD006
 8 | ^GCCAAT
 9 | >AD007
10 | ^CAGATC
11 | >AD008
12 | ^ACTTGA
13 | >AD010
14 | ^TAGCTT
15 | >AD012
16 | ^CTTGTA
17 | 


--------------------------------------------------------------------------------
/cemba_data/files/random_index_v2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/random_index_v2/__init__.py


--------------------------------------------------------------------------------
/cemba_data/files/random_index_v2/random_index_v2.multiplex_group_1.fa:
--------------------------------------------------------------------------------
  1 | >A1
  2 | ^ACGATCAG
  3 | >A13
  4 | ^ATCATGCG
  5 | >C1
  6 | ^GTAGCGTA
  7 | >C13
  8 | ^GTCCTAAG
  9 | >E1
 10 | ^GATCAAGG
 11 | >E13
 12 | ^TACCGGAT
 13 | >G1
 14 | ^CAGTCACA
 15 | >G13
 16 | ^ACCTCAGT
 17 | >I1
 18 | ^TACTGCTC
 19 | >I13
 20 | ^GTGGTATG
 21 | >K1
 22 | ^AGCTACCA
 23 | >K13
 24 | ^CAGACGTT
 25 | >M1
 26 | ^AGGTCAAC
 27 | >M13
 28 | ^CAATCAGG
 29 | >O1
 30 | ^AACAGGTG
 31 | >O13
 32 | ^CTACAAGG
 33 | >A2
 34 | ^TGATAGGC
 35 | >A14
 36 | ^ACAACGTG
 37 | >C2
 38 | ^CAGGTAAG
 39 | >C14
 40 | ^AATTCCGG
 41 | >E2
 42 | ^ACAAGCTC
 43 | >E14
 44 | ^GTGATCCA
 45 | >G2
 46 | ^AACCGTGT
 47 | >G14
 48 | ^GTCCTTGA
 49 | >I2
 50 | ^ATTCCGCT
 51 | >I14
 52 | ^ACTGCGAA
 53 | >K2
 54 | ^CACGCAAT
 55 | >K14
 56 | ^AAGCGACT
 57 | >M2
 58 | ^AGAAGGAC
 59 | >M14
 60 | ^CGAATACG
 61 | >O2
 62 | ^AGCAGACA
 63 | >O14
 64 | ^GCCTTAAC
 65 | >B1
 66 | ^GAACGAAG
 67 | >B13
 68 | ^GACTACGA
 69 | >D1
 70 | ^ATACGCAG
 71 | >D13
 72 | ^CCTGTCAA
 73 | >F1
 74 | ^GTTGCTGT
 75 | >F13
 76 | ^CGAATTGC
 77 | >H1
 78 | ^CCAAGGTT
 79 | >H13
 80 | ^TCTACGCA
 81 | >J1
 82 | ^TGCACTTG
 83 | >J13
 84 | ^AGAGCAGA
 85 | >L1
 86 | ^GATGCTAC
 87 | >L13
 88 | ^CGACCTAA
 89 | >N1
 90 | ^TCAGCCTT
 91 | >N13
 92 | ^CCGTTATG
 93 | >P1
 94 | ^TGACCGTT
 95 | >P13
 96 | ^AGCTAAGC
 97 | >B2
 98 | ^AGGCAATG
 99 | >B14
100 | ^ACGCTTCT
101 | >D2
102 | ^GCGTTAGA
103 | >D14
104 | ^TCAATCCG
105 | >F2
106 | ^CTAGGTTG
107 | >F14
108 | ^GCATAGTC
109 | >H2
110 | ^CTCGGTAA
111 | >H14
112 | ^CAACTTGG
113 | >J2
114 | ^CCTAAGTC
115 | >J14
116 | ^TTCCTCCT
117 | >L2
118 | ^AAGCGTTC
119 | >L14
120 | ^CTTAGGAC
121 | >N2
122 | ^CAACTGAC
123 | >N14
124 | ^CTCACCAA
125 | >P2
126 | ^CTCTATCG
127 | >P14
128 | ^CGCAATGT
129 | 


--------------------------------------------------------------------------------
/cemba_data/files/random_index_v2/random_index_v2.multiplex_group_2.fa:
--------------------------------------------------------------------------------
  1 | >A3
  2 | ^TCGAGAGT
  3 | >A15
  4 | ^TGTTCCGT
  5 | >C3
  6 | ^AGAGTCCA
  7 | >C15
  8 | ^TATGGCAC
  9 | >E3
 10 | ^TCTTCGAC
 11 | >E15
 12 | ^TTGCAACG
 13 | >G3
 14 | ^TCGATGAC
 15 | >G15
 16 | ^CGTCTTCA
 17 | >I3
 18 | ^GACGAACT
 19 | >I15
 20 | ^CCAACTTC
 21 | >K3
 22 | ^AGATTGCG
 23 | >K15
 24 | ^CTGAACGT
 25 | >M3
 26 | ^TACACACG
 27 | >M15
 28 | ^TCGTGCAT
 29 | >O3
 30 | ^AGTCGAAG
 31 | >O15
 32 | ^CGATGTTC
 33 | >A4
 34 | ^CATCCAAG
 35 | >A16
 36 | ^TGCTGTGA
 37 | >C4
 38 | ^GTATCGAG
 39 | >C16
 40 | ^TCTAGGAG
 41 | >E4
 42 | ^GAACCTTC
 43 | >E16
 44 | ^ACTGGTGT
 45 | >G4
 46 | ^CGCGTATT
 47 | >G16
 48 | ^CAGGTTCA
 49 | >I4
 50 | ^AAGCTCAC
 51 | >I16
 52 | ^TCTGTCGT
 53 | >K4
 54 | ^AGCTTCAG
 55 | >K16
 56 | ^CCTACCTA
 57 | >M4
 58 | ^GCGTATCA
 59 | >M16
 60 | ^TGCTTGCT
 61 | >O4
 62 | ^GTTAAGCG
 63 | >O16
 64 | ^GTTGGCAT
 65 | >B3
 66 | ^ACCTAGAC
 67 | >B15
 68 | ^TTACGTGC
 69 | >D3
 70 | ^AAGACCGT
 71 | >D15
 72 | ^CTATGCCT
 73 | >F3
 74 | ^AGAACCAG
 75 | >F15
 76 | ^CAAGAAGC
 77 | >H3
 78 | ^ACGTATGG
 79 | >H15
 80 | ^TGGCTCTT
 81 | >J3
 82 | ^TCACTCGA
 83 | >J15
 84 | ^CTTCGGTT
 85 | >L3
 86 | ^AGGAACAC
 87 | >L15
 88 | ^CTCTCAGA
 89 | >N3
 90 | ^AAGCATCG
 91 | >N15
 92 | ^CTAGCAGT
 93 | >P3
 94 | ^CATCTGCT
 95 | >P15
 96 | ^GTTCCATG
 97 | >B4
 98 | ^TCACCTAG
 99 | >B16
100 | ^GAGTAGAG
101 | >D4
102 | ^TTGCGAGA
103 | >D16
104 | ^GACTTGTG
105 | >F4
106 | ^GTGTCCTT
107 | >F16
108 | ^CTCCTGAA
109 | >H4
110 | ^TACAGAGC
111 | >H16
112 | ^TCAGTAGG
113 | >J4
114 | ^TTCGTACG
115 | >J16
116 | ^GCTGTAAG
117 | >L4
118 | ^CGATTCTG
119 | >L16
120 | ^ATAGTCGG
121 | >N4
122 | ^TGCTCTAC
123 | >N16
124 | ^CAGAACTG
125 | >P4
126 | ^ACTCTCCA
127 | >P16
128 | ^CCTAGAGA
129 | 


--------------------------------------------------------------------------------
/cemba_data/files/random_index_v2/random_index_v2.multiplex_group_3.fa:
--------------------------------------------------------------------------------
  1 | >A5
  2 | ^CTAGCTCA
  3 | >A17
  4 | ^ATTAGCCG
  5 | >C5
  6 | ^GCTACTCT
  7 | >C17
  8 | ^TCGGATTC
  9 | >E5
 10 | ^ATCGTGGT
 11 | >E17
 12 | ^CACTTCAC
 13 | >G5
 14 | ^GAAGTGCT
 15 | >G17
 16 | ^TGCGTAAC
 17 | >I5
 18 | ^CTTCGCAA
 19 | >I17
 20 | ^GACGTCAT
 21 | >K5
 22 | ^CACACATC
 23 | >K17
 24 | ^TTGGACTG
 25 | >M5
 26 | ^CAAGTCGT
 27 | >M17
 28 | ^TAACGTCG
 29 | >O5
 30 | ^TGGAAGCA
 31 | >O17
 32 | ^ACCGGTTA
 33 | >A6
 34 | ^GTGAGACT
 35 | >A18
 36 | ^CCAAGTAG
 37 | >C6
 38 | ^TTCACGGA
 39 | >C18
 40 | ^ATCCGTTG
 41 | >E6
 42 | ^AGCGAGAT
 43 | >E18
 44 | ^CTAACCTG
 45 | >G6
 46 | ^AGTTCGCA
 47 | >G18
 48 | ^CCAACACT
 49 | >I6
 50 | ^TGATCACG
 51 | >I18
 52 | ^CTCAAGCT
 53 | >K6
 54 | ^CCTCGTTA
 55 | >K18
 56 | ^ATCTCCTG
 57 | >M6
 58 | ^CAACACAG
 59 | >M18
 60 | ^CTCGAACA
 61 | >O6
 62 | ^CATGGATC
 63 | >O18
 64 | ^CAACCTCT
 65 | >B5
 66 | ^TACGACGT
 67 | >B17
 68 | ^ACTGCTTG
 69 | >D5
 70 | ^CTCCAATC
 71 | >D17
 72 | ^TTCGGCTA
 73 | >F5
 74 | ^GATGTCGA
 75 | >F17
 76 | ^CACCAGTT
 77 | >H5
 78 | ^AAGGACCA
 79 | >H17
 80 | ^CCTTCCAT
 81 | >J5
 82 | ^CACTGTAG
 83 | >J17
 84 | ^ACAACAGC
 85 | >L5
 86 | ^ACCATCCT
 87 | >L17
 88 | ^AGGCTGAA
 89 | >N5
 90 | ^GCCAATAC
 91 | >N17
 92 | ^GCCAGAAT
 93 | >P5
 94 | ^CGCTGATA
 95 | >P17
 96 | ^GCATCCTA
 97 | >B6
 98 | ^CATACGGA
 99 | >B18
100 | ^ATGCCTAG
101 | >D6
102 | ^ACACCGAT
103 | >D18
104 | ^CCGATGTA
105 | >F6
106 | ^TACCTGCA
107 | >F18
108 | ^AACGCACA
109 | >H6
110 | ^GCATAACG
111 | >H18
112 | ^ACAGCAAG
113 | >J6
114 | ^TCCTGGTA
115 | >J18
116 | ^GACATCTC
117 | >L6
118 | ^GCAACCAT
119 | >L18
120 | ^GAGACCAA
121 | >N6
122 | ^CATCACGT
123 | >N18
124 | ^AGAAGCCT
125 | >P6
126 | ^CAGCATAC
127 | >P18
128 | ^TACTAGCG
129 | 


--------------------------------------------------------------------------------
/cemba_data/files/random_index_v2/random_index_v2.multiplex_group_4.fa:
--------------------------------------------------------------------------------
  1 | >A7
  2 | ^ATCGTCTC
  3 | >A19
  4 | ^CGATCGAT
  5 | >C7
  6 | ^CTCTGGAT
  7 | >C19
  8 | ^AACAGCGA
  9 | >E7
 10 | ^CGGTAATC
 11 | >E19
 12 | ^TAGCCATG
 13 | >G7
 14 | ^CTTCCTTC
 15 | >G19
 16 | ^AACACGCT
 17 | >I7
 18 | ^ATGGCGAT
 19 | >I19
 20 | ^ACGTCCAA
 21 | >K7
 22 | ^GAGCAATC
 23 | >K19
 24 | ^GTCTGCAA
 25 | >M7
 26 | ^AGCTAGTG
 27 | >M19
 28 | ^AAGGCGTA
 29 | >O7
 30 | ^CTCGTTCT
 31 | >O19
 32 | ^GAACGGTT
 33 | >A8
 34 | ^CTGATGAG
 35 | >A20
 36 | ^AACTGAGG
 37 | >C8
 38 | ^GAGCTCTA
 39 | >C20
 40 | ^GATAGCCA
 41 | >E8
 42 | ^CCGTAACT
 43 | >E20
 44 | ^AGCCAACT
 45 | >G8
 46 | ^TAGTCAGC
 47 | >G20
 48 | ^GAGAGTAC
 49 | >I8
 50 | ^CAATGCGA
 51 | >I20
 52 | ^AACCACTC
 53 | >K8
 54 | ^TGAGACGA
 55 | >K20
 56 | ^TCACGATG
 57 | >M8
 58 | ^TCCACGTT
 59 | >M20
 60 | ^ACATGGAG
 61 | >O8
 62 | ^ACAGAGGT
 63 | >O20
 64 | ^TGGATGGT
 65 | >B7
 66 | ^TTGAGCTC
 67 | >B19
 68 | ^GCCTATGT
 69 | >D7
 70 | ^TCTGGACA
 71 | >D19
 72 | ^ACCGACAA
 73 | >F7
 74 | ^AGGAGGTT
 75 | >F19
 76 | ^GTATTCCG
 77 | >H7
 78 | ^TATGCGGT
 79 | >H19
 80 | ^ATACTGGC
 81 | >J7
 82 | ^GTACGATC
 83 | >J19
 84 | ^AGCCGTAA
 85 | >L7
 86 | ^GAACGTGA
 87 | >L19
 88 | ^ATCGGAGA
 89 | >N7
 90 | ^GACACAGT
 91 | >N19
 92 | ^CGAGAGAA
 93 | >P7
 94 | ^TCGTCTGA
 95 | >P19
 96 | ^CCATGAAC
 97 | >B8
 98 | ^GTCATCGT
 99 | >B20
100 | ^CAACTCCA
101 | >D8
102 | ^CGTATCTC
103 | >D20
104 | ^TAGGAGCT
105 | >F8
106 | ^CCTTAGGT
107 | >F20
108 | ^TAGTCTCG
109 | >H8
110 | ^GATCAGAC
111 | >H20
112 | ^GAATGGCA
113 | >J8
114 | ^CATTGACG
115 | >J20
116 | ^CAACCGTA
117 | >L8
118 | ^AATCCAGC
119 | >L20
120 | ^AACAAGGC
121 | >N8
122 | ^GCCACTTA
123 | >N20
124 | ^CACGATTC
125 | >P8
126 | ^TACTCCAG
127 | >P20
128 | ^CGTCCATT
129 | 


--------------------------------------------------------------------------------
/cemba_data/files/random_index_v2/random_index_v2.multiplex_group_5.fa:
--------------------------------------------------------------------------------
  1 | >A9
  2 | ^TCGACAAG
  3 | >A21
  4 | ^GATCTTGC
  5 | >C9
  6 | ^AGATCGTC
  7 | >C21
  8 | ^CCAACGAA
  9 | >E9
 10 | ^AGTTGTGC
 11 | >E21
 12 | ^ACAGGCAT
 13 | >G9
 14 | ^CGAACAAC
 15 | >G21
 16 | ^ACTCGATC
 17 | >I9
 18 | ^ACATGCCA
 19 | >I21
 20 | ^GATCCACT
 21 | >K9
 22 | ^ATAGAGCG
 23 | >K21
 24 | ^CCACATTG
 25 | >M9
 26 | ^CTCCTAGT
 27 | >M21
 28 | ^TCTTACGG
 29 | >O9
 30 | ^ACGAGAAC
 31 | >O21
 32 | ^CTGTACCA
 33 | >A10
 34 | ^ACGGTACA
 35 | >A22
 36 | ^AGGTAGGA
 37 | >C10
 38 | ^GTCAGTCA
 39 | >C22
 40 | ^TATGACCG
 41 | >E10
 42 | ^TCAGACAC
 43 | >E22
 44 | ^CCAGTTGA
 45 | >G10
 46 | ^AACACCAC
 47 | >G22
 48 | ^AGATACGG
 49 | >I10
 50 | ^ATGCGTCA
 51 | >I22
 52 | ^CTTACAGC
 53 | >K10
 54 | ^CACAGGAA
 55 | >K22
 56 | ^CCACAACA
 57 | >M10
 58 | ^ATCGCAAC
 59 | >M22
 60 | ^ACAAGACG
 61 | >O10
 62 | ^TAAGTGGC
 63 | >O22
 64 | ^CTATCCAC
 65 | >B9
 66 | ^AGTACACG
 67 | >B21
 68 | ^GTACCACA
 69 | >D9
 70 | ^AACACTGG
 71 | >D21
 72 | ^CGTAGATG
 73 | >F9
 74 | ^AATCGCTG
 75 | >F21
 76 | ^TTCGAAGC
 77 | >H9
 78 | ^AAGGAAGG
 79 | >H21
 80 | ^AACCTACG
 81 | >J9
 82 | ^TGGTGAAG
 83 | >J21
 84 | ^CTCTTGTC
 85 | >L9
 86 | ^TAGAACGC
 87 | >L21
 88 | ^GATACCTG
 89 | >N9
 90 | ^AAGAGGCA
 91 | >N21
 92 | ^AACTCGGA
 93 | >P9
 94 | ^CACATGGT
 95 | >P21
 96 | ^ATCCACGA
 97 | >B10
 98 | ^TTACCGAC
 99 | >B22
100 | ^AAGTCCTC
101 | >D10
102 | ^AAGGAGAC
103 | >D22
104 | ^CAACGAGT
105 | >F10
106 | ^CACAGACT
107 | >F22
108 | ^ACTCTGAG
109 | >H10
110 | ^CGCAACTA
111 | >H22
112 | ^CGGATCAA
113 | >J10
114 | ^ACCTCTTC
115 | >J22
116 | ^TGCGATAG
117 | >L10
118 | ^AGTGCATC
119 | >L22
120 | ^CCAGTATC
121 | >N10
122 | ^GCTTCACA
123 | >N22
124 | ^AAGCTGGT
125 | >P10
126 | ^GAGGCATT
127 | >P22
128 | ^TCGCTATC
129 | 


--------------------------------------------------------------------------------
/cemba_data/files/random_index_v2/random_index_v2.multiplex_group_6.fa:
--------------------------------------------------------------------------------
  1 | >A11
  2 | ^CCTTGGAA
  3 | >A23
  4 | ^AGGATAGC
  5 | >C11
  6 | ^GCTCAGTT
  7 | >C23
  8 | ^CAGTGCTT
  9 | >E11
 10 | ^AATGACGC
 11 | >E23
 12 | ^AGGTGTTG
 13 | >G11
 14 | ^AACAACCG
 15 | >G23
 16 | ^TGAGCTGT
 17 | >I11
 18 | ^GTCAACAG
 19 | >I23
 20 | ^AGCCTATC
 21 | >K11
 22 | ^GACCGATA
 23 | >K23
 24 | ^GATGGAGT
 25 | >M11
 26 | ^ACTCCTAC
 27 | >M23
 28 | ^CGTGTGAT
 29 | >O11
 30 | ^AAGCCTGA
 31 | >O23
 32 | ^GCGCATAT
 33 | >A12
 34 | ^CTCGACTT
 35 | >A24
 36 | ^TTCGCCAT
 37 | >C12
 38 | ^CACGTCTA
 39 | >C24
 40 | ^CGATTGGA
 41 | >E12
 42 | ^CGAAGTCA
 43 | >E24
 44 | ^AAGTGCAG
 45 | >G12
 46 | ^GTAAGCAC
 47 | >G24
 48 | ^GTTCTTCG
 49 | >I12
 50 | ^TACATCGG
 51 | >I24
 52 | ^AGTCTTGG
 53 | >K12
 54 | ^ACTCAACG
 55 | >K24
 56 | ^AGGTCTGT
 57 | >M12
 58 | ^ACGTCGTT
 59 | >M24
 60 | ^CGCCTTAT
 61 | >O12
 62 | ^AGTCAGGT
 63 | >O24
 64 | ^GATCTCAG
 65 | >B11
 66 | ^TGTCAGTG
 67 | >B23
 68 | ^TAGTGGTG
 69 | >D11
 70 | ^TTGGTGCA
 71 | >D23
 72 | ^CTGTATGC
 73 | >F11
 74 | ^AGTGACCT
 75 | >F23
 76 | ^AGACCTTG
 77 | >H11
 78 | ^AGCGTGTA
 79 | >H23
 80 | ^CATACTCG
 81 | >J11
 82 | ^TAGCTGAG
 83 | >J23
 84 | ^CAGATCCT
 85 | >L11
 86 | ^AACCAGAG
 87 | >L23
 88 | ^TCCTGACT
 89 | >N11
 90 | ^GAAGACTG
 91 | >N23
 92 | ^ACAGTTCG
 93 | >P11
 94 | ^CGAGTTAG
 95 | >P23
 96 | ^GAGAAGGT
 97 | >B12
 98 | ^ACCTTCGA
 99 | >B24
100 | ^GTCGATTG
101 | >D12
102 | ^TGTCGACT
103 | >D24
104 | ^TGTGTCAG
105 | >F12
106 | ^TCGAACCT
107 | >F24
108 | ^GTTATGGC
109 | >H12
110 | ^TCCGATCA
111 | >H24
112 | ^ACTGCACT
113 | >J12
114 | ^CATTCGTC
115 | >J24
116 | ^TGGTTCGA
117 | >L12
118 | ^GCATTGGT
119 | >L24
120 | ^CCTCGAAT
121 | >N12
122 | ^ACCGAATG
123 | >N24
124 | ^GCAATGAG
125 | >P12
126 | ^ACACCTCA
127 | >P24
128 | ^AATGGTCG
129 | 


--------------------------------------------------------------------------------
/cemba_data/files/sample_sheet_header.txt:
--------------------------------------------------------------------------------
 1 | [Header],,,,,,,,,,
 2 | IEMFileVersion,4,,,,,,,,,
 3 | Date,,,,,,,,,,
 4 | Workflow,GenerateFASTQ,,,,,,,,,
 5 | Application,HiSeq_FASTQ_Only,,,,,,,,,
 6 | Assay,TruSeq_HT,,,,,,,,,
 7 | Description,,,,,,,,,,
 8 | Chemistry,,,,,,,,,,
 9 | ,,,,,,,,,,
10 | [Reads],,,,,,,,,,
11 | 151,,,,,,,,,,
12 | 151,,,,,,,,,,
13 | ,,,,,,,,,,
14 | [Settings],,,,,,,,,,
15 | Adapter,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,,,,,,,,,
16 | AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT,,,,,,,,,
17 | ,,,,,,,,,,
18 | [Data],,,,,,,,,,
19 | 


--------------------------------------------------------------------------------
/cemba_data/files/sbatch_template_schicluster.txt:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Notes from TACC:
 4 | #
 5 | #   -- Launch this script by executing
 6 | #   -- Copy/edit this script as desired.  Launch by executing
 7 | #      "sbatch knl.openmp.slurm" on a Stampede2 login node.
 8 | #
 9 | #   -- OpenMP codes run on a single node (upper case N = 1).
10 | #        OpenMP ignores the value of lower case n,
11 | #        but slurm needs a plausible value to schedule the job.
12 | #
13 | #   -- Default value of OMP_NUM_THREADS is 1; be sure to change it!
14 | #
15 | #   -- Increase thread count gradually while looking for optimal setting.
16 | #        If there is sufficient memory available, the optimal setting
17 | #        is often 68 (1 thread per core) or 136 (2 threads per core).
18 | #
19 | #----------------------------------------------------
20 | 
21 | #SBATCH -J {job_name}           # Job name
22 | #SBATCH -o {log_dir}/{job_name}.o%j       # Name of stdout output file
23 | #SBATCH -e {log_dir}/{job_name}.e%j       # Name of stderr error file
24 | #SBATCH -p {queue}              # Queue (partition) name
25 | #SBATCH -N 1                    # Total # of nodes (must be 1 for OpenMP)
26 | #SBATCH -n 1                    # Total # of mpi tasks (should be 1 for OpenMP)
27 | #SBATCH -t {time_str}           # Run time (hh:mm:ss)
28 | {email_str}
29 | {email_type_str}
30 | 
31 | #----------------------------------------------------
32 | # Clone the whole miniconda into /tmp so the snakemake command do not access $WORK
33 | mkdir /tmp/test_{env_dir_random}
34 | 
35 | # use micromamba
36 | export PATH=/work/05622/lhq/stampede2/bin:$PATH
37 | micromamba shell init -s bash -p /tmp/test_{env_dir_random}
38 | source ~/.bashrc
39 | 
40 | # activate base environment
41 | micromamba activate
42 | 
43 | # create schicluster environment
44 | micromamba create -y -n schicluster python=3.8 numpy scipy scikit-learn h5py \
45 | joblib cooler pandas statsmodels rpy2 anndata xarray snakemake pybedtools htslib=1.9 pysam=0.18
46 | micromamba activate schicluster
47 | 
48 | # export correct PYTHONPATH
49 | export PYTHONPATH=/tmp/test_{env_dir_random}/envs/schicluster/lib/python3.8/site-packages
50 | 
51 | # install schicluster
52 | pip install schicluster
53 | which hicluster
54 | 
55 | # Installation finished
56 | #----------------------------------------------------
57 | 
58 | 
59 | # ---------------------------------------------------
60 | # actual command
61 | 
62 | # print some info
63 | date
64 | hostname
65 | pwd
66 | # If you want to profile the job (CPU, MEM usage, etc.)
67 | # load remora with
68 | # "module load remora"
69 | # and change the command to
70 | # "remora {command}"
71 | 
72 | 
73 | # Set thread count (default value is 1)...
74 | export OMP_NUM_THREADS=48
75 | 
76 | for i in `seq 1 5`
77 | do
78 |     {command} --batch summary=${{i}}/5
79 | done
80 | 
81 | # {command}
82 | 
83 | # delete everything in /tmp
84 | 
85 | rm -rf /tmp/test*
86 | # ---------------------------------------------------
87 | 


--------------------------------------------------------------------------------
/cemba_data/files/sbatch_template_yap.txt:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Notes from TACC:
 4 | #
 5 | #   -- Launch this script by executing
 6 | #   -- Copy/edit this script as desired.  Launch by executing
 7 | #      "sbatch knl.openmp.slurm" on a Stampede2 login node.
 8 | #
 9 | #   -- OpenMP codes run on a single node (upper case N = 1).
10 | #        OpenMP ignores the value of lower case n,
11 | #        but slurm needs a plausible value to schedule the job.
12 | #
13 | #   -- Default value of OMP_NUM_THREADS is 1; be sure to change it!
14 | #
15 | #   -- Increase thread count gradually while looking for optimal setting.
16 | #        If there is sufficient memory available, the optimal setting
17 | #        is often 68 (1 thread per core) or 136 (2 threads per core).
18 | #
19 | #----------------------------------------------------
20 | 
21 | #SBATCH -J {job_name}           # Job name
22 | #SBATCH -o {log_dir}/{job_name}.o%j       # Name of stdout output file
23 | #SBATCH -e {log_dir}/{job_name}.e%j       # Name of stderr error file
24 | #SBATCH -p {queue}              # Queue (partition) name
25 | #SBATCH -N 1                    # Total # of nodes (must be 1 for OpenMP)
26 | #SBATCH -n 1                    # Total # of mpi tasks (should be 1 for OpenMP)
27 | #SBATCH -t {time_str}           # Run time (hh:mm:ss)
28 | {email_str}
29 | {email_type_str}
30 | 
31 | 
32 | #----------------------------------------------------
33 | # Clone the whole miniconda into /tmp so the snakemake command do not access $WORK
34 | mkdir /tmp/test_{env_dir_random}
35 | tar -xf /work2/05622/lhq/test_conda.tar -C /tmp/test_{env_dir_random}
36 | export CONDA_PREFIX=/tmp/test_{env_dir_random}/test/miniconda3
37 | export CONDA_PYTHON_EXE=/tmp/test_{env_dir_random}/test/miniconda3/bin/python
38 | export CONDA_EXE=/tmp/test_{env_dir_random}/test/miniconda3/bin/conda
39 | export PATH=/dev/shm/bin:/tmp/test_{env_dir_random}/test/miniconda3/envs/mapping/bin:/tmp/test_{env_dir_random}/test/miniconda3/bin:/opt/apps/cmake/3.16.1/bin:/opt/apps/intel18/python2/2.7.15/bin:/opt/apps/autotools/1.1/bin:/opt/apps/git/2.24.1/bin:/opt/apps/libfabric/1.7.0/bin:/opt/apps/intel18/impi/18.0.2/bin:/opt/intel/compilers_and_libraries_2018.2.199/linux/mpi/intel64/bin:/opt/intel/compilers_and_libraries_2018.2.199/linux/bin/intel64:/opt/apps/gcc/6.3.0/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/opt/dell/srvadmin/bin:.
40 | find /tmp/test_{env_dir_random}/test/miniconda3/ -type f -print0 | sed 's/ /\\ /g; s/(/\\(/g; s/)/\\)/g' | xargs -0 -P 30 -I % sh -c '/bin/sed -i "s/\/tmp\/test\/miniconda3\/envs\/mapping\/bin\/python/\/tmp\/test_{env_dir_random}\/test\/miniconda3\/envs\/mapping\/bin\/python/" %'
41 | 
42 | pip install cemba_data --upgrade
43 | pip install schicluster --upgrade
44 | 
45 | # Check the path
46 | which python
47 | which snakemake
48 | which yap
49 | which allcools
50 | which bismark
51 | 
52 | # Installation finished
53 | #----------------------------------------------------
54 | 
55 | 
56 | # ---------------------------------------------------
57 | # actual command
58 | 
59 | # print some info
60 | date
61 | hostname
62 | pwd
63 | # If you want to profile the job (CPU, MEM usage, etc.)
64 | # load remora with
65 | # "module load remora"
66 | # and change the command to
67 | # "remora {command}"
68 | 
69 | 
70 | # Set thread count (default value is 1)...
71 | export OMP_NUM_THREADS=48
72 | 
73 | {command}
74 | 
75 | # delete everything in /tmp
76 | 
77 | rm -rf /tmp/test*
78 | # ---------------------------------------------------
79 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/__init__.py:
--------------------------------------------------------------------------------
 1 | from .hisat3n_general import \
 2 |     separate_unique_and_multi_align_reads, \
 3 |     convert_hisat_bam_strandness, \
 4 |     make_snakefile_hisat3n
 5 | from .utilities import validate_cwd_fastq_paths, read_mapping_config
 6 | from .hisat3n_mct import select_mct_reads, aggregate_feature_counts
 7 | from .summary import snmc_summary, snmct_summary, snm3c_summary
 8 | from .hisat3n_m3c import \
 9 |     split_hisat3n_unmapped_reads, \
10 |     call_chromatin_contacts, \
11 |     remove_overlap_read_parts
12 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/cli.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | 
 3 | from .hisat3n_m3c import remove_overlap_read_parts
 4 | 
 5 | 
 6 | @click.command('remove_overlap_read_parts')
 7 | @click.argument('in_bam_path')
 8 | @click.argument('out_bam_path')
 9 | def _remove_overlap_read_parts(in_bam_path, out_bam_path):
10 |     remove_overlap_read_parts(in_bam_path, out_bam_path)
11 |     return
12 | 
13 | 
14 | @click.group()
15 | def _main():
16 |     return
17 | 
18 | 
19 | def main():
20 |     _main.add_command(_remove_overlap_read_parts)
21 |     _main()
22 |     return
23 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/config/__init__.py


--------------------------------------------------------------------------------
/cemba_data/hisat3n/config/gcp.md:
--------------------------------------------------------------------------------
 1 | # Setup GCP image for mapping
 2 | 
 3 | ## Create base image
 4 | 
 5 | ```bash
 6 | # init install system tools
 7 | sudo yum install -y zsh tree wget screen git nfs-utils make gcc
 8 | 
 9 | # install mambaforge
10 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh
11 | sh Mambaforge-Linux-x86_64.sh -b -p $HOME/mambaforge
12 | rm -f Mambaforge-Linux-x86_64.sh
13 | ./mambaforge/bin/mamba init zsh
14 | ./mambaforge/bin/mamba init bash
15 | exec /bin/zsh
16 | mamba install -y gxx
17 | 
18 | # Create mapping env hisat3n_env.yml
19 | wget https://raw.githubusercontent.com/lhqing/cemba_data/master/hisat3n_env.yml
20 | mamba env update -f hisat3n_env.yml  # this should install things in the base env
21 | 
22 | # Install packages
23 | mkdir -p ~/pkg
24 | 
25 | # install hisat-3n
26 | cd ~/pkg
27 | git clone https://github.com/DaehwanKimLab/hisat2.git hisat-3n
28 | cd hisat-3n
29 | git checkout hisat-3n-dev-directional-mapping-reverse
30 | make
31 | # put hisat-3n in the PATH
32 | echo 'export PATH=$HOME/pkg/hisat-3n:$PATH' >> ~/.bashrc
33 | source ~/.bashrc 
34 | echo 'export PATH=$HOME/pkg/hisat-3n:$PATH' >> ~/.zshrc 
35 | source ~/.zshrc
36 | 
37 | # make sure allcools and yap is upto date
38 | cd ~/pkg
39 | git clone https://github.com/lhqing/cemba_data.git
40 | cd cemba_data
41 | pip install -e .
42 | 
43 | cd ~/pkg
44 | git clone https://github.com/lhqing/ALLCools.git
45 | cd ALLCoools
46 | pip install -e .
47 | 
48 | ## Create genome reference
49 | 
50 | # add genome reference file
51 | # prepare and copy specific genome reference file to $HOME
52 | 
53 | # prepare a $HOME/mapping.yaml file the records the path of required genome reference files
54 | 
55 | # clean unnecessary cache files
56 | mamba clean -y -a
57 | ```
58 | 
59 | ## Actual mapping
60 | 
61 | ```bash
62 | mkdir -p ~/mapping
63 | cd ~/mapping
64 | gsutil cp gs://PATH/TO/FASTQ_DIR/fastq ./
65 | cp ~/pkg/cemba_data/hisat3n/snakefile/SNAKEFILE_YOU_WANT_TO_USE ./Snakefile
66 | 
67 | # run snakemake
68 | snakemake --configfile ~/mapping.yaml -j
69 | ```
70 | 
71 | ## Build hisat-3n index
72 | ```bash
73 | # non-repeat index
74 | hisat-3n-build --base-change C,T genome.fa genome
75 | # repeat index
76 | hisat-3n-build --base-change T,C --repeat-index genome.fa genome
77 | # Build the repeat HISAT-3N integrated index with splice site information
78 | hisat-3n-build --base-change C,T --repeat-index --ss genome.ss --exon genome.exon genome.fa genome 
79 | ```
80 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/config/hisat-3n-build.sh:
--------------------------------------------------------------------------------
 1 | # normal index
 2 | hisat-3n-build --base-change C,T -p THREAD \
 3 | ~/ref/hg38/fasta/with_chrl/hg38_with_chrl.fa \
 4 | ~/ref/hg38/fasta/with_chrl/hg38_with_chrl
 5 | 
 6 | # repeat index
 7 | hisat-3n-build --base-change C,T -p THREAD --repeat-index \
 8 | ~/ref/hg38/fasta/with_chrl/hg38_with_chrl.fa \
 9 | ~/ref/hg38/fasta/with_chrl/hg38_with_chrl.repeat
10 | 
11 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/config/hisat3n_mapping_env.yaml:
--------------------------------------------------------------------------------
 1 | name: mapping
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |  - python=3.8
 8 |  - pip
 9 |  - bedtools
10 |  - cutadapt
11 |  - htslib>=1.9
12 |  - natsort
13 |  - picard
14 |  - pybedtools
15 |  - pyBigWig
16 |  - pysam
17 |  - samtools
18 |  - seaborn
19 |  - snakemake
20 |  - subread=2.0
21 |  - yaml
22 |  - pip:
23 |     - allcools
24 |     - cemba_data
25 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/config/vm_init.sh:
--------------------------------------------------------------------------------
 1 | sudo yum install -y zsh tree wget screen git nfs-utils make gcc
 2 | 
 3 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh
 4 | sh Mambaforge-Linux-x86_64.sh -b -p $HOME/mambaforge
 5 | rm -f Mambaforge-Linux-x86_64.sh
 6 | ./mambaforge/bin/mamba init zsh
 7 | ./mambaforge/bin/mamba init bash
 8 | 
 9 | mamba install -y gxx
10 | exec /bin/zsh
11 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/hisat3n_general.py:
--------------------------------------------------------------------------------
  1 | import pysam
  2 | import pathlib
  3 | import cemba_data
  4 | import subprocess
  5 | from ..utilities import get_configuration
  6 | 
  7 | 
  8 | def bam_read_to_fastq_read(read, read_type=None):
  9 |     if read_type is None:
 10 |         if read.is_read1:
 11 |             read_type = '1'
 12 |         else:
 13 |             read_type = '2'
 14 | 
 15 |     fastq_record = f"@{read.qname}_{read_type}\n" \
 16 |                    f"{read.query_sequence}\n" \
 17 |                    f"+\n" \
 18 |                    f"{read.qual}\n"
 19 |     return fastq_record
 20 | 
 21 | 
 22 | def separate_unique_and_multi_align_reads(in_bam_path,
 23 |                                           out_unique_path,
 24 |                                           out_multi_path,
 25 |                                           out_unmappable_path=None,
 26 |                                           unmappable_format='auto',
 27 |                                           mapq_cutoff=10,
 28 |                                           qlen_cutoff=30,
 29 |                                           primary_only=True,
 30 |                                           read_type=None):
 31 |     """
 32 |     Separate unique aligned, multi-aligned, and unaligned reads from hisat-3n bam file.
 33 | 
 34 |     Parameters
 35 |     ----------
 36 |     in_bam_path
 37 |         Path to hisat-3n bam file.
 38 |     out_unique_path
 39 |         Path to output unique aligned bam file.
 40 |     out_multi_path
 41 |         Path to output multi-aligned bam file.
 42 |     out_unmappable_path
 43 |         Path to output unmappable file.
 44 |     unmappable_format
 45 |         Format of unmappable file, only "bam" and "fastq" supported.
 46 |     mapq_cutoff
 47 |         MAPQ cutoff for uniquely aligned reads,
 48 |         note that for hisat-3n, unique aligned reads always have MAPQ=60
 49 |     qlen_cutoff
 50 |         read length cutoff for any reads
 51 |     primary_only
 52 |         If True, only primary alignments (FLAG 256) are considered for multi-aligned reads.
 53 |     read_type
 54 |         read type, only None, "1" and "2" supported. If the BAM file is paired-end, use None.
 55 |     Returns
 56 |     -------
 57 |     None
 58 |     """
 59 |     if out_unmappable_path is not None:
 60 |         if unmappable_format == 'auto':
 61 |             if out_unmappable_path.endswith('.bam'):
 62 |                 unmappable_format = 'bam'
 63 |             elif out_unmappable_path.endswith('.fastq'):
 64 |                 unmappable_format = 'fastq'
 65 |             else:
 66 |                 raise ValueError(f'Unmappable format {unmappable_format} not supported.')
 67 |         else:
 68 |             if unmappable_format not in ['bam', 'fastq']:
 69 |                 raise ValueError(f'Unmappable format {unmappable_format} not supported.')
 70 | 
 71 |     with pysam.AlignmentFile(in_bam_path, index_filename=None) as bam:
 72 |         header = bam.header
 73 |         with pysam.AlignmentFile(out_unique_path, header=header, mode='wb') as unique_bam, \
 74 |                 pysam.AlignmentFile(out_multi_path, header=header, mode='wb') as multi_bam:
 75 |             if out_unmappable_path is not None:
 76 |                 if unmappable_format == 'bam':
 77 |                     unmappable_file = pysam.AlignmentFile(out_unmappable_path, header=header, mode='wb')
 78 |                 else:
 79 |                     unmappable_file = open(out_unmappable_path, 'w')
 80 |             else:
 81 |                 unmappable_file = None
 82 | 
 83 |             for read in bam:
 84 |                 # skip reads that are too short
 85 |                 if read.qlen < qlen_cutoff:
 86 |                     continue
 87 | 
 88 |                 if read.mapq > mapq_cutoff:
 89 |                     unique_bam.write(read)
 90 |                 elif read.mapq > 0:
 91 |                     if primary_only and read.is_secondary:
 92 |                         # skip secondary alignments if primary_only is True,
 93 |                         # read.is_secondary is True when FLAG contains 256.
 94 |                         continue
 95 |                     multi_bam.write(read)
 96 |                 else:
 97 |                     # unmappable reads
 98 |                     if unmappable_file is not None:
 99 |                         if unmappable_format == 'bam':
100 |                             unmappable_file.write(read)
101 |                         else:
102 |                             unmappable_file.write(bam_read_to_fastq_read(read, read_type=read_type))
103 | 
104 |             if unmappable_file is not None:
105 |                 unmappable_file.close()
106 |     return
107 | 
108 | 
109 | def convert_hisat_bam_strandness(in_bam_path, out_bam_path):
110 |     with pysam.AlignmentFile(in_bam_path) as in_bam, \
111 |             pysam.AlignmentFile(out_bam_path, header=in_bam.header, mode='wb') as out_bam:
112 |         for read in in_bam:
113 |             if read.get_tag('YZ') == '+':
114 |                 read.is_forward = True
115 |                 if read.is_paired:
116 |                     read.mate_is_forward = True
117 |             else:
118 |                 read.is_forward = False
119 |                 if read.is_paired:
120 |                     read.mate_is_forward = False
121 |             out_bam.write(read)
122 |     return
123 | 
124 | 
125 | def make_snakefile_hisat3n(output_dir):
126 |     output_dir = pathlib.Path(output_dir)
127 | 
128 |     mapping_config_name = list(output_dir.glob('mapping_config.*'))[0].name
129 | 
130 |     config = get_configuration(output_dir / mapping_config_name)
131 |     try:
132 |         mode = config['mode']
133 |     except KeyError:
134 |         raise KeyError('mode not found in the config file.')
135 | 
136 |     skip_dirs = ['stats', 'snakemake', 'scool']
137 |     mapping_job_dirs = [p for p in output_dir.glob('*')
138 |                         if p.is_dir() and (p.name not in skip_dirs)]
139 | 
140 |     snakemake_dir = output_dir / 'snakemake'
141 |     snakemake_dir.mkdir(exist_ok=True)
142 |     stats_dir = output_dir / 'stats'
143 |     stats_dir.mkdir(exist_ok=True)
144 | 
145 |     package_dir = cemba_data.__path__[0]
146 |     snakefile_path = f'{package_dir}/hisat3n/snakefile/{mode.lower()}.smk'
147 |     if not pathlib.Path(snakefile_path).exists():
148 |         print('Possible snakefile templates:')
149 |         for p in pathlib.Path(f'{package_dir}/hisat3n/snakefile/').glob('*.smk'):
150 |             print(p)
151 |         raise ValueError(f'Mode {mode} not supported, '
152 |                          f'because Snakefile {snakefile_path} not found.')
153 | 
154 |     for p in mapping_job_dirs:
155 |         subprocess.run(['cp', f'{output_dir}/{mapping_config_name}',
156 |                         f'{p}/{mapping_config_name}'], check=True)
157 |         subprocess.run(['cp', snakefile_path, f'{p}/Snakefile'], check=True)
158 | 
159 |     # leave a flag to indicate using hisat-3n pipeline
160 |     subprocess.run(['touch', f'{output_dir}/snakemake/hisat3n'], check=True)
161 |     return
162 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/snakefile/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/snakefile/__init__.py


--------------------------------------------------------------------------------
/cemba_data/hisat3n/stats_col_names.py:
--------------------------------------------------------------------------------
 1 | # "DELETE" means this column will be removed from the output file
 2 | # "" means the name will not be changed
 3 | # "CertainMetricNames" means the name will be changed to "CertainMetricNames"
 4 | 
 5 | 
 6 | COL_NAMES = {
 7 |     ('cell_parser_cutadapt_trim_stats', 'status'): 'DELETE',
 8 |     ('cell_parser_cutadapt_trim_stats', 'in_reads'): 'InputReadPairs',
 9 |     ('cell_parser_cutadapt_trim_stats', 'in_bp'): 'InputReadPairsBP',
10 |     ('cell_parser_cutadapt_trim_stats', 'too_short'): 'DELETE',
11 |     ('cell_parser_cutadapt_trim_stats', 'too_long'): 'DELETE',
12 |     ('cell_parser_cutadapt_trim_stats', 'too_many_n'): 'DELETE',
13 |     ('cell_parser_cutadapt_trim_stats', 'out_reads'): 'TrimmedReadPairs',
14 |     ('cell_parser_cutadapt_trim_stats', 'w/adapters'): 'R1WithAdapters',
15 |     ('cell_parser_cutadapt_trim_stats', 'qualtrim_bp'): 'R1QualTrimBP',
16 |     ('cell_parser_cutadapt_trim_stats', 'out_bp'): 'R1TrimmedReadsBP',
17 |     ('cell_parser_cutadapt_trim_stats', 'w/adapters2'): 'R2WithAdapters',
18 |     ('cell_parser_cutadapt_trim_stats', 'qualtrim2_bp'): 'R2QualTrimBP',
19 |     ('cell_parser_cutadapt_trim_stats', 'out2_bp'): 'R2TrimmedReadsBP',
20 |     ('cell_parser_hisat_summary', 'ReadPairsMappedInPE'): 'DELETE',
21 |     ('cell_parser_hisat_summary', 'PEUnmappableReadPairs'): 'DELETE',
22 |     ('cell_parser_hisat_summary', 'PEUniqueMappedReadPairs'): 'DELETE',
23 |     ('cell_parser_hisat_summary', 'PEMultiMappedReadPairs'): 'DELETE',
24 |     ('cell_parser_hisat_summary', 'PEDiscordantlyUniqueMappedReadPairs'): 'DELETE',
25 |     ('cell_parser_hisat_summary', 'ReadsMappedInSE'): 'DELETE',
26 |     ('cell_parser_hisat_summary', 'SEUnmappableReads'): 'DELETE',
27 |     ('cell_parser_hisat_summary', 'SEUniqueMappedReads'): 'DELETE',
28 |     ('cell_parser_hisat_summary', 'SEMultiMappedReads'): 'DELETE',
29 |     ('cell_parser_hisat_summary', 'UniqueMappedReads'): 'UniqueMappedReads',
30 |     ('cell_parser_hisat_summary', 'MultiMappedReads'): 'MultiMappedReads',
31 |     ('cell_parser_hisat_summary', 'UniqueMappingRate'): 'UniqueMappingRate',
32 |     ('cell_parser_hisat_summary', 'MultiMappingRate'): 'MultiMappingRate',
33 |     ('cell_parser_hisat_summary', 'OverallMappingRate'): 'OverallMappingRate',
34 |     ('cell_parser_picard_dedup_stat', 'LIBRARY'): 'DELETE',
35 |     ('cell_parser_picard_dedup_stat', 'UNPAIRED_READS_EXAMINED'): 'DELETE',
36 |     ('cell_parser_picard_dedup_stat', 'READ_PAIRS_EXAMINED'): 'DELETE',
37 |     ('cell_parser_picard_dedup_stat', 'SECONDARY_OR_SUPPLEMENTARY_RDS'): 'DELETE',
38 |     ('cell_parser_picard_dedup_stat', 'UNMAPPED_READS'): 'DELETE',
39 |     ('cell_parser_picard_dedup_stat', 'UNPAIRED_READ_DUPLICATES'): 'DELETE',
40 |     ('cell_parser_picard_dedup_stat', 'READ_PAIR_DUPLICATES'): 'DELETE',
41 |     ('cell_parser_picard_dedup_stat', 'READ_PAIR_OPTICAL_DUPLICATES'): 'DELETE',
42 |     ('cell_parser_picard_dedup_stat', 'PERCENT_DUPLICATION'): 'DELETE',
43 |     ('cell_parser_picard_dedup_stat', 'ESTIMATED_LIBRARY_SIZE'): 'DELETE',
44 |     ('cell_parser_picard_dedup_stat', 'FinalReads'): '',
45 |     ('cell_parser_picard_dedup_stat', 'DuplicatedReads'): '',
46 |     ('cell_parser_picard_dedup_stat', 'PCRDuplicationRate'): '',
47 |     ('cell_parser_feature_count_summary', 'Assigned'): 'AssignedRNAReads',
48 |     ('cell_parser_feature_count_summary', 'Unassigned_Unmapped'): 'DELETE',
49 |     ('cell_parser_feature_count_summary', 'Unassigned_Read_Type'): 'DELETE',
50 |     ('cell_parser_feature_count_summary', 'Unassigned_Singleton'): 'DELETE',
51 |     ('cell_parser_feature_count_summary', 'Unassigned_MappingQuality'): 'DELETE',
52 |     ('cell_parser_feature_count_summary', 'Unassigned_Chimera'): 'DELETE',
53 |     ('cell_parser_feature_count_summary', 'Unassigned_FragmentLength'): 'DELETE',
54 |     ('cell_parser_feature_count_summary', 'Unassigned_Duplicate'): 'DELETE',
55 |     ('cell_parser_feature_count_summary', 'Unassigned_MultiMapping'): 'DELETE',
56 |     ('cell_parser_feature_count_summary', 'Unassigned_Secondary'): 'DELETE',
57 |     ('cell_parser_feature_count_summary', 'Unassigned_NonSplit'): 'DELETE',
58 |     ('cell_parser_feature_count_summary', 'Unassigned_NoFeatures'): 'DELETE',
59 |     ('cell_parser_feature_count_summary', 'Unassigned_Overlapping_Length'): 'DELETE',
60 |     ('cell_parser_feature_count_summary', 'Unassigned_Ambiguity'): 'DELETE',
61 |     ('cell_parser_feature_count_summary', 'Unassigned_Total'): 'UnassignedRNAReads',
62 |     ('cell_parser_feature_count_summary', 'AssignedRNAReadsRate'): 'AssignedRNAReadsRate',
63 |     ('cell_parser_call_chromatin_contacts', 'cis'): 'CisContacts',
64 |     ('cell_parser_call_chromatin_contacts', 'ciscut'): 'CisCutContacts',
65 |     ('cell_parser_call_chromatin_contacts', 'cis_multi'): 'CisMultiContacts',
66 |     ('cell_parser_call_chromatin_contacts', 'ciscut_multi'): 'CisCutMultiContacts',
67 |     ('cell_parser_call_chromatin_contacts', 'trans'): 'TransContacts',
68 |     ('cell_parser_call_chromatin_contacts', 'transcut',): 'TransCutContacts',
69 |     ('cell_parser_call_chromatin_contacts', 'trans_multi'): 'TransMultiContacts',
70 |     ('cell_parser_call_chromatin_contacts', 'transcut_multi'): 'TransCutMultiContacts',
71 |     ('cell_parser_call_chromatin_contacts', 'chimeric'): 'ChimericContacts',
72 |     ('cell_parser_call_chromatin_contacts', 'no'): 'NoContacts',
73 |     ('cell_parser_call_chromatin_contacts', 'mapped_frag'): 'MappedFragments',
74 |     ('cell_parser_call_chromatin_contacts', 'dedup_frag'): 'DeduppedContacts',
75 |     ('cell_parser_call_chromatin_contacts', 'dup_rate'): 'ContactsDeduplicationRate',
76 |     ('cell_parser_call_chromatin_contacts', 'TotalCisContacts'): '',
77 |     ('cell_parser_call_chromatin_contacts', 'TotalTransContacts'): '',
78 |     ('cell_parser_call_chromatin_contacts', 'TotalMultiContacts'): '',
79 |     ('cell_parser_call_chromatin_contacts', 'CisContactsRatio'): '',
80 |     ('cell_parser_call_chromatin_contacts', 'TransContactsRatio'): '',
81 |     ('cell_parser_call_chromatin_contacts', 'MultiContactsRatio'): '',
82 | }
83 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/summary.py:
--------------------------------------------------------------------------------
  1 | from .stats_parser import *
  2 | 
  3 | 
  4 | def snmc_summary():
  5 |     """
  6 |     Generate snmC pipeline MappingSummary.csv.gz and save into cwd
  7 | 
  8 |     Returns
  9 |     -------
 10 |     pd.DataFrame
 11 |     """
 12 |     all_stats = []
 13 | 
 14 |     # fastq trimming stats
 15 |     df = parse_single_stats_set(f'fastq/*.trimmed.stats.txt',
 16 |                                 cell_parser_cutadapt_trim_stats)
 17 |     all_stats.append(df)
 18 | 
 19 |     # hisat-3n mapping
 20 |     df = parse_single_stats_set(f'bam/*.hisat3n_dna_summary.txt',
 21 |                                 cell_parser_hisat_summary)
 22 |     all_stats.append(df)
 23 | 
 24 |     # uniquely mapped reads dedup
 25 |     df = parse_single_stats_set(f'bam/*.unique_align.deduped.matrix.txt',
 26 |                                 cell_parser_picard_dedup_stat, prefix='UniqueAlign')
 27 |     all_stats.append(df)
 28 | 
 29 |     # multi mapped reads dedup
 30 |     df = parse_single_stats_set(f'bam/*.multi_align.deduped.matrix.txt',
 31 |                                 cell_parser_picard_dedup_stat, prefix='MultiAlign')
 32 |     all_stats.append(df)
 33 | 
 34 |     # allc count
 35 |     df = parse_single_stats_set(f'allc/*.allc.tsv.gz.count.csv',
 36 |                                 cell_parser_allc_count)
 37 |     all_stats.append(df)
 38 | 
 39 |     # concatenate all stats
 40 |     all_stats = pd.concat(all_stats, axis=1)
 41 |     all_stats.index.name = 'cell'
 42 |     all_stats.to_csv(f'MappingSummary.csv.gz')
 43 |     return all_stats
 44 | 
 45 | 
 46 | def snmct_summary():
 47 |     """
 48 |     Generate snmCT pipeline MappingSummary.csv.gz and save into cwd
 49 | 
 50 |     Returns
 51 |     -------
 52 |     pd.DataFrame
 53 |     """
 54 |     all_stats = []
 55 | 
 56 |     # fastq trimming stats
 57 |     df = parse_single_stats_set(f'fastq/*.trimmed.stats.txt',
 58 |                                 cell_parser_cutadapt_trim_stats)
 59 |     all_stats.append(df)
 60 | 
 61 |     # hisat-3n DNA mapping
 62 |     df = parse_single_stats_set(f'bam/*.hisat3n_dna_summary.txt',
 63 |                                 cell_parser_hisat_summary, prefix='DNA')
 64 |     all_stats.append(df)
 65 | 
 66 |     # hisat-3n RNA mapping
 67 |     df = parse_single_stats_set(f'rna_bam/*.hisat3n_rna_summary.txt',
 68 |                                 cell_parser_hisat_summary, prefix='RNA')
 69 |     all_stats.append(df)
 70 | 
 71 |     # uniquely mapped reads dedup
 72 |     df = parse_single_stats_set(f'bam/*.unique_align.deduped.matrix.txt',
 73 |                                 cell_parser_picard_dedup_stat, prefix='DNAUniqueAlign')
 74 |     all_stats.append(df)
 75 | 
 76 |     # multi mapped reads dedup
 77 |     df = parse_single_stats_set(f'bam/*.multi_align.deduped.matrix.txt',
 78 |                                 cell_parser_picard_dedup_stat, prefix='DNAMultiAlign')
 79 |     all_stats.append(df)
 80 | 
 81 |     # uniquely mapped dna reads selection
 82 |     df = parse_single_stats_set('bam/*.hisat3n_dna.unique_align.deduped.dna_reads.reads_mch_frac.csv',
 83 |                                 cell_parser_reads_mc_frac_profile, prefix='UniqueAlign')
 84 |     all_stats.append(df)
 85 | 
 86 |     # multi mapped dna reads selection
 87 |     df = parse_single_stats_set('bam/*.hisat3n_dna.multi_align.deduped.dna_reads.reads_mch_frac.csv',
 88 |                                 cell_parser_reads_mc_frac_profile, prefix='MultiAlign')
 89 |     all_stats.append(df)
 90 | 
 91 |     # uniquely mapped rna reads selection
 92 |     df = parse_single_stats_set('rna_bam/*.hisat3n_rna.unique_align.rna_reads.reads_mch_frac.csv',
 93 |                                 cell_parser_reads_mc_frac_profile)
 94 |     all_stats.append(df)
 95 | 
 96 |     # allc count
 97 |     df = parse_single_stats_set(f'allc/*.allc.tsv.gz.count.csv',
 98 |                                 cell_parser_allc_count)
 99 |     all_stats.append(df)
100 | 
101 |     # feature count
102 |     df = parse_single_stats_set(f'rna_bam/*.feature_count.tsv.summary',
103 |                                 cell_parser_feature_count_summary)
104 |     all_stats.append(df)
105 | 
106 |     # concatenate all stats
107 |     all_stats = pd.concat(all_stats, axis=1)
108 |     all_stats.index.name = 'cell'
109 |     all_stats.to_csv(f'MappingSummary.csv.gz')
110 |     return all_stats
111 | 
112 | 
113 | def snm3c_summary():
114 |     """
115 |     Generate snm3C pipeline MappingSummary.csv.gz and save into cwd
116 | 
117 |     Returns
118 |     -------
119 |     pd.DataFrame
120 |     """
121 |     all_stats = []
122 | 
123 |     # fastq trimming stats
124 |     df = parse_single_stats_set(f'fastq/*.trimmed.stats.txt',
125 |                                 cell_parser_cutadapt_trim_stats)
126 |     all_stats.append(df)
127 | 
128 |     # hisat-3n mapping PE
129 |     df = parse_single_stats_set(f'bam/*.hisat3n_dna_summary.txt',
130 |                                 cell_parser_hisat_summary)
131 |     all_stats.append(df)
132 | 
133 |     # hisat-3n mapping split-reads SE
134 |     df = parse_single_stats_set(f'bam/*.hisat3n_dna_split_reads_summary.txt',
135 |                                 cell_parser_hisat_summary, prefix='SplitReads')
136 |     all_stats.append(df)
137 | 
138 |     # uniquely mapped reads dedup
139 |     df = parse_single_stats_set(f'bam/*.all_reads.deduped.matrix.txt',
140 |                                 cell_parser_picard_dedup_stat, prefix='UniqueAlign')
141 |     all_stats.append(df)
142 | 
143 |     # call chromatin contacts
144 |     df = parse_single_stats_set(f'hic/*.all_reads.contact_stats.csv',
145 |                                 cell_parser_call_chromatin_contacts)
146 |     all_stats.append(df)
147 | 
148 |     # allc count
149 |     df = parse_single_stats_set(f'allc/*.allc.tsv.gz.count.csv',
150 |                                 cell_parser_allc_count)
151 |     all_stats.append(df)
152 | 
153 |     # concatenate all stats
154 |     all_stats = pd.concat(all_stats, axis=1)
155 |     all_stats.index.name = 'cell'
156 |     all_stats.to_csv(f'MappingSummary.csv.gz')
157 |     return all_stats
158 | 


--------------------------------------------------------------------------------
/cemba_data/hisat3n/utilities.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | import re
 3 | import yaml
 4 | import pandas as pd
 5 | 
 6 | from ..utilities import get_configuration
 7 | 
 8 | 
 9 | def _read_yaml_config(config_path):
10 |     with open(config_path, 'r') as f:
11 |         config = yaml.safe_load(f)
12 |     return config
13 | 
14 | 
15 | def _read_ini_config(config_path):
16 |     return get_configuration(config_path)
17 | 
18 | 
19 | def read_mapping_config(cwd: str = '.'):
20 |     tried = []
21 |     yaml_path = None
22 |     for name in ['config', 'mapping_config']:
23 |         for config_dir in [cwd, f'{cwd}/..']:
24 |             for suffix in ['yaml', 'yml']:
25 |                 path = f'{config_dir}/{name}.{suffix}'
26 |                 tried.append(path)
27 |                 if pathlib.Path(path).exists():
28 |                     yaml_path = path
29 |     default_path = f'~/mapping_config.yaml'
30 |     if pathlib.Path(default_path).exists():
31 |         yaml_path = default_path
32 | 
33 |     ini_path = None
34 |     for name in ['config', 'mapping_config']:
35 |         for config_dir in [cwd, f'{cwd}/..']:
36 |             path = f'{config_dir}/{name}.ini'
37 |             tried.append(path)
38 |             if pathlib.Path(path).exists():
39 |                 ini_path = path
40 | 
41 |     if yaml_path is not None:
42 |         config = _read_yaml_config(yaml_path)
43 |     elif ini_path is not None:
44 |         config = _read_ini_config(ini_path)
45 |     else:
46 |         config = {}
47 |     return config
48 | 
49 | 
50 | def validate_cwd_fastq_paths(cwd: str = '.'):
51 |     """
52 |     Validate fastq paths in the fastq subdirectory of cwd.
53 |     Parameters
54 |     ----------
55 |     cwd :
56 |         Path of the current working directory.
57 | 
58 |     Returns
59 |     -------
60 |     fastq_table : pandas.DataFrame
61 |     """
62 |     # get all fastq file paths
63 |     fastq_paths = [p
64 |                    for p in pathlib.Path(f'{cwd}/fastq/').glob('*.[fq.gz][fastq.gz]')
65 |                    if 'trim' not in p.name]
66 | 
67 |     # parse cell id and match fastq pairs
68 |     fastq_pattern = re.compile(r'(?P<cell_id>.+)(-|_)(?P<read_type>(R1|R2|r1|r2)).(fastq|fq)(.gz)*')
69 |     fastq_records = {}
70 |     for p in fastq_paths:
71 |         match = fastq_pattern.match(p.name)
72 |         if match is None:
73 |             # print(f'WARNING: {p} has FASTQ file path suffix, but do not match '
74 |             #       f'expected file name pattern {fastq_pattern}')
75 |             pass
76 |         else:
77 |             cell_id = match.group('cell_id')
78 |             read_type = match.group('read_type')
79 |             fastq_records[cell_id, read_type.upper()] = str(p)
80 | 
81 |     if len(fastq_records) == 0:
82 |         raise ValueError('No fastq files found in fastq folder, '
83 |                          'or no fastq files match expected file name pattern')
84 | 
85 |     fastq_table = pd.Series(fastq_records).unstack()
86 |     if 'R1' not in fastq_table.columns or 'R2' not in fastq_table.columns:
87 |         raise ValueError('No R1 or R2 fastq files found')
88 |     fastq_table = fastq_table[['R1', 'R2']].copy()
89 | 
90 |     # raise error if fastq file not paired
91 |     missing_file = fastq_table.isna().sum(axis=1) > 0
92 |     if missing_file.sum() > 0:
93 |         for cell in missing_file[missing_file].index:
94 |             print(f'{cell} missing R1 or R2 FASTQ file.')
95 |         raise FileNotFoundError(f'FASTQ files in {pathlib.Path(f"{cwd}/fastq/").absolute()} is not all paired.')
96 |     return fastq_table
97 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/Snakefile_template/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/mapping/Snakefile_template/__init__.py


--------------------------------------------------------------------------------
/cemba_data/mapping/Snakefile_template/mc.Snakefile:
--------------------------------------------------------------------------------
  1 | 
  2 | # Snakemake rules below
  3 | # suitable for snmC-seq2, snmC-seq3, NOMe-seq
  4 | 
  5 | # use diff mcg_context for normal mC or NOMe
  6 | mcg_context = 'CGN' if num_upstr_bases == 0 else 'HCGN'
  7 | 
  8 | # the summary rule is the final target
  9 | rule summary:
 10 |     input:
 11 |         expand("allc/{cell_id}.allc.tsv.gz", cell_id=CELL_IDS),
 12 |         expand("allc-{mcg_context}/{cell_id}.{mcg_context}-Merge.allc.tsv.gz", cell_id=CELL_IDS,
 13 |                mcg_context=mcg_context),
 14 |         # also add all the stats path here,
 15 |         # once summary is generated, snakemake will delete these stats
 16 |         expand("allc/{cell_id}.allc.tsv.gz.count.csv", cell_id=CELL_IDS),
 17 |         expand("fastq/{cell_id}-R1.trimmed.stats.tsv", cell_id=CELL_IDS),
 18 |         expand("fastq/{cell_id}-R2.trimmed.stats.tsv", cell_id=CELL_IDS),
 19 |         expand("bam/{cell_id}-R1.trimmed_bismark_bt2.deduped.matrix.txt", cell_id=CELL_IDS),
 20 |         expand("bam/{cell_id}-R2.trimmed_bismark_bt2.deduped.matrix.txt", cell_id=CELL_IDS),
 21 |         expand("bam/{cell_id}-R1.trimmed_bismark_bt2_SE_report.txt", cell_id=CELL_IDS),
 22 |         expand("bam/{cell_id}-R2.trimmed_bismark_bt2_SE_report.txt", cell_id=CELL_IDS),
 23 |     output:
 24 |         "MappingSummary.csv.gz"
 25 |     shell:
 26 |         "yap-internal summary --output_dir ./"
 27 | 
 28 | # Trim reads
 29 | rule trim_r1:
 30 |     input:
 31 |         "fastq/{cell_id}-R1.fq.gz"
 32 |     output:
 33 |         fq=temp("fastq/{cell_id}-R1.trimmed.fq.gz"),
 34 |         stats=temp("fastq/{cell_id}-R1.trimmed.stats.tsv")
 35 |     threads:
 36 |         2
 37 |     shell:
 38 |         "cutadapt --report=minimal -a {r1_adapter} {input} 2> {output.stats} | "
 39 |         "cutadapt --report=minimal -O 6 -q 20 -u {r1_left_cut} -u -{r1_right_cut} -m 30 "
 40 |         "-o {output.fq} - >> {output.stats}"
 41 | 
 42 | rule trim_r2:
 43 |     input:
 44 |         "fastq/{cell_id}-R2.fq.gz"
 45 |     output:
 46 |         fq=temp("fastq/{cell_id}-R2.trimmed.fq.gz"),
 47 |         stats=temp("fastq/{cell_id}-R2.trimmed.stats.tsv")
 48 |     threads:
 49 |         2
 50 |     shell:
 51 |         "cutadapt --report=minimal -a {r2_adapter} {input} 2> {output.stats} | "
 52 |         "cutadapt --report=minimal -O 6 -q 20 -u {r2_left_cut} -u -{r2_right_cut} -m 30 "
 53 |         "-o {output.fq} - >> {output.stats}"
 54 | 
 55 | # bismark mapping, R1 and R2 separately
 56 | rule bismark_r1:
 57 |     input:
 58 |         "fastq/{cell_id}-R1.trimmed.fq.gz"
 59 |     output:
 60 |         bam=temp("bam/{cell_id}-R1.trimmed_bismark_bt2.bam"),
 61 |         stats=temp("bam/{cell_id}-R1.trimmed_bismark_bt2_SE_report.txt")
 62 |     threads:
 63 |         3
 64 |     resources:
 65 |         mem_mb=14000
 66 |     shell:
 67 |         # map R1 with --pbat mode
 68 |         "bismark {bismark_reference} {unmapped_param_str} --bowtie2 {input} "
 69 |         "--pbat -o bam/ --temp_dir bam/"
 70 | 
 71 | rule bismark_r2:
 72 |     input:
 73 |         "fastq/{cell_id}-R2.trimmed.fq.gz"
 74 |     output:
 75 |         bam=temp("bam/{cell_id}-R2.trimmed_bismark_bt2.bam"),
 76 |         stats=temp("bam/{cell_id}-R2.trimmed_bismark_bt2_SE_report.txt")
 77 |     threads:
 78 |         3
 79 |     resources:
 80 |         mem_mb=14000
 81 |     shell:
 82 |         # map R2 with normal SE mode
 83 |         "bismark {bismark_reference} {unmapped_param_str} --bowtie2 {input} "
 84 |         "-o bam/ --temp_dir bam/"
 85 | 
 86 | # filter bam
 87 | rule filter_r1_bam:
 88 |     input:
 89 |         "bam/{cell_id}-R1.trimmed_bismark_bt2.bam"
 90 |     output:
 91 |         temp("bam/{cell_id}-R1.trimmed_bismark_bt2.filter.bam")
 92 |     shell:
 93 |         "samtools view -b -h -q 10 -o {output} {input}"
 94 | 
 95 | rule filter_r2_bam:
 96 |     input:
 97 |         "bam/{cell_id}-R2.trimmed_bismark_bt2.bam"
 98 |     output:
 99 |         temp("bam/{cell_id}-R2.trimmed_bismark_bt2.filter.bam")
100 |     shell:
101 |         "samtools view -b -h -q 10 -o {output} {input}"
102 | 
103 | # sort bam
104 | rule sort_r1_bam:
105 |     input:
106 |         "bam/{cell_id}-R1.trimmed_bismark_bt2.filter.bam"
107 |     output:
108 |         temp("bam/{cell_id}-R1.trimmed_bismark_bt2.sorted.bam")
109 |     resources:
110 |         mem_mb=1000
111 |     shell:
112 |         "samtools sort -o {output} {input}"
113 | 
114 | rule sort_r2_bam:
115 |     input:
116 |         "bam/{cell_id}-R2.trimmed_bismark_bt2.filter.bam"
117 |     output:
118 |         temp("bam/{cell_id}-R2.trimmed_bismark_bt2.sorted.bam")
119 |     resources:
120 |         mem_mb=1000
121 |     shell:
122 |         "samtools sort -o {output} {input}"
123 | 
124 | # remove PCR duplicates
125 | rule dedup_r1_bam:
126 |     input:
127 |         "bam/{cell_id}-R1.trimmed_bismark_bt2.sorted.bam"
128 |     output:
129 |         bam=temp("bam/{cell_id}-R1.trimmed_bismark_bt2.deduped.bam"),
130 |         stats=temp("bam/{cell_id}-R1.trimmed_bismark_bt2.deduped.matrix.txt")
131 |     resources:
132 |         mem_mb=1000
133 |     shell:
134 |         "picard MarkDuplicates I={input} O={output.bam} M={output.stats} "
135 |         "REMOVE_DUPLICATES=true TMP_DIR=bam/temp/"
136 | 
137 | rule dedup_r2_bam:
138 |     input:
139 |         "bam/{cell_id}-R2.trimmed_bismark_bt2.sorted.bam"
140 |     output:
141 |         bam=temp("bam/{cell_id}-R2.trimmed_bismark_bt2.deduped.bam"),
142 |         stats=temp("bam/{cell_id}-R2.trimmed_bismark_bt2.deduped.matrix.txt")
143 |     resources:
144 |         mem_mb=1000
145 |     shell:
146 |         "picard MarkDuplicates I={input} O={output.bam} M={output.stats} "
147 |         "REMOVE_DUPLICATES=true TMP_DIR=bam/temp/"
148 | 
149 | # merge R1 and R2, get final bam
150 | rule merge_bam:
151 |     input:
152 |         "bam/{cell_id}-R1.trimmed_bismark_bt2.deduped.bam",
153 |         "bam/{cell_id}-R2.trimmed_bismark_bt2.deduped.bam"
154 |     output:
155 |         "bam/{cell_id}.final.bam"
156 |     shell:
157 |         "samtools merge -f {output} {input}"
158 | 
159 | # generate ALLC
160 | rule allc:
161 |     input:
162 |         "bam/{cell_id}.final.bam"
163 |     output:
164 |         allc="allc/{cell_id}.allc.tsv.gz",
165 |         stats=temp("allc/{cell_id}.allc.tsv.gz.count.csv")
166 |     threads:
167 |         2
168 |     resources:
169 |         mem_mb=500
170 |     shell:
171 |         'allcools bam-to-allc '
172 |         '--bam_path {input} '
173 |         '--reference_fasta {reference_fasta} '
174 |         '--output_path {output.allc} '
175 |         '--cpu 1 '
176 |         '--num_upstr_bases {num_upstr_bases} '
177 |         '--num_downstr_bases {num_downstr_bases} '
178 |         '--compress_level {compress_level} '
179 |         '--save_count_df'
180 | 
181 | 
182 | # CGN extraction from ALLC
183 | rule cgn_extraction:
184 |     input:
185 |         "allc/{cell_id}.allc.tsv.gz",
186 |     output:
187 |         "allc-{mcg_context}/{cell_id}.{mcg_context}-Merge.allc.tsv.gz",
188 |     params:
189 |         prefix="allc-{mcg_context}/{cell_id}",
190 |     threads:
191 |         1
192 |     resources:
193 |         mem_mb=100
194 |     shell:
195 |         'allcools extract-allc '
196 |         '--strandness merge '
197 |         '--allc_path  {input} '
198 |         '--output_prefix {params.prefix} '
199 |         '--mc_contexts {mcg_context} '
200 |         '--chrom_size_path {chrom_size_path} '
201 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import print_default_mapping_config
2 | from .pipelines import prepare_run, start_from_cell_fastq
3 | from .stats import final_summary
4 | from .stats.plot import *
5 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/config.py:
--------------------------------------------------------------------------------
  1 | import pathlib
  2 | 
  3 | import cemba_data
  4 | from ..utilities import MAPPING_MODE_CHOICES
  5 | 
  6 | # Load defaults
  7 | PACKAGE_DIR = pathlib.Path(cemba_data.__path__[0])
  8 | 
  9 | 
 10 | def print_default_mapping_config(mode,
 11 |                                  barcode_version,
 12 |                                  genome_fasta,
 13 |                                  bismark_ref=None,
 14 |                                  hisat3n_dna_ref=None,
 15 |                                  hisat3n_rna_ref=None,
 16 |                                  star_ref=None,
 17 |                                  gtf=None,
 18 |                                  nome=False,
 19 |                                  chrom_size_path=None):
 20 |     mode = mode.lower()
 21 |     if mode not in MAPPING_MODE_CHOICES:
 22 |         raise ValueError(f'Unknown mode {mode}')
 23 | 
 24 |     barcode_version = barcode_version.upper()
 25 |     if barcode_version not in ['V1', 'V2']:
 26 |         raise ValueError(f'Unknown mode {barcode_version}')
 27 | 
 28 |     if bismark_ref is not None:
 29 |         bismark_ref = pathlib.Path(bismark_ref).absolute()
 30 |     else:
 31 |         if hisat3n_rna_ref is None or hisat3n_dna_ref is None:
 32 |             raise ValueError('bismark_ref is required if hisat3n_rna_ref and hisat3n_dna_ref are not specified.')
 33 |         hisat3n_rna_ref = pathlib.Path(hisat3n_rna_ref).absolute()
 34 |         hisat3n_dna_ref = pathlib.Path(hisat3n_dna_ref).absolute()
 35 | 
 36 |     if mode == 'mct':
 37 |         if star_ref is None:
 38 |             if hisat3n_rna_ref is None:
 39 |                 raise ValueError('star_ref or hisat3n_rna_ref is required if mode is mct.')
 40 |         else:
 41 |             star_ref = pathlib.Path(star_ref).absolute()
 42 |         if gtf is None:
 43 |             raise ValueError('gtf must be provided when mode is mct.')
 44 |         gtf = pathlib.Path(gtf).absolute()
 45 | 
 46 |     if chrom_size_path is None:
 47 |         raise ValueError('chrom_size_path must be provided.')
 48 |     chrom_size_path = pathlib.Path(chrom_size_path).absolute()
 49 | 
 50 |     if mode == 'm3c':
 51 |         pass
 52 | 
 53 |     if mode == '4m':
 54 |         if (star_ref is None) and (hisat3n_rna_ref is None):
 55 |             raise ValueError('star_ref or hisat3n_rna_ref is required if mode is mct.')
 56 |         star_ref = pathlib.Path(star_ref).absolute()
 57 | 
 58 |         if gtf is None:
 59 |             raise ValueError('gtf must be provided when mode is mct.')
 60 |         gtf = pathlib.Path(gtf).absolute()
 61 | 
 62 |     genome_fasta = pathlib.Path(genome_fasta).absolute()
 63 | 
 64 |     if mode == 'mc':
 65 |         if nome:
 66 |             config_path = PACKAGE_DIR / 'files/default_config/mapping_config_nome.ini'
 67 |         else:
 68 |             config_path = PACKAGE_DIR / 'files/default_config/mapping_config_mc.ini'
 69 |         with open(config_path) as f:
 70 |             config_content = f.read()
 71 |     elif mode == 'mct':
 72 |         if nome:
 73 |             config_path = PACKAGE_DIR / 'files/default_config/mapping_config_mct-nome.ini'
 74 |         else:
 75 |             config_path = PACKAGE_DIR / 'files/default_config/mapping_config_mct.ini'
 76 |         with open(config_path) as f:
 77 |             config_content = f.read()
 78 |         if hisat3n_rna_ref is None:
 79 |             config_content = config_content.replace('CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR', str(star_ref))
 80 |         else:
 81 |             config_content = config_content.replace('CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE',
 82 |                                                     str(hisat3n_rna_ref))
 83 |         config_content = config_content.replace('CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF', str(gtf))
 84 |     elif mode == 'm3c':
 85 |         config_path = PACKAGE_DIR / 'files/default_config/mapping_config_m3c.ini'
 86 |         with open(config_path) as f:
 87 |             config_content = f.read()
 88 |     elif mode == '4m':
 89 |         config_path = PACKAGE_DIR / 'files/default_config/mapping_config_4m.ini'
 90 |         with open(config_path) as f:
 91 |             config_content = f.read()
 92 |         if hisat3n_rna_ref is None:
 93 |             config_content = config_content.replace('CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR', str(star_ref))
 94 |         else:
 95 |             config_content = config_content.replace('CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE',
 96 |                                                     str(hisat3n_rna_ref))
 97 |         config_content = config_content.replace('CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF', str(gtf))
 98 |         config_content = config_content.replace('CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH', str(chrom_size_path))
 99 |     else:
100 |         raise
101 | 
102 |     config_content = config_content.replace('CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH', str(chrom_size_path))
103 |     config_content = config_content.replace('USE_CORRECT_BARCODE_VERSION_HERE', barcode_version)
104 |     if hisat3n_dna_ref is None:
105 |         config_content = config_content.replace('CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR', str(bismark_ref))
106 |     else:
107 |         config_content = config_content.replace('CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE', str(hisat3n_dna_ref))
108 |     config_content = config_content.replace('CHANGE_THIS_TO_YOUR_REFERENCE_FASTA', str(genome_fasta))
109 |     print(config_content)
110 |     return
111 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/mct/__init__.py:
--------------------------------------------------------------------------------
1 | from .mct_bismark_bam_filter import select_dna_reads
2 | from .mct_star_bam_filter import select_rna_reads
3 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/mct/mct_bismark_bam_filter.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | import re
  3 | import pysam
  4 | import pandas as pd
  5 | 
  6 | 
  7 | def read_mc_level(read, frac=True, nome=False):
  8 |     bismark_tag = read.get_tag('XM')
  9 |     if nome:
 10 |         m_c = 0
 11 |         normal_c = 0
 12 |         seq = read.seq.upper()
 13 |         read_length = len(seq)
 14 |         for pos, xm_base in enumerate(bismark_tag):
 15 |             if xm_base in '.ZzUu':
 16 |                 # skip unrelated base (.), CpG (Zz), CpUnknown (Uu)
 17 |                 continue
 18 |             # Skip GpC
 19 |             try:
 20 |                 if read.is_reverse:
 21 |                     if (pos == read_length) or (read.seq[pos + 1] == 'C'):
 22 |                         continue
 23 |                 else:
 24 |                     if (pos == 0) or (read.seq[pos - 1] == 'G'):
 25 |                         continue
 26 |             except IndexError:
 27 |                 # start or end of the read
 28 |                 continue
 29 |             if xm_base in 'xh':
 30 |                 normal_c += 1
 31 |             elif xm_base in 'XH':
 32 |                 m_c += 1
 33 |             else:
 34 |                 pass
 35 |     else:
 36 |         m_c = bismark_tag.count('X') + bismark_tag.count('H')
 37 |         normal_c = bismark_tag.count('x') + bismark_tag.count('h')
 38 | 
 39 |     total_c = m_c + normal_c
 40 |     if total_c == 0:
 41 |         return 0, 0
 42 |     else:
 43 |         if frac:
 44 |             read_mc_rate = m_c / total_c
 45 |             return read_mc_rate, total_c
 46 |         else:
 47 |             return m_c, total_c
 48 | 
 49 | 
 50 | def select_dna_reads_normal(input_bam,
 51 |                             output_bam,
 52 |                             mc_rate_max_threshold=0.5,
 53 |                             cov_min_threshold=3,
 54 |                             nome=False):
 55 |     read_profile_dict = defaultdict(int)
 56 |     # init dict to make sure the series has something
 57 |     read_profile_dict[(50, 50)] = 0
 58 |     with pysam.AlignmentFile(input_bam) as f:
 59 |         with pysam.AlignmentFile(output_bam, header=f.header,
 60 |                                  mode='wb') as out_f:
 61 |             for read in f:
 62 |                 mc_frac, cov = read_mc_level(read, nome=nome)
 63 |                 read_profile_dict[(int(100 * mc_frac), cov)] += 1
 64 | 
 65 |                 # split reads
 66 |                 if (mc_frac > mc_rate_max_threshold) or (cov <
 67 |                                                          cov_min_threshold):
 68 |                     continue
 69 |                 out_f.write(read)
 70 |     with open(str(output_bam) + '.reads_profile.csv', 'w') as stat_f:
 71 |         stat_f.write('mc_frac,cov,count\n')
 72 |         for (mc_frac, cov), count in read_profile_dict.items():
 73 |             stat_f.write(f'{mc_frac},{cov},{count}\n')
 74 |     return
 75 | 
 76 | 
 77 | def select_dna_reads_split_reads(input_bam,
 78 |                                  output_bam,
 79 |                                  mc_rate_max_threshold=0.5,
 80 |                                  cov_min_threshold=3,
 81 |                                  nome=False):
 82 |     splited_read_name_pattern = re.compile('.+-[lrm]$')
 83 | 
 84 |     # first pass: determine read methylation level
 85 |     read_level_mcs = defaultdict(int)
 86 |     read_level_covs = defaultdict(int)
 87 |     with pysam.AlignmentFile(input_bam) as f:
 88 |         for read in f:
 89 |             mc, cov = read_mc_level(read, frac=False, nome=nome)
 90 |             read_name = read.qname
 91 |             if splited_read_name_pattern.search(read_name):
 92 |                 read_level_mcs[read_name[:-2]] += mc
 93 |                 read_level_covs[read_name[:-2]] += cov
 94 |             else:
 95 |                 read_level_mcs[read_name] += mc
 96 |                 read_level_covs[read_name] += cov
 97 |     read_level_data = pd.DataFrame({
 98 |         'mc': read_level_mcs,
 99 |         'cov': read_level_covs
100 |     })
101 |     read_level_data['mc_frac'] = read_level_data['mc'] / (read_level_data['cov'] +
102 |                                                        0.001)
103 |     read_level_data['mc_frac'] = (read_level_data['mc_frac'] * 100).astype(int)
104 |     if read_level_data.shape[0] == 0:
105 |         # in case there is no read at all:
106 |         with open(f'{output_bam}.reads_profile.csv', 'w') as f:
107 |             f.write('mc_frac,cov,count\n')
108 |             f.write('0,1,0\n')
109 |     else:
110 |         profile = read_level_data.groupby('mc_frac')['cov'].value_counts()
111 |         profile.name = 'count'
112 |         profile = profile.reset_index()
113 |         profile.to_csv(f'{output_bam}.reads_profile.csv', index=None)
114 | 
115 |     # filter reads
116 |     use_reads = read_level_data[
117 |         (read_level_data['mc_frac'] < mc_rate_max_threshold)
118 |         & (read_level_data['cov'] >= cov_min_threshold)].index.tolist()
119 |     use_reads = set(use_reads)
120 |     del read_level_data
121 | 
122 |     # second pass: write passed reads
123 |     with pysam.AlignmentFile(input_bam) as f:
124 |         with pysam.AlignmentFile(output_bam, header=f.header,
125 |                                  mode='wb') as out_f:
126 |             for read in f:
127 |                 read_name = read.qname
128 |                 if (read_name in use_reads) or (read_name[:-2] in use_reads):
129 |                     # read name or read name without suffix
130 |                     out_f.write(read)
131 |     return
132 | 
133 | 
134 | def select_dna_reads(input_bam,
135 |                      output_bam,
136 |                      mc_rate_max_threshold=0.5,
137 |                      cov_min_threshold=3,
138 |                      nome=False,
139 |                      assay_type='mc'):
140 |     if assay_type == 'mc':
141 |         select_dna_reads_normal(input_bam,
142 |                                 output_bam,
143 |                                 mc_rate_max_threshold=mc_rate_max_threshold,
144 |                                 cov_min_threshold=cov_min_threshold,
145 |                                 nome=nome)
146 |     elif assay_type == 'm3c':
147 |         select_dna_reads_split_reads(input_bam,
148 |                                      output_bam,
149 |                                      mc_rate_max_threshold=mc_rate_max_threshold,
150 |                                      cov_min_threshold=cov_min_threshold,
151 |                                      nome=nome)
152 |     else:
153 |         raise ValueError(f'Unknown assay_type {assay_type}.')
154 |     return
155 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/pipelines/_4m.py:
--------------------------------------------------------------------------------
 1 | def _4m_config_str(config):
 2 |     """Change the dtype of parameters and make a appropriate string"""
 3 |     int_parameters = {
 4 |         'overlap': 6,
 5 |         'r1_left_cut': 10,
 6 |         'r1_right_cut': 10,
 7 |         'r2_left_cut': 10,
 8 |         'r2_right_cut': 10,
 9 |         'quality_threshold': 20,
10 |         'length_threshold': 30,
11 |         'total_read_pairs_min': 1,
12 |         'total_read_pairs_max': 6000000,
13 |         'mapq_threshold': 10,
14 |         'num_upstr_bases': 0,
15 |         'num_downstr_bases': 2,
16 |         'compress_level': 5,
17 |         'dna_cov_min_threshold': 3,
18 |         'rna_cov_min_threshold': 3,
19 |         'split_left_size': 40,
20 |         'split_right_size': 40,
21 |         'split_middle_min_size': 30,
22 |         'min_gap': 2500,
23 |         'trim_on_both_end': 5
24 |     }
25 | 
26 |     float_parameters = {
27 |         'mc_rate_max_threshold': 0.5,
28 |         'mc_rate_min_threshold': 0.9
29 |     }
30 | 
31 |     str_parameters = {
32 |         'mode': 'mc',
33 |         'barcode_version': 'required',
34 |         'r1_adapter': 'AGATCGGAAGAGCACACGTCTGAAC',
35 |         'r2_adapter': 'AGATCGGAAGAGCGTCGTGTAGGGA',
36 |         'bismark_reference': 'required',
37 |         'reference_fasta': 'required',
38 |         'star_reference': 'required',
39 |         'hisat3n_dna_reference': 'required',
40 |         'hisat3n_rna_reference': 'required',
41 |         'hisat3n_repeat_index_type': 'no-repeat',
42 |         'gtf_path': 'required',
43 |         'feature_type': 'gene',
44 |         'id_type': 'gene_id',
45 |         'mc_stat_feature': 'CHN CGN CCC',
46 |         'mc_stat_alias': 'mCH mCG mCCC',
47 |         'chrom_size_path': 'required',
48 |         'nome_flag_str': '--nome'
49 |     }
50 |     if 'hisat3n_dna_reference' in config:
51 |         del str_parameters['bismark_reference']
52 |         del str_parameters['star_reference']
53 | 
54 |     typed_config = {}
55 |     for k, default in int_parameters.items():
56 |         if k in config:
57 |             typed_config[k] = int(config[k])
58 |         else:
59 |             if default != 'required':
60 |                 typed_config[k] = default
61 |             else:
62 |                 raise ValueError(f'Required parameter {k} not found in config. '
63 |                                  f'You can print the newest mapping config template via "yap default-mapping-config".')
64 | 
65 |     for k, default in float_parameters.items():
66 |         if k in config:
67 |             typed_config[k] = float(config[k])
68 |         else:
69 |             if default != 'required':
70 |                 typed_config[k] = default
71 |             else:
72 |                 raise ValueError(f'Required parameter {k} not found in config.')
73 | 
74 |     for k, default in str_parameters.items():
75 |         if k in config:
76 |             typed_config[k] = f"'{config[k]}'"
77 |         else:
78 |             if default != 'required':
79 |                 typed_config[k] = f"'{default}'"
80 |             else:
81 |                 raise ValueError(f'Required parameter {k} not found in config. '
82 |                                  f'You can print the newest mapping config template via "yap default-mapping-config".')
83 | 
84 |     config_str = ""
85 |     for k, v in typed_config.items():
86 |         config_str += f"{k} = {v}\n"
87 |     return config_str
88 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/pipelines/m3c.py:
--------------------------------------------------------------------------------
 1 | def m3c_config_str(config):
 2 |     """Change the dtype of parameters and make a appropriate string"""
 3 |     int_parameters = {
 4 |         'overlap': 6,
 5 |         'r1_left_cut': 10,
 6 |         'r1_right_cut': 10,
 7 |         'r2_left_cut': 10,
 8 |         'r2_right_cut': 10,
 9 |         'quality_threshold': 20,
10 |         'length_threshold': 30,
11 |         'total_read_pairs_min': 1,
12 |         'total_read_pairs_max': 6000000,
13 |         'mapq_threshold': 10,
14 |         'num_upstr_bases': 0,
15 |         'num_downstr_bases': 2,
16 |         'compress_level': 5,
17 |         'split_left_size': 40,
18 |         'split_right_size': 40,
19 |         'split_middle_min_size': 30,
20 |         'min_gap': 2500,
21 |         'trim_on_both_end': 5
22 |     }
23 | 
24 |     str_parameters = {
25 |         'mode': 'mc',
26 |         'barcode_version': 'required',
27 |         'r1_adapter': 'AGATCGGAAGAGCACACGTCTGAAC',
28 |         'r2_adapter': 'AGATCGGAAGAGCGTCGTGTAGGGA',
29 |         'bismark_reference': 'required',
30 |         'hisat3n_dna_reference': 'required',
31 |         'hisat3n_repeat_index_type': 'no-repeat',
32 |         'reference_fasta': 'required',
33 |         'mc_stat_feature': 'CHN CGN CCC',
34 |         'mc_stat_alias': 'mCH mCG mCCC',
35 |         'chrom_size_path': 'required'
36 |     }
37 |     if 'hisat3n_dna_reference' in config:
38 |         del str_parameters['bismark_reference']
39 |     else:
40 |         del str_parameters['hisat3n_dna_reference']
41 |         del str_parameters['hisat3n_repeat_index_type']
42 | 
43 |     typed_config = {}
44 |     for k, default in int_parameters.items():
45 |         if k in config:
46 |             typed_config[k] = int(config[k])
47 |         else:
48 |             if default != 'required':
49 |                 typed_config[k] = default
50 |             else:
51 |                 raise ValueError(f'Required parameter {k} not found in config. '
52 |                                  f'You can print the newest mapping config template via "yap default-mapping-config".')
53 | 
54 |     for k, default in str_parameters.items():
55 |         if k in config:
56 |             typed_config[k] = f"'{config[k]}'"
57 |         else:
58 |             if default != 'required':
59 |                 typed_config[k] = f"'{default}'"
60 |             else:
61 |                 raise ValueError(f'Required parameter {k} not found in config. '
62 |                                  f'You can print the newest mapping config template via "yap default-mapping-config".')
63 | 
64 |     config_str = ""
65 |     for k, v in typed_config.items():
66 |         config_str += f"{k} = {v}\n"
67 |     return config_str
68 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/pipelines/mc.py:
--------------------------------------------------------------------------------
 1 | def mc_config_str(config):
 2 |     """Change the dtype of parameters and make a appropriate string"""
 3 |     int_parameters = {
 4 |         'overlap': 6,
 5 |         'r1_left_cut': 10,
 6 |         'r1_right_cut': 10,
 7 |         'r2_left_cut': 10,
 8 |         'r2_right_cut': 10,
 9 |         'quality_threshold': 20,
10 |         'length_threshold': 30,
11 |         'total_read_pairs_min': 1,
12 |         'total_read_pairs_max': 6000000,
13 |         'mapq_threshold': 10,
14 |         'num_upstr_bases': 0,
15 |         'num_downstr_bases': 2,
16 |         'compress_level': 5
17 |     }
18 | 
19 |     bool_parameters = {'unmapped_fastq': False}
20 | 
21 |     str_parameters = {
22 |         'mode': 'mc',
23 |         'barcode_version': 'required',
24 |         'r1_adapter': 'AGATCGGAAGAGCACACGTCTGAAC',
25 |         'r2_adapter': 'AGATCGGAAGAGCGTCGTGTAGGGA',
26 |         'bismark_reference': 'required',
27 |         'hisat3n_dna_reference': 'required',
28 |         'hisat3n_repeat_index_type': 'no-repeat',
29 |         'reference_fasta': 'required',
30 |         'chrom_size_path': 'required',
31 |         'mc_stat_feature': 'CHN CGN CCC',
32 |         'mc_stat_alias': 'mCH mCG mCCC'
33 |     }
34 |     if 'hisat3n_dna_reference' in config:
35 |         del str_parameters['bismark_reference']
36 |     else:
37 |         del str_parameters['hisat3n_dna_reference']
38 |         del str_parameters['hisat3n_repeat_index_type']
39 | 
40 |     typed_config = {}
41 |     for k, default in int_parameters.items():
42 |         if k in config:
43 |             typed_config[k] = int(config[k])
44 |         else:
45 |             if default != 'required':
46 |                 typed_config[k] = default
47 |             else:
48 |                 raise ValueError(f'Required parameter {k} not found in config.')
49 | 
50 |     for k, default in bool_parameters.items():
51 |         if k in config:
52 |             v = config[k]
53 |             if v.lower().startswith('t'):
54 |                 v = True
55 |             else:
56 |                 v = False
57 |             typed_config[k] = v
58 |         else:
59 |             if default != 'required':
60 |                 typed_config[k] = default
61 |             else:
62 |                 raise ValueError(f'Required parameter {k} not found in config. '
63 |                                  f'You can print the newest mapping config template via "yap default-mapping-config".')
64 |     # judge unmapped_fastq specifically
65 |     unmapped_param_str = '--un' if typed_config['unmapped_fastq'] else ''
66 |     typed_config['unmapped_param_str'] = f"'{unmapped_param_str}'"
67 | 
68 |     for k, default in str_parameters.items():
69 |         if k in config:
70 |             typed_config[k] = f"'{config[k]}'"
71 |         else:
72 |             if default != 'required':
73 |                 typed_config[k] = f"'{default}'"
74 |             else:
75 |                 raise ValueError(f'Required parameter {k} not found in config. '
76 |                                  f'You can print the newest mapping config template via "yap default-mapping-config".')
77 | 
78 |     config_str = ""
79 |     for k, v in typed_config.items():
80 |         config_str += f"{k} = {v}\n"
81 |     return config_str
82 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/pipelines/mct.py:
--------------------------------------------------------------------------------
  1 | def mct_config_str(config):
  2 |     """Change the dtype of parameters and make a appropriate string"""
  3 |     int_parameters = {
  4 |         'overlap': 6,
  5 |         'r1_left_cut': 10,
  6 |         'r1_right_cut': 10,
  7 |         'r2_left_cut': 10,
  8 |         'r2_right_cut': 10,
  9 |         'quality_threshold': 20,
 10 |         'length_threshold': 30,
 11 |         'total_read_pairs_min': 1,
 12 |         'total_read_pairs_max': 6000000,
 13 |         'mapq_threshold': 10,
 14 |         'num_upstr_bases': 0,
 15 |         'num_downstr_bases': 2,
 16 |         'compress_level': 5,
 17 |         'dna_cov_min_threshold': 3,
 18 |         'rna_cov_min_threshold': 3
 19 |     }
 20 | 
 21 |     float_parameters = {
 22 |         'mc_rate_max_threshold': 0.5,
 23 |         'mc_rate_min_threshold': 0.9
 24 |     }
 25 |     bool_parameters = {'unmapped_fastq': False}
 26 | 
 27 |     str_parameters = {
 28 |         'mode': 'mc',
 29 |         'barcode_version': 'required',
 30 |         'r1_adapter': 'AGATCGGAAGAGCACACGTCTGAAC',
 31 |         'r2_adapter': 'AGATCGGAAGAGCGTCGTGTAGGGA',
 32 |         'bismark_reference': 'required',
 33 |         'hisat3n_dna_reference': 'required',
 34 |         'hisat3n_rna_reference': 'required',
 35 |         'hisat3n_repeat_index_type': 'no-repeat',
 36 |         'reference_fasta': 'required',
 37 |         'star_reference': 'required',
 38 |         'gtf_path': 'required',
 39 |         'feature_type': 'gene',
 40 |         'id_type': 'gene_id',
 41 |         'nome_flag_str': 'required'
 42 |     }
 43 |     if 'hisat3n_dna_reference' in config:
 44 |         del str_parameters['bismark_reference']
 45 |         del str_parameters['star_reference']
 46 |     else:
 47 |         del str_parameters['hisat3n_dna_reference']
 48 |         del str_parameters['hisat3n_rna_reference']
 49 |         del str_parameters['hisat3n_repeat_index_type']
 50 | 
 51 |     typed_config = {}
 52 |     for k, default in int_parameters.items():
 53 |         if k in config:
 54 |             typed_config[k] = int(config[k])
 55 |         else:
 56 |             if default != 'required':
 57 |                 typed_config[k] = default
 58 |             else:
 59 |                 raise ValueError(f'Required parameter {k} not found in config.')
 60 | 
 61 |     for k, default in float_parameters.items():
 62 |         if k in config:
 63 |             typed_config[k] = float(config[k])
 64 |         else:
 65 |             if default != 'required':
 66 |                 typed_config[k] = default
 67 |             else:
 68 |                 raise ValueError(f'Required parameter {k} not found in config.')
 69 | 
 70 |     for k, default in bool_parameters.items():
 71 |         if k in config:
 72 |             v = config[k]
 73 |             if v.lower().startswith('t'):
 74 |                 v = True
 75 |             else:
 76 |                 v = False
 77 |             typed_config[k] = v
 78 |         else:
 79 |             if default != 'required':
 80 |                 typed_config[k] = default
 81 |             else:
 82 |                 raise ValueError(f'Required parameter {k} not found in config. '
 83 |                                  f'You can print the newest mapping config template via "yap default-mapping-config".')
 84 |     # judge unmapped_fastq specifically
 85 |     unmapped_param_str = '--un' if typed_config['unmapped_fastq'] else ''
 86 |     typed_config['unmapped_param_str'] = f"'{unmapped_param_str}'"
 87 | 
 88 |     for k, default in str_parameters.items():
 89 |         if k in config:
 90 |             typed_config[k] = f"'{config[k]}'"
 91 |         else:
 92 |             if default != 'required':
 93 |                 typed_config[k] = f"'{default}'"
 94 |             else:
 95 |                 raise ValueError(f'Required parameter {k} not found in config. '
 96 |                                  f'You can print the newest mapping config template via "yap default-mapping-config".')
 97 | 
 98 |     config_str = ""
 99 |     for k, v in typed_config.items():
100 |         config_str += f"{k} = {v}\n"
101 |     return config_str
102 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/stats/_4m.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | 
 3 | import pandas as pd
 4 | import pysam
 5 | 
 6 | from .mc import mc_mapping_stats
 7 | from .mct import _count_reads_by_rg_in_star_bam, \
 8 |     summary_rna_mapping, \
 9 |     summarize_select_dna_reads, \
10 |     aggregate_feature_counts
11 | from .m3c import m3c_mapping_stats
12 | 
13 | 
14 | def _4m_mapping_stats(output_dir, config):
15 |     """this may apply to single UID dir, so config is provided as parameter"""
16 |     m3c_stats_df = m3c_mapping_stats(output_dir, config)
17 |     select_dna_stats_df = summarize_select_dna_reads(output_dir, config)
18 |     rna_stats_df = summary_rna_mapping(output_dir)
19 |     final_df = pd.concat([m3c_stats_df, select_dna_stats_df, rna_stats_df], axis=1)
20 |     return final_df
21 | 
22 | 
23 | def _4m_additional_cols(final_df, output_dir):
24 |     final_df = final_df.copy()
25 |     final_df['CellInputReadPairs'] = final_df['R1InputReads'].astype(int)
26 |     if 'PCRIndex' in final_df.columns:  # plate info might not exist if the cell name is abnormal
27 |         cell_barcode_ratio = pd.concat([(i['CellInputReadPairs'] / i['CellInputReadPairs'].sum())
28 |                                         for _, i in final_df.groupby('PCRIndex')])
29 |         final_df['CellBarcodeRatio'] = cell_barcode_ratio
30 | 
31 |     # snm3C part
32 |     final_df['FinalmCReads'] = final_df['R1DeduppedReads'] + final_df['R2DeduppedReads']
33 |     # use % to be consistent with others
34 |     final_df['R1MappingRate'] = final_df['R1UniqueMappedReads'] / final_df['R1TrimmedReads'] * 100
35 |     final_df['R2MappingRate'] = final_df['R2UniqueMappedReads'] / final_df['R2TrimmedReads'] * 100
36 |     final_df['R1DuplicationRate'] = (1 - final_df['R1DeduppedReads'] / final_df['R1UniqueMappedReads']) * 100
37 |     final_df['R2DuplicationRate'] = (1 - final_df['R2DeduppedReads'] / final_df['R2UniqueMappedReads']) * 100
38 |     final_df['TotalContacts'] = final_df[
39 |         ['CisShortContact', 'CisLongContact', 'TransContact']].sum(axis=1)
40 |     final_df['CisShortRatio'] = final_df['CisShortContact'] / final_df['TotalContacts']
41 |     final_df['CisLongRatio'] = final_df['CisLongContact'] / final_df['TotalContacts']
42 |     final_df['TransRatio'] = final_df['TransContact'] / final_df['TotalContacts']
43 | 
44 |     # snmCT part
45 |     stats = pd.read_hdf(output_dir / 'TotalRNAData.h5', key='stats')
46 |     final_df['GenesDetected'] = stats['GenesDetected']
47 |     # calculate some mCT specific ratios
48 |     final_df['DNAReadsYield'] = final_df['FinalDNAReads'] / (
49 |             final_df['CellInputReadPairs'] * 2)
50 |     final_df['RNAReadsYield'] = final_df['FinalRNAReads'] / final_df[
51 |         'CellInputReadPairs']
52 |     final_df['RNA/(DNA+RNA)'] = final_df['FinalRNAReads'].fillna(0) / (
53 |             final_df['R1DeduppedReads'].fillna(0) + 1)
54 |     return final_df
55 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/stats/__init__.py:
--------------------------------------------------------------------------------
  1 | import pathlib
  2 | import subprocess
  3 | 
  4 | import pandas as pd
  5 | from papermill import execute_notebook, PapermillExecutionError
  6 | 
  7 | from .m3c import m3c_mapping_stats, m3c_additional_cols
  8 | from .mc import mc_mapping_stats, mc_additional_cols
  9 | from .mct import mct_mapping_stats, mct_additional_cols
 10 | from ._4m import _4m_mapping_stats, _4m_additional_cols
 11 | from .plate_info import get_plate_info
 12 | from ..pipelines import PACKAGE_DIR
 13 | from ...utilities import get_configuration
 14 | 
 15 | 
 16 | def mapping_stats(output_dir):
 17 |     """This is UID level mapping summary, the config file is in parent dir"""
 18 |     output_dir = pathlib.Path(output_dir).absolute()
 19 |     config = get_configuration(output_dir.parent / 'mapping_config.ini')
 20 |     mode = config['mode']
 21 | 
 22 |     if mode == 'mc':
 23 |         final_df = mc_mapping_stats(output_dir, config)
 24 |     elif mode == 'mct':
 25 |         final_df = mct_mapping_stats(output_dir, config)
 26 |     elif mode == 'm3c':
 27 |         final_df = m3c_mapping_stats(output_dir, config)
 28 |     elif mode == '4m':
 29 |         final_df = _4m_mapping_stats(output_dir, config)
 30 |     else:
 31 |         raise ValueError
 32 | 
 33 |     # plate info, which is tech independent.
 34 |     _plate_info = get_plate_info(final_df.index, barcode_version=config['barcode_version'])
 35 |     final_df = pd.concat([_plate_info, final_df], axis=1)
 36 | 
 37 |     # save
 38 |     final_df.to_csv(output_dir / 'MappingSummary.csv.gz')
 39 |     return
 40 | 
 41 | 
 42 | def final_summary(output_dir, cleanup=True, notebook=None):
 43 |     output_dir = pathlib.Path(output_dir).absolute()
 44 |     mode = get_configuration(output_dir / 'mapping_config.ini')['mode']
 45 |     path_to_remove = []
 46 | 
 47 |     # Before running summary,
 48 |     # first make sure all the UID dir having Snakefile also has mapping summary (means successful)
 49 |     snakefile_list = list(output_dir.glob('*/Snakefile'))
 50 |     summary_paths = []
 51 |     missing_summary_dirs = []
 52 |     for path in snakefile_list:
 53 |         uid_dir = path.parent
 54 |         summary_path = uid_dir / 'MappingSummary.csv.gz'
 55 |         if summary_path.exists():
 56 |             summary_paths.append(summary_path)
 57 |         else:
 58 |             missing_summary_dirs.append(uid_dir)
 59 | 
 60 |     if len(missing_summary_dirs) != 0:
 61 |         print('These sub dir missing MappingSummary files:')
 62 |         for p in missing_summary_dirs:
 63 |             print(p)
 64 |         raise FileNotFoundError(f'Note that all sub dir should be successfully mapped '
 65 |                                 f'before generating final summary. \n'
 66 |                                 f'The MappingSummary.csv.gz is the final target file of snakefile in {path}. \n'
 67 |                                 f'Run the corresponding snakemake command again to retry mapping.\n'
 68 |                                 f'The snakemake commands can be found in output_dir/snakemake/*/snakemake_cmd.txt')
 69 | 
 70 |     # aggregate mapping summaries
 71 |     total_mapping_summary = pd.concat([pd.read_csv(path, index_col=0)
 72 |                                        for path in summary_paths])
 73 |     total_mapping_summary_path = output_dir / 'stats/MappingSummary.csv.gz'
 74 | 
 75 |     # if this is mct, aggregate all the gene counts
 76 |     if mode in ['mct', '4m']:
 77 |         from ..stats.mct import aggregate_feature_counts
 78 |         aggregate_feature_counts(output_dir)
 79 | 
 80 |     # add additional columns based on some calculation
 81 |     if mode == 'mc':
 82 |         total_mapping_summary = mc_additional_cols(total_mapping_summary)
 83 |     elif mode == 'mct':
 84 |         total_mapping_summary = mct_additional_cols(total_mapping_summary, output_dir=output_dir)
 85 |     elif mode == 'm3c':
 86 |         total_mapping_summary = m3c_additional_cols(total_mapping_summary)
 87 |     elif mode == '4m':
 88 |         total_mapping_summary = _4m_additional_cols(total_mapping_summary, output_dir=output_dir)
 89 |     else:
 90 |         raise
 91 | 
 92 |     # save total mapping summary
 93 |     total_mapping_summary.to_csv(total_mapping_summary_path)
 94 | 
 95 |     # add .snakemake files to deletion
 96 |     snakemake_hiding_dirs = list(output_dir.glob('*/.snakemake'))
 97 |     path_to_remove += snakemake_hiding_dirs
 98 | 
 99 |     # add temp dir in the bam dirs to deletion
100 |     mapping_temp_dirs = list(output_dir.glob('*/bam/temp'))
101 |     path_to_remove += mapping_temp_dirs
102 | 
103 |     # write a ALLC path file for generating MCDS
104 |     allc_paths = pd.Series({path.name.split('.')[0]: str(path)
105 |                             for path in output_dir.glob('*/allc/*tsv.gz')})
106 |     allc_paths.to_csv(output_dir / 'stats/AllcPaths.tsv', sep='\t', header=False)
107 | 
108 |     if 'Plate' in total_mapping_summary.columns:  # only run notebook when plate info exist
109 |         # run summary notebook
110 |         nb_path = output_dir / 'stats/MappingSummary.ipynb'
111 |         try:
112 |             mode = get_configuration(output_dir / 'mapping_config.ini')['mode']
113 |             if notebook is None:
114 |                 template_notebook = PACKAGE_DIR / f'files/mapping_summary_template/{mode}_template.ipynb'
115 |             else:
116 |                 template_notebook = str(notebook)
117 |             print(f'Using notebook template from {template_notebook}')
118 |             print('Executing summary plotting notebook...')
119 |             execute_notebook(
120 |                 input_path=str(template_notebook),
121 |                 output_path=str(nb_path),
122 |                 parameters=dict(output_dir=str(output_dir))
123 |             )
124 |             print('Summary notebook successfully executed. Exporting HTML...')
125 |             subprocess.run(['jupyter', 'nbconvert', '--to', 'html', str(nb_path)])
126 |             print(f'See the summary plots here: {str(nb_path)[:-5]}html')
127 |             print(f'Or customize the summary plots here: {nb_path}')
128 |         except PapermillExecutionError:
129 |             print(f'Ops, summary plotting notebook got some error, check the information in {nb_path}')
130 |             cleanup = False
131 | 
132 |     # delete
133 |     if cleanup:
134 |         print('Clean up snakemake log (might take several minutes) ...')
135 |         for path in path_to_remove:
136 |             subprocess.run(['rm', '-rf', str(path)], check=True)
137 |     return
138 | 
139 | 
140 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/stats/m3c.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | 
 3 | import pandas as pd
 4 | from pysam import AlignmentFile
 5 | 
 6 | from .utilities import parse_trim_fastq_stats, parse_trim_fastq_stats_mct, generate_allc_stats
 7 | 
 8 | 
 9 | def m3c_bam_unique_read_counts(bam_path, read_type_int):
10 |     unique_reads = set()
11 |     with AlignmentFile(bam_path) as bam:
12 |         for read in bam:
13 |             unique_reads.add(read.query_name.split(f'_{read_type_int}:N:0:')[0])
14 |     return len(unique_reads)
15 | 
16 | 
17 | def m3c_count_bams(bam_dir, cell_id, read_type):
18 |     bam_path_dict = {
19 |         f'{read_type}UniqueMappedReads': bam_dir / f'{cell_id}-{read_type}.two_mapping.filter.bam',
20 |         f'{read_type}DeduppedReads': bam_dir / f'{cell_id}-{read_type}.two_mapping.deduped.bam',
21 |     }
22 |     read_counts = {name: m3c_bam_unique_read_counts(path, 1 if read_type == 'R1' else 2)
23 |                    for name, path in bam_path_dict.items()}
24 |     return pd.Series(read_counts, name=cell_id)
25 | 
26 | 
27 | def m3c_mapping_stats(output_dir, config):
28 |     """this may apply to single UID dir, so config is provided as parameter"""
29 |     output_dir = pathlib.Path(output_dir).absolute()
30 |     fastq_dir = output_dir / 'fastq'
31 |     bam_dir = output_dir / 'bam'
32 |     hic_dir = output_dir / 'hic'
33 |     cell_stats = []
34 |     cell_ids = [path.name.split('.')[0]
35 |                 for path in bam_dir.glob('*.3C.sorted.bam')]
36 | 
37 |     for cell_id in cell_ids:
38 |         total_stats = []  # list of series
39 |         for read_type in ['R1', 'R2']:
40 |             # fastq reads
41 |             if config['mode'] in ['4m', 'mct']:
42 |                 total_stats.append(
43 |                     parse_trim_fastq_stats_mct(
44 |                         fastq_dir / f'{cell_id}-{read_type}.trimmed.stats.txt'))
45 |             else:
46 |                 total_stats.append(
47 |                     parse_trim_fastq_stats(
48 |                         fastq_dir / f'{cell_id}-{read_type}.trimmed.stats.tsv'))
49 |             # bam reads
50 |             total_stats.append(
51 |                 m3c_count_bams(bam_dir, cell_id, read_type)
52 |             )
53 |         # contacts
54 |         contact_counts = pd.read_csv(hic_dir / f'{cell_id}.3C.contact.tsv.counts.txt',
55 |                                      header=None, index_col=0, squeeze=True)
56 |         contact_counts.name = cell_id
57 |         total_stats.append(contact_counts)
58 | 
59 |         cell_stats.append(pd.concat(total_stats))
60 |     total_df = pd.DataFrame(cell_stats)
61 | 
62 |     # add allc stats
63 |     allc_df = generate_allc_stats(output_dir, config)
64 |     final_df = pd.concat([total_df, allc_df], sort=True, axis=1)
65 |     return final_df
66 | 
67 | 
68 | def m3c_additional_cols(final_df):
69 |     final_df['FinalmCReads'] = final_df['R1DeduppedReads'] + final_df['R2DeduppedReads']
70 |     final_df['CellInputReadPairs'] = final_df['R1InputReads']
71 |     # use % to be consistent with others
72 |     final_df['R1MappingRate'] = final_df['R1UniqueMappedReads'] / final_df['R1TrimmedReads'] * 100
73 |     final_df['R2MappingRate'] = final_df['R2UniqueMappedReads'] / final_df['R2TrimmedReads'] * 100
74 |     final_df['R1DuplicationRate'] = (1 - final_df['R1DeduppedReads'] / final_df['R1UniqueMappedReads']) * 100
75 |     final_df['R2DuplicationRate'] = (1 - final_df['R2DeduppedReads'] / final_df['R2UniqueMappedReads']) * 100
76 | 
77 |     if 'PCRIndex' in final_df.columns:  # plate info might not exist if the cell name is abnormal
78 |         cell_barcode_ratio = pd.concat([(i['CellInputReadPairs'] / i['CellInputReadPairs'].sum())
79 |                                         for _, i in final_df.groupby('PCRIndex')])
80 |         final_df['CellBarcodeRatio'] = cell_barcode_ratio
81 | 
82 |     final_df['TotalContacts'] = final_df[
83 |         ['CisShortContact', 'CisLongContact', 'TransContact']].sum(axis=1)
84 |     final_df['CisShortRatio'] = final_df['CisShortContact'] / final_df['TotalContacts']
85 |     final_df['CisLongRatio'] = final_df['CisLongContact'] / final_df['TotalContacts']
86 |     final_df['TransRatio'] = final_df['TransContact'] / final_df['TotalContacts']
87 |     return final_df
88 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/stats/mc.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from .utilities import parse_trim_fastq_stats, parse_trim_fastq_stats_mct, \
 6 |     parse_bismark_report, parse_deduplicate_stat, \
 7 |     generate_allc_stats
 8 | 
 9 | 
10 | def mc_mapping_stats(output_dir, config):
11 |     """this may apply to single UID dir, so config is provided as parameter"""
12 |     output_dir = pathlib.Path(output_dir).absolute()
13 |     fastq_dir = output_dir / 'fastq'
14 |     bam_dir = output_dir / 'bam'
15 |     allc_dir = output_dir / 'allc'
16 |     cell_stats = []
17 |     cell_ids = [path.name.split('.')[0]
18 |                 for path in allc_dir.glob(f'*.allc.tsv.gz')]
19 | 
20 |     for cell_id in cell_ids:
21 |         print(f'Parsing stats of {cell_id}.')
22 |         total_stats = []
23 |         for read_type in ['R1', 'R2']:
24 |             if config['mode'] in ['4m', 'mct']:
25 |                 total_stats.append(
26 |                     parse_trim_fastq_stats_mct(
27 |                         fastq_dir / f'{cell_id}-{read_type}.trimmed.stats.txt'))
28 |             else:
29 |                 total_stats.append(
30 |                     parse_trim_fastq_stats(
31 |                         fastq_dir / f'{cell_id}-{read_type}.trimmed.stats.tsv'))
32 |             total_stats.append(
33 |                 parse_bismark_report(
34 |                     bam_dir / f'{cell_id}-{read_type}.trimmed_bismark_bt2_SE_report.txt'))
35 |             total_stats.append(
36 |                 parse_deduplicate_stat(
37 |                     bam_dir / f'{cell_id}-{read_type}.trimmed_bismark_bt2.deduped.matrix.txt'
38 |                 ))
39 |         cell_stats.append(pd.concat(total_stats))
40 |     mapping_df = pd.DataFrame(cell_stats)
41 |     mapping_df.index.name = 'cell_id'
42 | 
43 |     # add allc stats
44 |     allc_df = generate_allc_stats(output_dir, config)
45 |     final_df = pd.concat([mapping_df, allc_df], sort=True, axis=1)
46 |     return final_df
47 | 
48 | 
49 | def mc_additional_cols(final_df):
50 |     """Additional columns for mC mapping summary"""
51 |     final_df = final_df.copy()
52 |     final_df['CellInputReadPairs'] = final_df['R1InputReads'].astype(int)  # == final_df['R2InputReads']
53 |     if 'PCRIndex' in final_df.columns:  # plate info might not exist if the cell name is abnormal
54 |         cell_barcode_ratio = pd.concat([(i['CellInputReadPairs'] / i['CellInputReadPairs'].sum())
55 |                                         for _, i in final_df.groupby('PCRIndex')])
56 |         final_df['CellBarcodeRatio'] = cell_barcode_ratio
57 | 
58 |     final_df['FinalmCReads'] = final_df['R1FinalBismarkReads'] + final_df['R2FinalBismarkReads']
59 |     return final_df
60 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/stats/mct.py:
--------------------------------------------------------------------------------
  1 | import pathlib
  2 | 
  3 | import pandas as pd
  4 | import pysam
  5 | 
  6 | from .mc import mc_mapping_stats
  7 | 
  8 | 
  9 | def _count_reads_by_rg_in_star_bam(bam_path):
 10 |     try:
 11 |         bam = pysam.AlignmentFile(bam_path)
 12 |     except ValueError:
 13 |         # empty bam file
 14 |         return
 15 | 
 16 |     cell_read_counts = {cell['ID']: 0 for cell in bam.header['RG']}
 17 | 
 18 |     for read in bam:
 19 |         cell = read.get_tag('RG')
 20 |         cell_read_counts[cell] += 1
 21 |     read_count = pd.Series(cell_read_counts, name='Reads')
 22 |     read_count.index.name = 'cell_id'
 23 |     return read_count
 24 | 
 25 | 
 26 | def summary_rna_mapping(output_dir):
 27 |     output_dir = pathlib.Path(output_dir)
 28 | 
 29 |     # summarize read counts for each cell before filter by mC rate
 30 |     total_star_mapped_reads = _count_reads_by_rg_in_star_bam(output_dir / 'rna_bam/TotalRNAAligned.filtered.bam')
 31 | 
 32 |     # feature count summary
 33 |     total_counts = pd.read_csv(output_dir / 'rna_bam/TotalRNAAligned.rna_reads.feature_count.tsv.summary',
 34 |                                sep='\t', index_col=0).T
 35 |     total_counts.index = total_counts.index.map(lambda i: i.split(':')[-1])
 36 |     feature_count_summary = total_counts[['Assigned']].copy()
 37 |     feature_count_summary['FinalRNAReads'] = total_counts.sum(axis=1)
 38 |     feature_count_summary.columns = ['FinalCountedReads', 'FinalRNAReads']
 39 | 
 40 |     total_rna_stat = feature_count_summary.copy()
 41 |     total_rna_stat['RNAUniqueMappedReads'] = total_star_mapped_reads
 42 |     total_rna_stat['SelectedRNAReadsRatio'] = total_rna_stat['FinalRNAReads'] / total_rna_stat['RNAUniqueMappedReads']
 43 |     total_rna_stat.index.name = 'cell_id'
 44 |     return total_rna_stat
 45 | 
 46 | 
 47 | def summarize_select_dna_reads(output_dir,
 48 |                                config):
 49 |     bam_dir = pathlib.Path(output_dir) / 'bam'
 50 |     mc_rate_max_threshold = float(config['mc_rate_max_threshold']) * 100
 51 |     cov_min_threshold = float(config['dna_cov_min_threshold'])
 52 | 
 53 |     records = []
 54 |     select_dna_reads_stat_list = list(bam_dir.glob('*.reads_profile.csv'))
 55 |     for path in select_dna_reads_stat_list:
 56 |         try:
 57 |             _df = pd.read_csv(path)
 58 |         except pd.errors.EmptyDataError:
 59 |             # means the bam file is empty
 60 |             continue
 61 | 
 62 |         cell_id = path.name.split('.')[0]
 63 |         if cell_id.endswith('-R1') or cell_id.endswith('-R2'):
 64 |             # select DNA preformed in R1 R2 separately:
 65 |             cell_id = cell_id[:-3]
 66 |         _df['cell_id'] = cell_id
 67 |         _df['mc_rate_max_threshold'] = mc_rate_max_threshold
 68 |         _df['cov_min_threshold'] = cov_min_threshold
 69 |         records.append(_df)
 70 |     total_stats_df = pd.concat(records)
 71 | 
 72 |     selected_reads = total_stats_df[
 73 |         (total_stats_df['cov'] >= cov_min_threshold)
 74 |         & (total_stats_df['mc_frac'] < mc_rate_max_threshold)]
 75 | 
 76 |     selected_reads = selected_reads.groupby('cell_id')['count'].sum()
 77 |     selected_ratio = selected_reads / total_stats_df.groupby('cell_id')['count'].sum()
 78 |     final_stat = pd.DataFrame({'FinalDNAReads': selected_reads, 'SelectedDNAReadsRatio': selected_ratio})
 79 |     final_stat.index.name = 'cell_id'
 80 |     return final_stat
 81 | 
 82 | 
 83 | def mct_mapping_stats(output_dir, config):
 84 |     """this may apply to single UID dir, so config is provided as parameter"""
 85 |     mc_stats_df = mc_mapping_stats(output_dir, config)
 86 |     select_dna_stats_df = summarize_select_dna_reads(output_dir, config)
 87 |     rna_stats_df = summary_rna_mapping(output_dir)
 88 |     final_df = pd.concat([mc_stats_df, select_dna_stats_df, rna_stats_df], axis=1)
 89 |     return final_df
 90 | 
 91 | 
 92 | def aggregate_feature_counts(output_dir):
 93 |     output_dir = pathlib.Path(output_dir)
 94 |     cell_data = []
 95 | 
 96 |     count_paths = list(output_dir.glob('*/rna_bam/TotalRNAAligned.rna_reads.feature_count.tsv'))
 97 |     if len(count_paths) == 0:
 98 |         return
 99 | 
100 |     data = None
101 |     for path in count_paths:
102 |         data = pd.read_csv(path, sep='\t', index_col=0, comment='#')
103 |         cell_data.append(data.iloc[:, 5:])
104 |     cell_data = pd.concat(cell_data, axis=1, sort=True)
105 |     cell_data.columns = cell_data.columns.str.split(':').str[1]
106 | 
107 |     # all count table should have the same info, so read the last one
108 |     # chr, start, end, strand, length
109 |     gene_info = data.iloc[:, :5]
110 |     with pd.HDFStore(output_dir / 'TotalRNAData.h5', mode='w', complevel=5) as hdf:
111 |         hdf['data'] = cell_data.T  # cell by gene
112 |         hdf['gene'] = gene_info
113 |         hdf['stats'] = pd.DataFrame({'GenesDetected': (cell_data > 0).sum()})
114 |     return
115 | 
116 | 
117 | def mct_additional_cols(final_df, output_dir):
118 |     final_df = final_df.copy()
119 |     final_df['CellInputReadPairs'] = final_df['R1InputReads'].astype(int)  # == final_df['R2InputReads']
120 |     if 'PCRIndex' in final_df.columns:  # plate info might not exist if the cell name is abnormal
121 |         cell_barcode_ratio = pd.concat([(i['CellInputReadPairs'] / i['CellInputReadPairs'].sum())
122 |                                         for _, i in final_df.groupby('PCRIndex')])
123 |         final_df['CellBarcodeRatio'] = cell_barcode_ratio
124 | 
125 |     stats = pd.read_hdf(output_dir / 'TotalRNAData.h5', key='stats')
126 |     final_df['GenesDetected'] = stats['GenesDetected']
127 | 
128 |     # calculate some mCT specific ratios
129 |     final_df['DNAReadsYield'] = final_df['FinalDNAReads'] / (
130 |             final_df['CellInputReadPairs'] * 2)
131 |     final_df['RNAReadsYield'] = final_df['FinalRNAReads'] / final_df[
132 |         'CellInputReadPairs']
133 |     final_df['RNA/(DNA+RNA)'] = final_df['FinalRNAReads'].fillna(0) / (
134 |             final_df['R1FinalBismarkReads'].fillna(0) + 1)
135 |     return final_df
136 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/stats/plate_info.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | 
 4 | def _parse_cell_id_v1(cell_id):
 5 |     plate1, plate2, pcr_index, random_index = cell_id.split('-')
 6 |     if random_index.upper() in {'AD001', 'AD002', 'AD004', 'AD006'}:
 7 |         plate = plate1
 8 |     else:
 9 |         plate = plate2
10 |     # 96 pos
11 |     col96 = int(pcr_index[1:]) - 1
12 |     row96 = ord(pcr_index[0]) - 65  # convert A-H to 0-8
13 |     # 384 pos
14 |     ad_index_384_dict = {
15 |         'AD001': (0, 0),
16 |         'AD002': (0, 1),
17 |         'AD004': (1, 0),
18 |         'AD006': (1, 1),
19 |         'AD007': (0, 0),
20 |         'AD008': (0, 1),
21 |         'AD010': (1, 0),
22 |         'AD012': (1, 1)
23 |     }
24 |     col384 = 2 * col96 + ad_index_384_dict[random_index][0]
25 |     row384 = 2 * row96 + ad_index_384_dict[random_index][1]
26 |     record = pd.Series({
27 |         'Plate': plate,
28 |         'PCRIndex': pcr_index,
29 |         'RandomIndex': random_index,
30 |         'Col384': col384,
31 |         'Row384': row384
32 |     })
33 |     return record
34 | 
35 | 
36 | def _parse_cell_id_v2(cell_id):
37 |     plate, multiplex_group, pcr_index, random_index = cell_id.split('-')
38 |     # 384 pos
39 |     col384 = int(random_index[1:]) - 1
40 |     row384 = ord(random_index[0]) - 65  # convert A-P to 0-23
41 |     record = pd.Series({
42 |         'Plate': plate,
43 |         'PCRIndex': pcr_index,
44 |         'MultiplexGroup': multiplex_group,
45 |         'RandomIndex': random_index,
46 |         'Col384': col384,
47 |         'Row384': row384
48 |     })
49 |     return record
50 | 
51 | 
52 | def get_plate_info(cell_ids, barcode_version):
53 |     if barcode_version == 'V1':
54 |         func = _parse_cell_id_v1
55 |     else:
56 |         func = _parse_cell_id_v2
57 |     try:
58 |         plate_info = pd.DataFrame([func(cell_id) for cell_id in cell_ids],
59 |                                   index=cell_ids)
60 |     except Exception:
61 |         print('Errors occur during parsing the plate info, this happens '
62 |               'when the input FASTQ file name is not generated by yap. '
63 |               'The `yap summary` also can not generate html report due to missing the plate info. '
64 |               'In this case, you need to add the plateinfo by yourself in order to make the plate view plots. '
65 |               'These information is not necessary for following analysis though.')
66 |         plate_info = pd.DataFrame([], index=cell_ids)
67 |     return plate_info
68 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/stats/plot.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | 
  3 | import matplotlib as mpl
  4 | import matplotlib.pyplot as plt
  5 | import numpy as np
  6 | import pandas as pd
  7 | import seaborn as sns
  8 | from matplotlib.colors import Normalize
  9 | 
 10 | 
 11 | def plot_on_plate(data,
 12 |                   hue,
 13 |                   groupby,
 14 |                   ncols=4,
 15 |                   plate_base=384,
 16 |                   figsize_scale=1,
 17 |                   row='Row384',
 18 |                   col='Col384',
 19 |                   vmin=0,
 20 |                   vmax=1,
 21 |                   aggregation_func=None):
 22 |     """
 23 |     Plot metadata into 384 or 96 plate view (heatmap)
 24 |     Parameters
 25 |     ----------
 26 |     data
 27 |         dataframe contain plate postion and metric used for color
 28 |     hue
 29 |         int/float column name used as hue
 30 |     groupby
 31 |         groupby column, typically groupby plate id column(s) to plot each plate separately
 32 |     ncols
 33 |         number of column for axes, nrows will be calculated accordingly
 34 |     plate_base
 35 |         {384, 96} size of the plate view
 36 |     figsize_scale
 37 |         scale of figure size
 38 |     row
 39 |         column name for rows
 40 |     col
 41 |         column name for columns
 42 |     vmin
 43 |         cmap vmin
 44 |     vmax
 45 |         cmap vmax
 46 |     aggregation_func
 47 |         apply to reduce rows after groupby if the row is not unique
 48 |     """
 49 | 
 50 |     if plate_base == 384:
 51 |         plate_nrows, plate_ncols = 16, 24
 52 | 
 53 |     elif plate_base == 96:
 54 |         plate_nrows, plate_ncols = 8, 12
 55 |     else:
 56 |         raise ValueError(f'Plate base {plate_base} unknown')
 57 | 
 58 |     plot_data_list = []
 59 |     plate_names = []
 60 |     for plate, sub_df in data.groupby(groupby):
 61 |         # check if plate base are duplicated
 62 |         duplicated = sub_df[[row, col]].duplicated().sum() != 0
 63 |         if duplicated:
 64 |             if aggregation_func is None:
 65 |                 raise ValueError(
 66 |                     'Row after groupby is not unique, aggregation_func can not be None'
 67 |                 )
 68 |             plot_data = sub_df.groupby([row,
 69 |                                         col])[[hue]].apply(aggregation_func)
 70 |         else:
 71 |             plot_data = sub_df.set_index([row, col])[[hue]]
 72 |         # reindex, missing value will keep as NA
 73 |         full_index = pd.MultiIndex.from_tuples([(i, j)
 74 |                                                 for i in range(plate_nrows)
 75 |                                                 for j in range(plate_ncols)],
 76 |                                                names=[row, col])
 77 |         plot_data = plot_data.reindex(full_index).reset_index()
 78 |         plot_data_list.append(plot_data)
 79 |         if isinstance(plate, str):
 80 |             plate_names.append(plate)
 81 |         else:
 82 |             plate_names.append('\n'.join(plate))
 83 | 
 84 |     ncols = min(len(plot_data_list), ncols)
 85 |     nrows = int(np.ceil(len(plot_data_list) / ncols))
 86 |     cbar_frac = 0.06
 87 | 
 88 |     fig = plt.figure(figsize=((6.2 * ncols) * (1 + cbar_frac) * figsize_scale,
 89 |                               4 * nrows * figsize_scale))
 90 |     gs = fig.add_gridspec(nrows, ncols, wspace=0.1)
 91 |     cmap = copy.copy(mpl.cm.get_cmap("viridis"))
 92 |     cmap.set_under(color='#440154')
 93 |     cmap.set_over(color='#FDE725')
 94 |     cmap.set_bad(color='#FFFFFF')
 95 |     cnorm = Normalize(vmin, vmax)
 96 | 
 97 |     for i, (name, data) in enumerate(zip(plate_names, plot_data_list)):
 98 |         ax_row = i // ncols
 99 |         ax_col = i % ncols
100 | 
101 |         ax = fig.add_subplot(gs[ax_row, ax_col])
102 |         ax.scatter(
103 |             x=data[col],
104 |             y=data[row],
105 |             # have to do this, otherwise NaN is skipped.
106 |             c=[cmap(cnorm(v)) for v in data[hue]],
107 |             s=100,
108 |             linewidth=1,
109 |             edgecolor='lightgray')
110 |         ax.set(title=name,
111 |                ylabel='',
112 |                ylim=(plate_nrows, -1),
113 |                yticks=list(range(16)),
114 |                yticklabels=[chr(i + 65) for i in range(0, 16)],
115 |                xlabel='',
116 |                xticks=range(24),
117 |                xticklabels=range(1, 25))
118 |         ax.xaxis.set_tick_params(labelsize=8)
119 |         ax.yaxis.set_tick_params(labelsize=8)
120 |         ax.xaxis.tick_top()
121 |     fig.colorbar(mpl.cm.ScalarMappable(norm=cnorm, cmap=cmap),
122 |                  ax=fig.axes,
123 |                  shrink=0.6,
124 |                  fraction=cbar_frac,
125 |                  label=hue)
126 |     return fig, plate_names, plot_data_list
127 | 
128 | 
129 | def cutoff_vs_cell_remain(data,
130 |                           xlim_quantile=(0.01, 0.99),
131 |                           distribution_ylim=None,
132 |                           bins=100,
133 |                           kde=False):
134 |     xlim = tuple(np.quantile(data, xlim_quantile))
135 |     x = np.linspace(xlim[0], xlim[1], 500)
136 |     count_list = np.array([(data > i).sum() for i in x])
137 |     original_total_data = data.size
138 |     count_list = count_list / original_total_data * 100
139 |     data = data[(data < xlim[1]) & (data > xlim[0])]
140 | 
141 |     fig, ax1 = plt.subplots(figsize=(6, 3))
142 |     ax1 = sns.distplot(data, bins=bins, kde=kde, ax=ax1)
143 |     ax1.set_xlim(xlim)
144 |     ax1.set_xlabel(data.name)
145 |     if distribution_ylim is not None:
146 |         ax1.set_ylim(*distribution_ylim)
147 | 
148 |     ax2 = ax1.twinx()
149 |     ax2.plot(x, count_list, linewidth=1, c='grey')
150 |     ax2.set_ylabel('% of Cell Remained')
151 |     ax2.set(ylim=(0, 100), yticks=range(0, 101, 10))
152 |     ax2.grid(c='lightgray', linestyle='--', linewidth=0.5)
153 |     return fig, xlim
154 | 


--------------------------------------------------------------------------------
/cemba_data/mapping/test_environment.py:
--------------------------------------------------------------------------------
 1 | import shlex
 2 | import subprocess
 3 | 
 4 | 
 5 | def testing_cmd(command, expected_return_code=0):
 6 |     try:
 7 |         p = subprocess.run(shlex.split(command),
 8 |                            stderr=subprocess.PIPE,
 9 |                            stdout=subprocess.PIPE,
10 |                            encoding='utf8',
11 |                            check=True)
12 |     except subprocess.CalledProcessError as e:
13 |         if e.returncode == expected_return_code:
14 |             return
15 |         print(e.stderr)
16 |         raise e
17 |     return
18 | 
19 | 
20 | COMMAND_TO_TEST = [
21 |     'cutadapt --version',
22 |     'bismark -version',
23 |     'bowtie2 --version',
24 |     'samtools --version',
25 |     'tabix --version',
26 |     'bgzip --version',
27 |     'bedtools --version'
28 | ]
29 | 
30 | 
31 | def testing_mapping_installation(mct=False):
32 |     for command in COMMAND_TO_TEST:
33 |         testing_cmd(command)
34 | 
35 |     # picard always return 1...
36 |     testing_cmd('picard MarkDuplicates --version', 1)
37 | 
38 |     if mct:
39 |         testing_cmd('STAR --version')
40 | 
41 |     # test ALLCools
42 |     try:
43 |         testing_cmd('allcools -h')
44 |     except subprocess.CalledProcessError:
45 |         print('"allcools -h" return error, see if allcools is installed. \n'
46 |               'https://github.com/lhqing/ALLCools')
47 | 


--------------------------------------------------------------------------------
/cemba_data/snm3C/__init__.py:
--------------------------------------------------------------------------------
1 | from .prepare_impute import prepare_impute_dir
2 | from .prepare_dataset import prepare_dataset_commands
3 | 


--------------------------------------------------------------------------------
/cemba_data/snm3C/prepare_dataset.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | import pandas as pd
 3 | from .prepare_impute import execute_command
 4 | 
 5 | 
 6 | def prepare_dataset_commands(output_dir, fasta_path, cpu=10):
 7 |     output_dir = pathlib.Path(output_dir)
 8 |     project_name = output_dir.name
 9 |     scool_dir = output_dir / 'scool'
10 |     snakemake_dir = scool_dir / 'snakemake'
11 |     snakemake_dir.mkdir(exist_ok=True, parents=True)
12 |     raw_dir = scool_dir / 'raw'
13 |     raw_dir.mkdir(exist_ok=True)
14 |     impute_dir = scool_dir / 'impute'
15 |     impute_dir.mkdir(exist_ok=True)
16 |     dataset_dir = scool_dir / 'dataset'
17 |     dataset_dir.mkdir(exist_ok=True)
18 | 
19 |     # Calculate compartment at 100Kb resolution
20 |     compartment_input_dir = impute_dir / '100K'
21 |     compartment_cell_table = pd.Series({
22 |         path.name.split('.')[0]: str(path)
23 |         for path in compartment_input_dir.glob('*/*.cool')
24 |     })
25 |     compartment_cell_table_path = compartment_input_dir / 'cell_table.tsv'
26 |     compartment_cell_table.to_csv(compartment_cell_table_path, sep='\t', header=None)
27 |     # prepare a whole genome CpG ratio profile
28 |     cpg_path = compartment_input_dir / 'cpg_ratio.hdf'
29 |     cpg_ratio_cmd = f'hicluster cpg-ratio --cell_url {compartment_cell_table.iloc[0]} ' \
30 |                     f'--fasta_path {fasta_path} --hdf_output_path {cpg_path}'
31 |     execute_command(cpg_ratio_cmd)
32 |     # compartment command
33 |     compartment_cmd = f'hicluster compartment ' \
34 |                       f'--cell_table_path {compartment_cell_table_path} ' \
35 |                       f'--output_prefix {dataset_dir / project_name} ' \
36 |                       f'--cpg_profile_path {cpg_path} ' \
37 |                       f'--cpu {cpu}'
38 | 
39 |     # Calculate domain at 25Kb resolution
40 |     domain_input_dir = impute_dir / '25K'
41 |     domain_cell_table = pd.Series({
42 |         path.name.split('.')[0]: str(path)
43 |         for path in domain_input_dir.glob('*/*.cool')
44 |     })
45 |     domain_cell_table_path = domain_input_dir / 'cell_table.tsv'
46 |     domain_cell_table.to_csv(domain_cell_table_path, sep='\t', header=None)
47 |     domain_cmd = f'hicluster domain ' \
48 |                  f'--cell_table_path {domain_cell_table_path} ' \
49 |                  f'--output_prefix {dataset_dir / project_name} ' \
50 |                  f'--resolution 25000 ' \
51 |                  f'--window_size 10 ' \
52 |                  f'--cpu {cpu}'
53 | 
54 |     # Calculate cell embedding/decomposition at 100Kb resolution
55 |     embedding_dir = dataset_dir / 'embedding'
56 |     embedding_dir.mkdir(exist_ok=True)
57 |     embedding_cmd = f'hicluster embedding ' \
58 |                     f'--cell_table_path {compartment_cell_table_path} ' \
59 |                     f'--output_dir {embedding_dir} ' \
60 |                     f'--dim 50 ' \
61 |                     f'--dist 1000000 ' \
62 |                     f'--resolution 100000 ' \
63 |                     f'--scale_factor 100000 ' \
64 |                     f'--norm_sig --save_raw ' \
65 |                     f'--cpu {cpu}'
66 | 
67 |     # prepare qsub
68 |     qsub_dir = snakemake_dir / 'qsub'
69 |     qsub_dir.mkdir(exist_ok=True)
70 |     with open(qsub_dir / 'dataset_cmd.txt', 'w') as f:
71 |         f.write('\n'.join([compartment_cmd, domain_cmd, embedding_cmd]))
72 |     qsub_str = f"""
73 | #!/bin/bash
74 | #$ -N y{project_name}
75 | #$ -V
76 | #$ -l h_rt=999:99:99
77 | #$ -l s_rt=999:99:99
78 | #$ -wd {qsub_dir}
79 | #$ -e {qsub_dir}/qsub_dataset.error.log
80 | #$ -o {qsub_dir}/qsub_dataset.output.log
81 | #$ -pe smp 1
82 | #$ -l h_vmem=3G
83 | 
84 | yap qsub --command_file_path {qsub_dir}/dataset_cmd.txt \
85 | --working_dir {qsub_dir} --project_name y{project_name}_dataset \
86 | --total_cpu {int(cpu*3)} --qsub_global_parms "-pe smp={cpu};-l h_vmem=5G"
87 | """
88 |     with open(qsub_dir / 'qsub_dataset.sh', 'w') as f:
89 |         f.write(qsub_str)
90 |     return
91 | 


--------------------------------------------------------------------------------
/cemba_data/utilities.py:
--------------------------------------------------------------------------------
  1 | import configparser
  2 | import functools
  3 | import itertools
  4 | import logging
  5 | import pathlib
  6 | import subprocess
  7 | from concurrent.futures import ProcessPoolExecutor, as_completed
  8 | 
  9 | # logger
 10 | log = logging.getLogger(__name__)
 11 | log.addHandler(logging.NullHandler())
 12 | 
 13 | 
 14 | def get_configuration(config_path):
 15 |     """
 16 |     Read .ini config file from given path
 17 |     """
 18 |     if isinstance(config_path, configparser.ConfigParser):
 19 |         return config_path
 20 |     ref_path_config = configparser.ConfigParser()
 21 |     ref_path_config.read(config_path)
 22 | 
 23 |     total_config = {}
 24 |     for name, section in ref_path_config.items():
 25 |         for k, v in section.items():
 26 |             total_config[k] = v
 27 |     return total_config
 28 | 
 29 | 
 30 | def test_cmd(tool_name, cmd_list):
 31 |     try:
 32 |         subprocess.run(cmd_list,
 33 |                        stdout=subprocess.PIPE,
 34 |                        stderr=subprocess.PIPE,
 35 |                        encoding='utf8',
 36 |                        check=True)
 37 |     except subprocess.CalledProcessError as e:
 38 |         log.error(f'Test {tool_name} got non-zero return code {e.returncode}')
 39 |         log.error(e.stderr)
 40 |         raise
 41 |     return
 42 | 
 43 | 
 44 | def valid_environments(config):
 45 |     log.info('Test mapping environments')
 46 | 
 47 |     # test cutadapt
 48 |     test_cmd(tool_name='cutadapt', cmd_list=['cutadapt', '--version'])
 49 |     # test samtools
 50 |     test_cmd(tool_name='samtools', cmd_list=['samtools', '--version'])
 51 |     # test picard, picard always have return code 1...
 52 |     test_cmd(tool_name='picard', cmd_list=['which', 'picard'])
 53 |     # test bismark_mapping
 54 |     test_cmd(tool_name='bismark_mapping', cmd_list=['bismark_mapping', '--version'])
 55 |     if config['mode'] != 'm3c':
 56 |         # test bowtie2
 57 |         test_cmd(tool_name='bowtie2', cmd_list=['bowtie2', '--version'])
 58 |     else:
 59 |         # test bowtie1
 60 |         test_cmd(tool_name='bowtie', cmd_list=['bowtie', '--version'])
 61 |     # test pigz
 62 |     test_cmd(tool_name='pigz', cmd_list=['pigz', '-V'])
 63 | 
 64 |     bismark_dir = pathlib.Path(config['bismark_reference'])
 65 |     if not bismark_dir.is_dir():
 66 |         raise TypeError(f"Bismark reference must be a directory contain a sub-dir named Bisulfite_Genome, "
 67 |                         f"generated by bismark_genome_preparation. Got a file path")
 68 |     if not bismark_dir.exists():
 69 |         raise FileNotFoundError(f"Bismark reference directory not found. "
 70 |                                 f"Path in the config.ini is {bismark_dir}")
 71 | 
 72 |     allc_ref_fasta = pathlib.Path(config['reference_fasta'])
 73 |     allc_ref_fai = pathlib.Path(config['reference_fasta'] + '.fai')
 74 |     if not allc_ref_fasta.exists():
 75 |         raise FileNotFoundError(f"Reference fasta for ALLC generation not found. "
 76 |                                 f"Path in the config.ini is {allc_ref_fasta}")
 77 |     if not allc_ref_fai.exists():
 78 |         raise FileNotFoundError(f".fai index for reference fasta not found. "
 79 |                                 f"Path of fadix should be {allc_ref_fai}. "
 80 |                                 f"You can use 'samtools fadix {allc_ref_fasta}' to generate.")
 81 |     return
 82 | 
 83 | 
 84 | def parse_index_fasta(fasta_path):
 85 |     records = {}
 86 |     with open(fasta_path) as f:
 87 |         key_line = True
 88 |         for line in f:
 89 |             if key_line:
 90 |                 key = line.lstrip('>').rstrip('\n')
 91 |                 key_line = False
 92 |             else:
 93 |                 value = line.lstrip('^').rstrip('\n')
 94 |                 records[key] = value
 95 |                 key_line = True
 96 |     return records
 97 | 
 98 | 
 99 | def command_runner(commands, runner=None, cpu=1):
100 |     if runner is None:
101 |         from functools import partial
102 |         runner = partial(subprocess.run,
103 |                          stdout=subprocess.PIPE,
104 |                          stderr=subprocess.PIPE,
105 |                          encoding='utf8',
106 |                          shell=True,
107 |                          check=True)
108 |     if cpu <= 1:
109 |         for command in commands:
110 |             runner(command)
111 |     else:
112 |         with ProcessPoolExecutor(cpu) as pool:
113 |             futures = []
114 |             for command in commands:
115 |                 future = pool.submit(runner, command)
116 |                 futures.append(future)
117 | 
118 |             for future in as_completed(futures):
119 |                 try:
120 |                     future.result()
121 |                 except subprocess.CalledProcessError as e:
122 |                     print("Got error in fastq_qc, command was:")
123 |                     print(command)
124 |                     print(e.stdout)
125 |                     print(e.stderr)
126 |                     raise e
127 |     return
128 | 
129 | 
130 | def snakemake(workdir, snakefile, cores):
131 |     try:
132 |         subprocess.run([
133 |             'snakemake', '-d', str(workdir), '--snakefile',
134 |             str(snakefile), '--cores',
135 |             str(cores)
136 |         ],
137 |             check=True,
138 |             stdin=subprocess.PIPE,
139 |             stdout=subprocess.PIPE,
140 |             encoding='utf8')
141 |     except subprocess.CalledProcessError as e:
142 |         print(e.stdout)
143 |         print(e.stderr)
144 |         raise e
145 |     return
146 | 
147 | 
148 | def get_barcode_version(output_dir):
149 |     fastq_dir = pathlib.Path(output_dir) / 'fastq'
150 |     with open(fastq_dir / '.barcode_version') as f:
151 |         return f.read()
152 | 
153 | 
154 | def get_mode(output_dir):
155 |     fastq_dir = pathlib.Path(output_dir) / 'fastq'
156 |     with open(fastq_dir / '.mode') as f:
157 |         return f.read()
158 | 
159 | 
160 | MAPPING_MODE_CHOICES = ['mct', 'mc', 'm3c', '4m']
161 | IUPAC_TABLE = {
162 |     'A': 'A',
163 |     'T': 'T',
164 |     'C': 'C',
165 |     'G': 'G',
166 |     'R': 'AG',
167 |     'Y': 'CT',
168 |     'S': 'GC',
169 |     'W': 'AT',
170 |     'K': 'GT',
171 |     'M': 'AC',
172 |     'B': 'CGT',
173 |     'D': 'AGT',
174 |     'H': 'ATC',
175 |     'V': 'ACG',
176 |     'N': 'ATCGN'
177 | }
178 | 
179 | 
180 | @functools.lru_cache(maxsize=100)
181 | def parse_mc_pattern(pattern: str) -> set:
182 |     """
183 |     parse mC context pattern
184 |     """
185 |     # IUPAC DNA abbr. table
186 |     all_pos_list = []
187 |     pattern = pattern.upper()
188 |     for base in pattern:
189 |         try:
190 |             all_pos_list.append(IUPAC_TABLE[base])
191 |         except KeyError:
192 |             raise KeyError(f'Base {base} is not in IUPAC table.')
193 |     context_set = set([''.join(i) for i in itertools.product(*all_pos_list)])
194 |     return context_set
195 | 


--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/doc/TODO_GenerateMCDS.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": []
 9 |   }
10 |  ],
11 |  "metadata": {
12 |   "kernelspec": {
13 |    "display_name": "Python 3",
14 |    "language": "python",
15 |    "name": "python3"
16 |   },
17 |   "language_info": {
18 |    "codemirror_mode": {
19 |     "name": "ipython",
20 |     "version": 3
21 |    },
22 |    "file_extension": ".py",
23 |    "mimetype": "text/x-python",
24 |    "name": "python",
25 |    "nbconvert_exporter": "python",
26 |    "pygments_lexer": "ipython3",
27 |    "version": "3.7.3"
28 |   },
29 |   "toc": {
30 |    "base_numbering": 1,
31 |    "nav_menu": {},
32 |    "number_sections": true,
33 |    "sideBar": true,
34 |    "skip_h1_title": false,
35 |    "title_cell": "Table of Contents",
36 |    "title_sidebar": "Contents",
37 |    "toc_cell": false,
38 |    "toc_position": {},
39 |    "toc_section_display": true,
40 |    "toc_window_display": false
41 |   }
42 |  },
43 |  "nbformat": 4,
44 |  "nbformat_minor": 2
45 | }
46 | 


--------------------------------------------------------------------------------
/doc/TODO_overview.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": []
 9 |   }
10 |  ],
11 |  "metadata": {
12 |   "hide_input": false,
13 |   "kernelspec": {
14 |    "display_name": "Python 3",
15 |    "language": "python",
16 |    "name": "python3"
17 |   },
18 |   "language_info": {
19 |    "codemirror_mode": {
20 |     "name": "ipython",
21 |     "version": 3
22 |    },
23 |    "file_extension": ".py",
24 |    "mimetype": "text/x-python",
25 |    "name": "python",
26 |    "nbconvert_exporter": "python",
27 |    "pygments_lexer": "ipython3",
28 |    "version": "3.7.3"
29 |   },
30 |   "toc": {
31 |    "base_numbering": 1,
32 |    "nav_menu": {},
33 |    "number_sections": true,
34 |    "sideBar": true,
35 |    "skip_h1_title": true,
36 |    "title_cell": "Table of Contents",
37 |    "title_sidebar": "Contents",
38 |    "toc_cell": false,
39 |    "toc_position": {},
40 |    "toc_section_display": true,
41 |    "toc_window_display": true
42 |   }
43 |  },
44 |  "nbformat": 4,
45 |  "nbformat_minor": 2
46 | }
47 | 


--------------------------------------------------------------------------------
/doc/TechBasic.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Technology Basics\n",
  8 |     "\n",
  9 |     "## What does yap do\n",
 10 |     "\n",
 11 |     "### All sequencing technologies are methylation based\n",
 12 |     "\n",
 13 |     "All the technologies covered by yap is based on snmC-seq2, here I visualized the steps and barcoding strategies we used currently in Ecker Lab for snmC-seq2. This is basically all yap mapping is based on.\n",
 14 |     "\n",
 15 |     "### Multiplex cell in preparing library\n",
 16 |     "\n",
 17 |     "When preparing library, the most important part related to mapping is the cell multiplexing:\n",
 18 |     "\n",
 19 |     "1. use random primer (inside pipeline, **index_name** corresponding to each random primer)\n",
 20 |     "2. use illumina P5/P7 primer pair (inside pipeline, **primer_name** and **uid** corresponding to each illumina P5/P7 primer pair)\n",
 21 |     "    \n",
 22 |     "### Demultiplex cell in mapping\n",
 23 |     "\n",
 24 |     "When mapping use yap (**notice the reverse order**):\n",
 25 |     "\n",
 26 |     "1. prepare samplesheet for bcl2fastq, use bcl2fastq to demultiplex illumina P5/P7 primer pair. Each result file set got a **uid**, that **uid** corresponding to the illumina primer pair throughout the pipeline.\n",
 27 |     "2. use cutadapt to demultiplex random primer. Each result file set got a **index_name**, that **index_name** corresponding to the random primer throughout the pipeline.\n",
 28 |     "3. **uid** + **index_name** uniquely determine a cell within the same pool on MiSeq or NovaSeq.\n",
 29 |     "4. After getting single cell files, yap just do mapping steps for each individual cells, and then summarize all the mapping stats for the whole library.\n",
 30 |     "\n",
 31 |     "\n",
 32 |     "## Important Reference\n",
 33 |     "\n",
 34 |     "- **snmC-seq original publication**: [Luo, Chongyuan, Christopher L. Keown, Laurie Kurihara, Jingtian Zhou, Yupeng He, Junhao Li, Rosa Castanon, et al. 2017. “Single-Cell Methylomes Identify Neuronal Subtypes and Regulatory Elements in Mammalian Cortex.” Science 357 (6351): 600–604.](http://dx.doi.org/10.1126/science.aan3351)\n",
 35 |     "- **snmC-seq2**: [Luo, Chongyuan, Angeline Rivkin, Jingtian Zhou, Justin P. Sandoval, Laurie Kurihara, Jacinta Lucero, Rosa Castanon, et al. 2018. “Robust Single-Cell DNA Methylome Profiling with snmC-seq2.” Nature Communications 9 (1): 3824.](http://dx.doi.org/10.1038/s41467-018-06355-2)\n",
 36 |     "- **snmCT-seq**: [Luo, Chongyuan, Hanqing Liu, Bang-An Wang, Anna Bartlett, Angeline Rivkin, Joseph R. Nery, and Joseph R. Ecker. 2018. “Multi-Omic Profiling of Transcriptome and DNA Methylome in Single Nuclei with Molecular Partitioning.” bioRxiv. https://doi.org/10.1101/434845.](http://dx.doi.org/10.1101/434845)\n",
 37 |     "\n",
 38 |     "\n",
 39 |     "## snmC-seq2 Library\n",
 40 |     "\n",
 41 |     "### Molecular steps\n",
 42 |     "![molecularsteps](files/molecularsteps.png)\n",
 43 |     "\n",
 44 |     "### Reads and Primer Structure\n",
 45 |     "\n",
 46 |     "![primerstructure](files/primerstructure.png)\n",
 47 |     "\n",
 48 |     "## Cell Multiplexing\n",
 49 |     "\n",
 50 |     "### V1 (8-random-index)\n",
 51 |     "\n",
 52 |     "![v1barcode](files/v1barcode.png)\n",
 53 |     "\n",
 54 |     "\n",
 55 |     "### V2 (384-random-index)\n",
 56 |     "![v2barcode](files/v2barcode.png)\n"
 57 |    ]
 58 |   }
 59 |  ],
 60 |  "metadata": {
 61 |   "hide_input": false,
 62 |   "kernel_info": {
 63 |    "name": "python3"
 64 |   },
 65 |   "kernelspec": {
 66 |    "display_name": "Python 3",
 67 |    "language": "python",
 68 |    "name": "python3"
 69 |   },
 70 |   "language_info": {
 71 |    "codemirror_mode": {
 72 |     "name": "ipython",
 73 |     "version": 3
 74 |    },
 75 |    "file_extension": ".py",
 76 |    "mimetype": "text/x-python",
 77 |    "name": "python",
 78 |    "nbconvert_exporter": "python",
 79 |    "pygments_lexer": "ipython3",
 80 |    "version": "3.7.3"
 81 |   },
 82 |   "nteract": {
 83 |    "version": "0.12.3"
 84 |   },
 85 |   "toc": {
 86 |    "base_numbering": 1,
 87 |    "nav_menu": {},
 88 |    "number_sections": true,
 89 |    "sideBar": true,
 90 |    "skip_h1_title": true,
 91 |    "title_cell": "Table of Contents",
 92 |    "title_sidebar": "Contents",
 93 |    "toc_cell": true,
 94 |    "toc_position": {},
 95 |    "toc_section_display": true,
 96 |    "toc_window_display": false
 97 |   },
 98 |   "varInspector": {
 99 |    "cols": {
100 |     "lenName": 16,
101 |     "lenType": 16,
102 |     "lenVar": 40
103 |    },
104 |    "kernels_config": {
105 |     "python": {
106 |      "delete_cmd_postfix": "",
107 |      "delete_cmd_prefix": "del ",
108 |      "library": "var_list.py",
109 |      "varRefreshCmd": "print(var_dic_list())"
110 |     },
111 |     "r": {
112 |      "delete_cmd_postfix": ") ",
113 |      "delete_cmd_prefix": "rm(",
114 |      "library": "var_list.r",
115 |      "varRefreshCmd": "cat(var_dic_list()) "
116 |     }
117 |    },
118 |    "types_to_exclude": [
119 |     "module",
120 |     "function",
121 |     "builtin_function_or_method",
122 |     "instance",
123 |     "_Feature"
124 |    ],
125 |    "window_display": false
126 |   }
127 |  },
128 |  "nbformat": 4,
129 |  "nbformat_minor": 2
130 | }
131 | 


--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 | 
17 | 
18 | # -- Project information -----------------------------------------------------
19 | 
20 | project = 'yap'
21 | copyright = '2019, Hanqing Liu'
22 | author = 'Hanqing Liu'
23 | 
24 | 
25 | # -- General configuration ---------------------------------------------------
26 | 
27 | # Add any Sphinx extension module names here, as strings. They can be
28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
29 | # ones.
30 | extensions = [
31 | 'nbsphinx', 'sphinx.ext.mathjax']
32 | 
33 | # Add any paths that contain templates here, relative to this directory.
34 | templates_path = ['_templates']
35 | 
36 | # List of patterns, relative to source directory, that match files and
37 | # directories to ignore when looking for source files.
38 | # This pattern also affects html_static_path and html_extra_path.
39 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']
40 | 
41 | 
42 | # -- Options for HTML output -------------------------------------------------
43 | 
44 | # The theme to use for HTML and HTML Help pages.  See the documentation for
45 | # a list of builtin themes.
46 | #
47 | html_theme = 'default'
48 | 
49 | # Add any paths that contain custom static files (such as style sheets) here,
50 | # relative to this directory. They are copied after the builtin static files,
51 | # so a file named "default.css" will overwrite the builtin "default.css".
52 | html_static_path = ['_static']
53 | master_doc = 'index'


--------------------------------------------------------------------------------
/doc/demultiplex.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Demultiplex (start from sequencing)\n",
  8 |     "\n",
  9 |     "## Related Commands\n",
 10 |     "```shell\n",
 11 |     "# Demultiplex\n",
 12 |     "yap demultiplex\n",
 13 |     "```"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "markdown",
 18 |    "metadata": {},
 19 |    "source": [
 20 |     "## Input of this step\n",
 21 |     "In the previous step, we generated sample sheet based on plate information file, and then used illumina bcl2fastq to demultiplex the sequencing results into **raw FASTQ file sets**. This step only demultiplexed the barcode on the illumina primers, therefore, each set of FASTQ file still contain reads mixed from multiple cells. \n",
 22 |     "\n",
 23 |     "Depending on the number of random index used in each barcode version, in V1, each set contain reads from eight cells; in V2, each set contain reads from 384 cells."
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "## Output of this step\n",
 31 |     "\n",
 32 |     "- This step demultiplex raw FASTQ files into single cell raw FASTQ files.\n",
 33 |     "- The random index sequence will be removed from the reads\n",
 34 |     "- Each cell will have two fastq files in the output directory, with fixed name pattern:\n",
 35 |     "    - `{cell_id}-R1.fq.gz` for R1\n",
 36 |     "    - `{cell_id}-R2.fq.gz` for R2"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "## Usage"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 12,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "name": "stdout",
 53 |      "output_type": "stream",
 54 |      "text": [
 55 |       "usage: yap demultiplex [-h] --fastq_pattern FASTQ_PATTERN --output_dir\r\n",
 56 |       "                       OUTPUT_DIR --barcode_version {V1,V2} --mode\r\n",
 57 |       "                       {mc,mct,mc2t} --cpu CPU\r\n",
 58 |       "\r\n",
 59 |       "optional arguments:\r\n",
 60 |       "  -h, --help            show this help message and exit\r\n",
 61 |       "\r\n",
 62 |       "Required inputs:\r\n",
 63 |       "  --fastq_pattern FASTQ_PATTERN\r\n",
 64 |       "                        FASTQ files with wildcard to match all bcl2fastq\r\n",
 65 |       "                        results, pattern with wildcard must be quoted.\r\n",
 66 |       "                        (default: None)\r\n",
 67 |       "  --output_dir OUTPUT_DIR\r\n",
 68 |       "                        Pipeline output directory, will be created\r\n",
 69 |       "                        recursively. (default: None)\r\n",
 70 |       "  --barcode_version {V1,V2}\r\n",
 71 |       "                        Barcode version of this library, V1 for the 8 random\r\n",
 72 |       "                        index, V2 for the 384 random index. (default: None)\r\n",
 73 |       "  --mode {mc,mct,mc2t}  Technology used in this library. (default: None)\r\n",
 74 |       "  --cpu CPU             Number of cores to use. Max is 12. (default: None)\r\n"
 75 |      ]
 76 |     }
 77 |    ],
 78 |    "source": [
 79 |     "!yap demultiplex -h"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "### Notes\n",
 87 |     "- **Remember to use \"\" to quote the fastq pattern like this:\n",
 88 |     "    `--fastq_pattern` \"path/pattern/to/your/bcl2fastq/results/*fastq.gz\"**\n",
 89 |     "- An error will occor if `output_dir` already exists."
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "## Runtime notes for NovaSeq\n",
 97 |     "\n",
 98 |     "- This command run demultiplex directly, the runtime is roughly ~8 Gb per CPU per hour. For a typical eight-plate NovaSeq library (500GB), the runtime using 12 CPU is ~5-7 hours depending on the file system loads.\n",
 99 |     "- This command creads lots of files simutaniously, in order to prevent too much berden on the file system, I set default and max CPU = 12"
100 |    ]
101 |   }
102 |  ],
103 |  "metadata": {
104 |   "kernelspec": {
105 |    "display_name": "Python 3",
106 |    "language": "python",
107 |    "name": "python3"
108 |   },
109 |   "language_info": {
110 |    "codemirror_mode": {
111 |     "name": "ipython",
112 |     "version": 3
113 |    },
114 |    "file_extension": ".py",
115 |    "mimetype": "text/x-python",
116 |    "name": "python",
117 |    "nbconvert_exporter": "python",
118 |    "pygments_lexer": "ipython3",
119 |    "version": "3.7.6"
120 |   }
121 |  },
122 |  "nbformat": 4,
123 |  "nbformat_minor": 4
124 | }
125 | 


--------------------------------------------------------------------------------
/doc/files/MappingPipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/MappingPipeline.png


--------------------------------------------------------------------------------
/doc/files/molecularsteps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/molecularsteps.png


--------------------------------------------------------------------------------
/doc/files/primerstructure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/primerstructure.png


--------------------------------------------------------------------------------
/doc/files/v1barcode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/v1barcode.png


--------------------------------------------------------------------------------
/doc/files/v2barcode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/v2barcode.png


--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
 1 | .. yap documentation master file, created by
 2 |    sphinx-quickstart on Fri Sep 13 16:24:00 2019.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | YAP documentation
 7 | ===============================
 8 | Please read the new documentation of YAP here:
 9 | 
10 | https://hq-1.gitbook.io/mc/
11 | 
12 | - Code: https://github.com/lhqing/cemba_data
13 | - Author: Hanqing Liu, hanliu@salk.edu
14 | 


--------------------------------------------------------------------------------
/doc/installation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "source": [
  6 |     "# Installation\n",
  7 |     "##  Setup Conda and Mapping Environment\n",
  8 |     "###  check if conda is installed\n",
  9 |     "```shell\n",
 10 |     "conda info\n",
 11 |     "```\n",
 12 |     "\n",
 13 |     "###  if not installed, install either miniconda or anaconda.\n",
 14 |     "- IMPORTANT: select python 3\n",
 15 |     "- miniconda (recommend if you don't use python a lot): https://conda.io/miniconda.html\n",
 16 |     "- anaconda (larger): https://www.anaconda.com/download/\n",
 17 |     "\n",
 18 |     "\n",
 19 |     "### Set up bioconda\n",
 20 |     "[bioconda](https://bioconda.github.io/) is a package manager for most popular biological tools, its wonderful!\n",
 21 |     "```shell\n",
 22 |     "# run these command to add bioconda into your conda channel, the order of these 3 line matters\n",
 23 |     "conda config --add channels defaults\n",
 24 |     "conda config --add channels bioconda\n",
 25 |     "conda config --add channels conda-forge\n",
 26 |     "```\n",
 27 |     "\n",
 28 |     "###  Create Mapping Environment \n",
 29 |     "you can change the name into any desired name, but python version need to be 3.7\n",
 30 |     "```shell\n",
 31 |     "conda create --name mapping python==3.7\n",
 32 |     "```\n",
 33 |     "\n",
 34 |     "### why using stand alone conda environment?\n",
 35 |     "- Using environment make sure all the mapping related package is handled by conda and pip in a stand alone place\n",
 36 |     "- It will not impact any of your other installed packages and vise versa.\n",
 37 |     "- This make sure the stability of pipeline.\n",
 38 |     "- The only drawback of using environment is **you need to activate environment every time**, because everything is only installed for that environment.\n",
 39 |     "- See [here](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) for more info about conda environment.\n",
 40 |     "\n",
 41 |     "###  activate new environment\n",
 42 |     "**remember to run this command EVERY TIME before using the pipeline.**\n",
 43 |     "\n",
 44 |     "```shell\n",
 45 |     "source activate mapping\n",
 46 |     "```"
 47 |    ],
 48 |    "metadata": {
 49 |     "collapsed": false
 50 |    }
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "## Install packages\n",
 57 |     "\n",
 58 |     "###  install packages into new environment\n",
 59 |     "```shell\n",
 60 |     "conda install -n mapping bedtools=2.27 bismark=0.20 bowtie2=2.3 cutadapt=1.18 fastqc=0.11 picard=2.18 samtools=1.9 htslib=1.9 pysam=0.15\n",
 61 |     "# for mCT mapping\n",
 62 |     "conda install -n mapping STAR=2.7\n",
 63 |     "\n",
 64 |     "# for generating ALLC files (single cell base level methylation table)\n",
 65 |     "# ALLCools is still in developing, right now only support install via github.\n",
 66 |     "git clone https://github.com/lhqing/ALLCools.git\n",
 67 |     "cd ALLCools\n",
 68 |     "pip install .\n",
 69 |     "```\n",
 70 |     "\n",
 71 |     "###  clone cemba-data repo and install it\n",
 72 |     "this step will take some time, a few packages will be installed into this environment\n",
 73 |     "```shell\n",
 74 |     "git clone https://github.com/lhqing/cemba_data.git\n",
 75 |     "cd cemba_data\n",
 76 |     "pip install .\n",
 77 |     "```\n",
 78 |     "\n",
 79 |     "###  test if installed correctly\n",
 80 |     "```shell\n",
 81 |     "yap -h\n",
 82 |     "```\n",
 83 |     "\n",
 84 |     "## update the package\n",
 85 |     "**Again, remember you should do this in mapping environment**\n",
 86 |     "\n",
 87 |     "```shell\n",
 88 |     "source activate mapping\n",
 89 |     "# or source activate your_environment_name\n",
 90 |     "\n",
 91 |     "cd /path/to/original/dir/you/clone/from/github/cemba_data\n",
 92 |     "git pull\n",
 93 |     "pip install .\n",
 94 |     "```"
 95 |    ]
 96 |   }
 97 |  ],
 98 |  "metadata": {
 99 |   "hide_input": false,
100 |   "kernelspec": {
101 |    "display_name": "Python 3",
102 |    "language": "python",
103 |    "name": "python3"
104 |   },
105 |   "language_info": {
106 |    "codemirror_mode": {
107 |     "name": "ipython",
108 |     "version": 3
109 |    },
110 |    "file_extension": ".py",
111 |    "mimetype": "text/x-python",
112 |    "name": "python",
113 |    "nbconvert_exporter": "python",
114 |    "pygments_lexer": "ipython3",
115 |    "version": "3.7.3"
116 |   },
117 |   "toc": {
118 |    "base_numbering": 1,
119 |    "nav_menu": {},
120 |    "number_sections": true,
121 |    "sideBar": true,
122 |    "skip_h1_title": true,
123 |    "title_cell": "Table of Contents",
124 |    "title_sidebar": "Contents",
125 |    "toc_cell": true,
126 |    "toc_position": {},
127 |    "toc_section_display": true,
128 |    "toc_window_display": true
129 |   },
130 |   "varInspector": {
131 |    "cols": {
132 |     "lenName": 16,
133 |     "lenType": 16,
134 |     "lenVar": 40
135 |    },
136 |    "kernels_config": {
137 |     "python": {
138 |      "delete_cmd_postfix": "",
139 |      "delete_cmd_prefix": "del ",
140 |      "library": "var_list.py",
141 |      "varRefreshCmd": "print(var_dic_list())"
142 |     },
143 |     "r": {
144 |      "delete_cmd_postfix": ") ",
145 |      "delete_cmd_prefix": "rm(",
146 |      "library": "var_list.r",
147 |      "varRefreshCmd": "cat(var_dic_list()) "
148 |     }
149 |    },
150 |    "types_to_exclude": [
151 |     "module",
152 |     "function",
153 |     "builtin_function_or_method",
154 |     "instance",
155 |     "_Feature"
156 |    ],
157 |    "window_display": false
158 |   },
159 |   "pycharm": {
160 |    "stem_cell": {
161 |     "cell_type": "raw",
162 |     "source": [],
163 |     "metadata": {
164 |      "collapsed": false
165 |     }
166 |    }
167 |   }
168 |  },
169 |  "nbformat": 4,
170 |  "nbformat_minor": 2
171 | }


--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/env.yaml:
--------------------------------------------------------------------------------
 1 | name: base
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |  - python=3.8
 8 |  - pip
 9 |  - jupyter
10 |  - snakemake
11 |  - pytables
12 |  - seaborn
13 |  - xarray
14 |  - dask
15 |  - mamba
16 |  - natsort
17 |  - netCDF4
18 |  - networkx
19 |  - opentsne
20 |  - plotly
21 |  - pynndescent
22 |  - leidenalg
23 |  - anndata
24 |  - scanpy
25 |  - scikit-learn
26 |  - statsmodels
27 |  - xarray
28 |  - yaml
29 |  - zarr
30 |  - biopython
31 |  - cutadapt
32 |  - bismark=0.20
33 |  - bowtie2
34 |  - bowtie
35 |  - samtools
36 |  - picard
37 |  - bedtools
38 |  - htslib>=1.9
39 |  - pysam
40 |  - pybedtools
41 |  - pyBigWig
42 |  - star=2.7.3a
43 |  - subread=2.0
44 |  - rpy2
45 |  - pip:
46 |     - papermill
47 |     - imblearn
48 |     - allcools
49 |     - schicluster
50 |     - cemba_data
51 | 


--------------------------------------------------------------------------------
/hisat3n_env.yml:
--------------------------------------------------------------------------------
 1 | name: base
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |  - python=3.9
 8 |  - pip
 9 |  - jupyter
10 |  - snakemake
11 |  - pytables
12 |  - seaborn
13 |  - yaml
14 |  - cutadapt
15 |  - samtools
16 |  - picard
17 |  - bedtools
18 |  - htslib=1.15
19 |  - pysam
20 |  - pybedtools
21 |  - pyBigWig
22 |  - pip:
23 |     - papermill
24 |     - allcools
25 |     - cemba_data
26 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"]
3 | 
4 | [tool.setuptools_scm]
5 | write_to = 'cemba_data/_version.py'


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | papermill
2 | ipykernel
3 | nbsphinx
4 | sphinx>=3
5 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='cemba-data',
 5 |     use_scm_version=True,
 6 |     setup_requires=['setuptools_scm'],
 7 |     author='Hanqing Liu',
 8 |     author_email='hanliu@salk.edu',
 9 |     description='Pipelines for single nucleus methylome and multi-omic dataset.',
10 |     long_description=open('README.md').read(),
11 |     long_description_content_type='text/markdown',
12 |     url='https://github.com/lhqing/cemba_data',
13 |     license='MIT',
14 |     classifiers=[
15 |         "License :: OSI Approved :: MIT License",
16 |         "Programming Language :: Python :: 3",
17 |         "Programming Language :: Python :: 3.7",
18 |     ],
19 |     packages=find_packages(exclude=('doc',)),
20 |     include_package_data=True,
21 |     package_data={
22 |         '': ['*.txt', '*.tsv', '*.csv', '*.fa', '*Snakefile', '*ipynb']
23 |     },
24 |     install_requires=['pandas>=1.0',
25 |                       'numpy',
26 |                       'seaborn',
27 |                       'matplotlib',
28 |                       'papermill',
29 |                       'dnaio',
30 |                       'pysam'],
31 |     entry_points={
32 |         'console_scripts': ['yap=cemba_data.__main__:main',
33 |                             'yap-internal=cemba_data._yap_internal_cli_:internal_main',
34 |                             'yap-hisat3n=cemba_data.hisat3n.cli:main'],
35 |     }
36 | )
37 | 


--------------------------------------------------------------------------------