├── .github └── workflows │ └── publish.yaml ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── cemba_data ├── __init__.py ├── __main__.py ├── _yap_internal_cli_.py ├── bulk │ ├── Snakefile_template │ │ ├── __init__.py │ │ └── mc_bulk.Snakefile │ ├── __init__.py │ ├── atac_bulk.py │ ├── mc_bulk.py │ ├── mc_bulk_multigroup │ │ ├── __init__.py │ │ ├── mc_bulk_multigroup.py │ │ └── mc_bulk_multigroup_template.py │ └── mct_bulk.py ├── demultiplex │ ├── __init__.py │ ├── demultiplex.py │ ├── fastq_dataframe.py │ └── plateinfo_and_samplesheet.py ├── dmr │ ├── __init__.py │ ├── dmrseq │ │ ├── DMRseq.ipynb │ │ └── __init__.py │ └── dss │ │ ├── DSS.MultiGroup.SingleRegionDML.ipynb │ │ ├── DSS.TwoGroup.SingleRegionDML.ipynb │ │ ├── MultiGroup.py │ │ ├── TwoGroup.py │ │ └── __init__.py ├── files │ ├── V1_i7_i5_index.tsv │ ├── V2_i7_i5_index.tsv │ ├── __init__.py │ ├── default_config │ │ ├── __init__.py │ │ ├── mapping_config_4m.ini │ │ ├── mapping_config_m3c.ini │ │ ├── mapping_config_mc.ini │ │ ├── mapping_config_mct-nome.ini │ │ ├── mapping_config_mct.ini │ │ └── mapping_config_nome.ini │ ├── mapping_summary_template │ │ ├── 4m_template.ipynb │ │ ├── __init__.py │ │ ├── m3c_template.ipynb │ │ ├── mc_template.ipynb │ │ └── mct_template.ipynb │ ├── plate_info_template_v1.txt │ ├── plate_info_template_v2.txt │ ├── random_index_v1.fa │ ├── random_index_v2 │ │ ├── __init__.py │ │ ├── random_index_v2.fa │ │ ├── random_index_v2.multiplex_group_1.fa │ │ ├── random_index_v2.multiplex_group_2.fa │ │ ├── random_index_v2.multiplex_group_3.fa │ │ ├── random_index_v2.multiplex_group_4.fa │ │ ├── random_index_v2.multiplex_group_5.fa │ │ └── random_index_v2.multiplex_group_6.fa │ ├── sample_sheet_header.txt │ ├── sbatch_template_schicluster.txt │ └── sbatch_template_yap.txt ├── hisat3n │ ├── __init__.py │ ├── cli.py │ ├── config │ │ ├── __init__.py │ │ ├── gcp.md │ │ ├── hisat-3n-build.sh │ │ ├── hisat3n_mapping_env.yaml │ │ └── vm_init.sh │ ├── 
hisat3n_general.py │ ├── hisat3n_m3c.py │ ├── hisat3n_mct.py │ ├── snakefile │ │ ├── __init__.py │ │ ├── m3c.smk │ │ ├── mc-multi.smk │ │ ├── mc-multi_sort_input.smk │ │ ├── mc.smk │ │ ├── mct-multi.smk │ │ └── mct.smk │ ├── stats_col_names.py │ ├── stats_parser.py │ ├── summary.py │ └── utilities.py ├── mapping │ ├── Snakefile_template │ │ ├── 4m.Snakefile │ │ ├── __init__.py │ │ ├── m3c.Snakefile │ │ ├── mc.Snakefile │ │ └── mct.Snakefile │ ├── __init__.py │ ├── config.py │ ├── m3c │ │ └── __init__.py │ ├── mct │ │ ├── __init__.py │ │ ├── mct_bismark_bam_filter.py │ │ └── mct_star_bam_filter.py │ ├── pipelines │ │ ├── _4m.py │ │ ├── __init__.py │ │ ├── m3c.py │ │ ├── mc.py │ │ └── mct.py │ ├── stats │ │ ├── _4m.py │ │ ├── __init__.py │ │ ├── m3c.py │ │ ├── mc.py │ │ ├── mct.py │ │ ├── plate_info.py │ │ ├── plot.py │ │ └── utilities.py │ └── test_environment.py ├── qsub.py ├── sbatch.py ├── snm3C │ ├── __init__.py │ ├── prepare_dataset.py │ └── prepare_impute.py └── utilities.py ├── doc ├── Makefile ├── Mapping.ipynb ├── MappingSummary.ipynb ├── PipelineInput.ipynb ├── PlateInfoAndSampleSheet.ipynb ├── TODO_GenerateMCDS.ipynb ├── TODO_overview.ipynb ├── TechBasic.ipynb ├── archive │ └── MakeFastqDataframe.ipynb ├── conf.py ├── demultiplex.ipynb ├── files │ ├── MappingPipeline.png │ ├── molecularsteps.png │ ├── primerstructure.png │ ├── v1barcode.png │ └── v2barcode.png ├── index.rst ├── installation.ipynb └── make.bat ├── env.yaml ├── hisat3n_env.yml ├── pyproject.toml ├── requirements.txt └── setup.py /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 
5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | push: 13 | # Sequence of patterns matched against refs/tags 14 | tags: 15 | - "v*" # Push events to matching v*, i.e. v1.0, v20.15.10 16 | 17 | permissions: 18 | contents: read 19 | 20 | jobs: 21 | deploy: 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | # build python package and deploy to pypi 26 | - uses: actions/checkout@v3 27 | - name: Set up Python 28 | uses: actions/setup-python@v3 29 | with: 30 | python-version: "3.8" 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip wheel twine build 34 | pip install build 35 | - name: Build package 36 | run: python -m build 37 | - name: Publish package 38 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 39 | with: 40 | user: __token__ 41 | password: ${{ secrets.PYPI_API_TOKEN_CEMBA_DATA }} 42 | 43 | # # build docker image and push to GCR 44 | # - uses: actions/checkout@v3 45 | # - uses: google-github-actions/setup-gcloud@v0 46 | # - name: Get the version 47 | # id: get_tag_name 48 | # run: echo ::set-output name=GIT_TAG_NAME::${GITHUB_REF/refs\/tags\//} 49 | # - uses: RafikFarhad/push-to-gcr-github-action@v4 50 | # with: 51 | # gcloud_service_key: ${{ secrets.GCLOUD_SERVICE_KEY }} 52 | # registry: gcr.io 53 | # project_id: prod-635e 54 | # image_name: wmb 55 | # image_tag: ${{ steps.get_tag_name.outputs.GIT_TAG_NAME}},latest 56 | # dockerfile: ./Dockerfile 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .idea/ 6 | .DS_Store 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 
15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | cemba_data/_version.py 107 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.7' 4 | script: true 5 | deploy: 6 | provider: pypi 7 | username: __token__ 8 | on: 9 | tags: true 10 | password: 11 | secure: 
KGUWzdUpJgJGR4wOz8w3o16zscEFlRWF5Kdm3PSLTDuDY6OpNK0L4lywyMuhh178hCRZ72/5FmzXYoBXI1g2IODEvpWxmvbFe3kF8FPPD3BfgYsIsF0i4pNHPpdmIxZeuBaymf+SctVNY4o81mup7n3T05P9l8mATDOnSgP+5WLoHAk+ie7D9/H386xueGxfcKuUmzyZRlqUsjs7COXgDiG9VoyZi4KvUwlZz8+jriYjs9qL/t1rN2Mg0ZCDCzGghNDo36tnvRAX+TqGACj4xURXydCJGPx6hUPTJkbydIhGlvaVblCO8FYDsLuedUIblU5SMAUklkhh48VoR1k5+l2mxCkAOLCPYodZ2AS+wNhF5yMXbOhd4zmabw0uxfpfEVZOjcDi08YzbsRFyz5f8BuFkXwjWeaUpiNG8oj/6xZBpWzGNg5cQ+ZzqHXuavf5mzgrt+K0TxBGLfQ4san0EgbBYESkUaVWRaYt0LEhmkk58Wx27Um+C7lrl2Wxs6C0rnNXzho8jiAe2ZTHva8EhG1fJuUiLZ6YA2xobZVmZlFj/J/eEoZYRvLN1dEGhWwhcgenc/1rY1NW1mllGkGVzfvB/YqZEbk9Mo9PvNej5KLg63aoYJ0/tgL/fTdBE1S1LlisZPgFHdZ2RwkB6NxazXY2qWZQkLPqJ02aEuSDb1k= 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mambaorg/micromamba:0.23.0 2 | COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml 3 | RUN micromamba install -y -f /tmp/env.yaml && \ 4 | micromamba clean --all --yes 5 | 6 | ARG MAMBA_DOCKERFILE_ACTIVATE=1 7 | 8 | RUN yap --version 9 | RUN allcools --version 10 | 11 | USER root 12 | # default argument when not provided in the --build-arg 13 | # to build the image with gcp, use 14 | # docker build --build-arg gcp=true -t mapping-gcp:tag . 
15 | ARG gcp 16 | RUN if [ "$gcp" = "true" ] ; then \ 17 | apt-get update && \ 18 | apt-get install -y curl gnupg && \ 19 | echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | \ 20 | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 21 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ 22 | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ 23 | apt-get update -y && \ 24 | apt-get install google-cloud-sdk -y; \ 25 | else echo 'no gcp install'; \ 26 | fi 27 | 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 - 2020 Hanqing Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include cemba_data *.ini 2 | include LICENSE.txt 3 | recursive-exclude * __pycache__ 4 | recursive-include cemba_data *.txt *.tsv *.csv *.fa *Snakefile *ipynb 5 | exclude doc 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](http://www.network-science.de/ascii/) 2 |
 3 |  **    **     **        *******
 4 | //**  **     ****      /**////**
 5 |  //****     **//**     /**   /**
 6 |   //**     **  //**    /*******
 7 |    /**    **********   /**////
 8 |    /**   /**//////**   /**
 9 |    /**   /**     /**   /**
10 |    //    //      //    //
11 | 
12 | 13 | # YAP (Yet Another Pipeline) 14 | Pipeline(s) for mapping and cluster-level aggregation of single nucleus methylome and multi-omic datasets. 15 | Technologies supported: 16 | - snmC-seq(1/2/3) 17 | - snmCT-seq (mC + RNA) 18 | - snmC2T-seq (mC + RNA + Chromatin Accessibility) 19 | - snm3C-seq (mC + Chromatin Conformation) 20 | - any NOMe treated version of the above 21 | 22 | [See Documentation](https://hq-1.gitbook.io/mc/) 23 | -------------------------------------------------------------------------------- /cemba_data/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import version as __version__ 2 | -------------------------------------------------------------------------------- /cemba_data/bulk/Snakefile_template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/bulk/Snakefile_template/__init__.py -------------------------------------------------------------------------------- /cemba_data/bulk/Snakefile_template/mc_bulk.Snakefile: -------------------------------------------------------------------------------- 1 | 2 | # Example (required) parameters 3 | # merge_allc_cpu = 10 4 | # mcg_context = 'CGN' 5 | # mch_context = 'CHN' 6 | # bigwig_mch_bin_size = 50 7 | # bigwig_mcg_bin_size = 1 8 | # chrom_size_path = 'PATH_TO_CHROM_SIZE_FILE' 9 | # group = 'GROUP_NAME' 10 | 11 | # the main rule is the final target 12 | rule main: 13 | input: 14 | f"{group}.{mcg_context}-both.frac.bw", 15 | f"{group}.{mcg_context}-both.cov.bw", 16 | f"{group}.{mch_context}-both.frac.bw", 17 | f"{group}.{mch_context}-both.cov.bw", 18 | f"{group}.{mcg_context}-Merge.allc.tsv.gz" 19 | 20 | 21 | # Merge ALLC 22 | rule merge_allc: 23 | input: 24 | f"{group}.allc_paths.txt" 25 | output: 26 | allc=f"{group}.allc.tsv.gz", 27 | tbi=f"{group}.allc.tsv.gz.tbi" 28 | threads: 29 | 
max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1))) 30 | resources: 31 | mem_mb=merge_allc_cpu * 5000 32 | shell: 33 | "allcools merge-allc " 34 | "--allc_paths {input} " 35 | "--output_path {output.allc} " 36 | "--chrom_size_path {chrom_size_path} " 37 | "--cpu {threads}" 38 | 39 | 40 | # Extract mCG ALLC for DMR calling 41 | rule extract_allc_mcg: 42 | input: 43 | f"{group}.allc.tsv.gz" 44 | output: 45 | allc_cg=f"{group}.{mcg_context}-Merge.allc.tsv.gz", 46 | allc_cg_tbi=f"{group}.{mcg_context}-Merge.allc.tsv.gz.tbi" 47 | threads: 48 | 1 49 | resources: 50 | mem_mb=100 51 | shell: 52 | "allcools extract-allc " 53 | "--allc_path {input} " 54 | "--output_prefix {group} " 55 | "--mc_contexts {mcg_context} " 56 | "--chrom_size_path {chrom_size_path} " 57 | "--strandness merge " 58 | "--output_format allc " 59 | "--cpu {threads}" 60 | 61 | 62 | # Generate mCH BigWig files 63 | rule bigwig_ch: 64 | input: 65 | f"{group}.allc.tsv.gz" 66 | output: 67 | f"{group}.{mch_context}-both.cov.bw", 68 | f"{group}.{mch_context}-both.frac.bw" 69 | threads: 70 | 1 71 | resources: 72 | mem_mb=100 73 | shell: 74 | "allcools allc-to-bigwig " 75 | "--allc_path {input} " 76 | "--output_prefix {group} " 77 | "--bin_size {bigwig_mch_bin_size} " 78 | "--mc_contexts {mch_context} " 79 | "--chrom_size_path {chrom_size_path}" 80 | 81 | 82 | # Generate mCG BigWig files 83 | rule bigwig_cg: 84 | input: 85 | f"{group}.allc.tsv.gz" 86 | output: 87 | f"{group}.{mcg_context}-both.cov.bw", 88 | f"{group}.{mcg_context}-both.frac.bw" 89 | threads: 90 | 1 91 | resources: 92 | mem_mb=100 93 | shell: 94 | "allcools allc-to-bigwig " 95 | "--allc_path {input} " 96 | "--output_prefix {group} " 97 | "--bin_size {bigwig_mcg_bin_size} " 98 | "--mc_contexts {mcg_context} " 99 | "--chrom_size_path {chrom_size_path}" 100 | -------------------------------------------------------------------------------- /cemba_data/bulk/__init__.py: 
def prepare_mc_bulk(allc_table,
                    output_dir,
                    chrom_size_path,
                    mch_context='CHN',
                    mcg_context='CGN',
                    bigwig_mch_bin_size=50,
                    bigwig_mcg_bin_size=1,
                    cpu_per_job=12,
                    total_cpu=60):
    """
    Prepare the snakefile for merging single-cell ALLC files into pseudo-bulk.

    One snakemake working directory is created per group, plus a ``qsub``
    directory containing the command list and a ready-to-run qsub script.

    Parameters
    ----------
    allc_table
        Path of the allc table. The allc table is a two-column tsv/csv file
        without header. The first column is the absolute ALLC file paths;
        the second column is the group name of each file.
    output_dir
        Path of the output directory, will be created if not exist.
    chrom_size_path
        Path of the chromosome size file.
    mch_context
        mCH contexts for generating the bigwig tracks.
    mcg_context
        mCG contexts for generating the bigwig tracks and merge strand.
    bigwig_mch_bin_size
        Bin size used to generate mCH bigwig.
    bigwig_mcg_bin_size
        Bin size used to generate mCG bigwig.
    cpu_per_job
        Number of CPUs to use in individual merge-allc job.
    total_cpu
        Number of CPUs to use in total.

    Raises
    ------
    FileNotFoundError
        If any ALLC path listed in `allc_table` does not exist.
    """
    snakemake_template_path = PACKAGE_DIR / 'bulk/Snakefile_template/mc_bulk.Snakefile'
    output_dir = pathlib.Path(output_dir).absolute()
    # parents=True so a nested output path does not fail on the first run
    output_dir.mkdir(exist_ok=True, parents=True)

    # leave ~10% CPU headroom for snakemake itself
    merge_allc_cpu = int(cpu_per_job / 1.1)
    total_mem_mb = cpu_per_job * 5000

    # prepare ALLC path series: index is allc_path, value is group name.
    # NOTE: read_csv(squeeze=True) was removed in pandas 2.0;
    # DataFrame.squeeze('columns') is the backward/forward-compatible form.
    if str(allc_table).endswith('csv'):
        allc_path = pd.read_csv(allc_table, index_col=0, header=None).squeeze('columns')
    else:
        allc_path = pd.read_csv(allc_table, sep='\t', index_col=0, header=None).squeeze('columns')
    file_not_exist = allc_path[allc_path.index.map(lambda i: not pathlib.Path(i).exists())]
    if file_not_exist.size != 0:
        path_str = "\n".join(file_not_exist.index.tolist())
        raise FileNotFoundError(f'{file_not_exist.size} files do not exist:'
                                f'\n{path_str}')
    # group name -> list of ALLC paths
    allc_dict = {group: paths.index.tolist() for group, paths in allc_path.groupby(allc_path)}

    # Prepare Snakefile: each group has a separate working dir and snakemake file
    snakemake_cmds = []
    for group, paths in allc_dict.items():
        group_dir = output_dir / group
        group_dir.mkdir(exist_ok=True)
        allc_list_path = group_dir / f'{group}.allc_paths.txt'
        with open(allc_list_path, 'w') as f:
            f.write('\n'.join(paths))
        # parameters are prepended to the template as plain python assignments
        snakemake_parameters = f"""
merge_allc_cpu = {merge_allc_cpu}
mch_context = '{mch_context}'
mcg_context = '{mcg_context}'
bigwig_mch_bin_size = {bigwig_mch_bin_size}
bigwig_mcg_bin_size = {bigwig_mcg_bin_size}
chrom_size_path = '{chrom_size_path}'
group = '{group}'

"""
        with open(snakemake_template_path) as f:
            snakemake_template = f.read()
        snakemake_str = snakemake_parameters + snakemake_template
        with open(group_dir / 'Snakefile', 'w') as f:
            f.write(snakemake_str)
        snakemake_cmd = f'snakemake ' \
                        f'-d {group_dir.absolute()} ' \
                        f'--snakefile {group_dir.absolute()}/Snakefile ' \
                        f'-j {cpu_per_job} ' \
                        f'--default-resources mem_mb=100 ' \
                        f'--resources mem_mb={total_mem_mb} ' \
                        f'--rerun-incomplete'
        snakemake_cmds.append(snakemake_cmd)

    # write one qsub script that submits all per-group snakemake commands
    qsub_dir = output_dir / 'qsub'
    qsub_dir.mkdir(exist_ok=True)
    with open(qsub_dir / 'snakemake_cmds.txt', 'w') as f:
        f.write('\n'.join(snakemake_cmds))
    with open(qsub_dir / 'qsub.sh', 'w') as f:
        qsub_str = f"""
yap qsub \
--command_file_path {qsub_dir / 'snakemake_cmds.txt'} \
--working_dir {qsub_dir} \
--project_name merge \
--total_cpu {total_cpu} \
--qsub_global_parms "-pe smp={cpu_per_job};-l h_vmem=5G"
"""
        f.write(qsub_str)
    print(f'Execute this command to start pipeline:\nnohup sh {qsub_dir / "qsub.sh"} &')
    return
def merge_bulk_multigroup(group_path, output_path, chrom_size_path,
                          n_cpu=10, elem_snakegroup_num = 50,
                          cate_snakegroup_num = 10, ):
    """
    Prepare snakemake workflows that merge single-cell ALLC files into
    pseudo-bulk ALLC files for several sample groupings at once.

    The group file is a csv WITH a header; its first column holds the ALLC
    paths and every other column defines one grouping ("category") of the
    samples.

    Two stages of snakefiles are written:
    1. "elem" stage: one merged ALLC per unique combination of all grouping
       columns, written under <output_path>/_elem/.
    2. "cate" stage: per-category merges that take the stage-1 outputs as
       input, so shared merging work is only done once.

    Parameters
    ----------
    group_path
        Path of the csv group file described above.
    output_path
        Output directory; created if it does not exist.
    chrom_size_path
        Path of the chromosome size file, passed through to the snakefiles.
    n_cpu
        CPUs given to each snakemake command (and merge-allc inside it).
    elem_snakegroup_num
        Number of snakemake files the stage-1 (elem) jobs are split into.
    cate_snakegroup_num
        Number of snakemake files the stage-2 (cate) jobs are split into.
    """


    outdir = Path(output_path)
    outdir.mkdir(parents=True, exist_ok=True)
    # keep a copy of the grouping table next to the results for provenance
    shutil.copyfile(group_path, outdir/'GROUP.csv')

    df = pd.read_csv(group_path)
    # first column is the ALLC path regardless of its header name
    df = df.rename(columns={df.columns[0]:'_path'})
    sample_cates = df.columns[1:]

    # one "element" id per unique combination of all grouping columns
    df['_elem'] = pd.factorize(df[sample_cates].astype(str).apply('-'.join, axis=1))[0]
    # append the member count to the element id, e.g. "3_127"
    countdict = df['_elem'].value_counts().to_dict()
    df['_elem'] = df['_elem'].apply(lambda x: f'{x}_{countdict[x]}')

    # stage-1 table: _cate / _sample / list of ALLC paths to merge
    elem_grp_df = df.groupby('_elem')['_path'].apply(lambda x: x.unique()).to_frame()
    elem_grp_df.index.name = '_sample'
    elem_grp_df['_cate'] = '_elem'
    elem_grp_df = elem_grp_df.reset_index()[['_cate','_sample','_path']]

    # stage-2 inputs are the merged element ALLCs, not the raw cell ALLCs
    df = df[df.columns[1:]].drop_duplicates()

    df['_path'] = output_path+'/_elem/'+df['_elem']+'.allc.tsv.gz'

    # stage-2 table: one row per (category, group value) pair
    cate_grp_df = []
    for cate in sample_cates:
        catedf = df[[cate,'_path']].groupby(cate)['_path'].apply(lambda x: x.unique()).to_frame()
        catedf['_cate'] = cate
        catedf.index.name = '_sample'
        catedf = catedf.reset_index()
        cate_grp_df.append(catedf)
    cate_grp_df = pd.concat(cate_grp_df).reset_index(drop=True)[['_cate','_sample','_path']]


    def prepare_snakefiles(grp_df, output_path, tag, n_per_snake=None, template=MERGE_TEMPLATE):
        # Write one <sample>.pathlist per row plus n_per_snake snakemake
        # files; return the list of snakefile ids that were written.
        outdir = Path(output_path)
        snkdir = outdir/'snakefiles'
        snkdir.mkdir(exist_ok=True)

        for cate in grp_df['_cate'].unique():
            catedir = outdir/cate
            catedir.mkdir(exist_ok=True)

        for _,(cate,sample,paths) in grp_df.iterrows():
            catedir = outdir/cate
            with open(catedir/f'{sample}.pathlist','w') as f:
                f.write('\n'.join(paths))

        if n_per_snake is None:
            n_per_snake = len(grp_df)

        snk_ids = []
        # index % n_per_snake distributes the samples across the snakefiles
        for i, snkdf in grp_df.groupby(grp_df.index%n_per_snake):
            snk_id = f'{tag}_{i}'

            # single-path samples are copied instead of merged
            # (handled by the copy branch of the snakemake template)
            tocp_df = snkdf[snkdf['_path'].apply(len)==1]
            tomg_df = snkdf[snkdf['_path'].apply(len)>1]

            # parameters are prepended to the template as plain assignments
            with open(snkdir/f'{snk_id}.snakefile', 'w') as f:
                f.write(
                    f'''merge_allc_cpu = {n_cpu}
mcg_context = 'CGN'
chrom_size_path = '{chrom_size_path}'
merge_sample_prefixes = [{','.join("'"+tomg_df['_cate']+'/'+tomg_df['_sample']+"'")}]
copy_sample_prefixes = [{','.join("'"+tocp_df['_cate']+'/'+tocp_df['_sample']+"'")}]
group = "{snk_id}"
'''
                )
                f.write(template)
            snk_ids.append(snk_id)

        return snk_ids

    elem_snk_ids = prepare_snakefiles(elem_grp_df, output_path, 'elem',elem_snakegroup_num, template=MERGE_TEMPLATE)
    cate_snk_ids = prepare_snakefiles(cate_grp_df, output_path, 'cate',cate_snakegroup_num, template=MERGE_EXTRACT_TEMPLATE)

    def prepare_commands(snake_ids):
        # one shell command per snakemake file, run against the output dir
        cmds = [f'snakemake -d {outdir.resolve()} --snakefile {outdir.resolve()}/snakefiles/{snkid}.snakefile '
                f'-j {n_cpu} --default-resources mem_mb=100 --resources mem_mb=1000 --rerun-incomplete' \
                for snkid in snake_ids]
        return cmds



    # NOTE(review): the "_1" commands presumably must finish before the "_2"
    # commands are started, since stage 2 reads stage-1 outputs — confirm
    # with the execution workflow.
    with open(outdir/'run_snakemake_cmds_1.txt', 'w') as f:
        f.write('\n'.join(prepare_commands(elem_snk_ids)))
    with open(outdir/'run_snakemake_cmds_2.txt', 'w') as f:
        f.write('\n'.join(prepare_commands(cate_snk_ids)))
5 | # merge_sample_prefixes = '[]' 6 | # copy_sample_prefixes = '[]' 7 | # group = 'GROUP_NAME' 8 | sample_prefixes = merge_sample_prefixes + copy_sample_prefixes 9 | 10 | # the main rule is the final target 11 | rule main: 12 | input: 13 | expand("{sample}.allc.tsv.gz", sample=sample_prefixes), 14 | expand("{sample}.allc.tsv.gz.tbi", sample=sample_prefixes), 15 | # output: 16 | # f"{group}.finished" 17 | # shell: 18 | # "date > {output}" 19 | 20 | 21 | 22 | # Merge ALLC 23 | rule merge_allc: 24 | input: 25 | "{sample}.pathlist", 26 | output: 27 | allc="{sample}.allc.tsv.gz", 28 | tbi="{sample}.allc.tsv.gz.tbi" 29 | threads: 30 | max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1))) 31 | resources: 32 | mem_mb=merge_allc_cpu * 5000 33 | run: 34 | if wildcards.sample in merge_sample_prefixes: 35 | shell("allcools merge-allc " 36 | "--allc_paths {input} " 37 | "--output_path {output.allc} " 38 | "--chrom_size_path {chrom_size_path} " 39 | "--cpu {threads}") 40 | else: 41 | shell("cp $(cat {input}) {output.allc} ;" 42 | "cp $(cat {input}).tbi {output.tbi} ;") 43 | 44 | ''' 45 | 46 | MERGE_EXTRACT_TEMPLATE = ''' 47 | # Example (required) parameters 48 | # merge_allc_cpu = 10 49 | # mcg_context = 'CGN' 50 | # chrom_size_path = 'PATH_TO_CHROM_SIZE_FILE' 51 | # merge_sample_prefixes = '[]' 52 | # copy_sample_prefixes = '[]' 53 | # group = 'GROUP_NAME' 54 | sample_prefixes = merge_sample_prefixes + copy_sample_prefixes 55 | 56 | # the main rule is the final target 57 | rule main: 58 | input: 59 | expand("{sample}.{mcg_context}-Merge.allc.tsv.gz", sample=sample_prefixes, mcg_context=[mcg_context]), 60 | expand("{sample}.{mcg_context}-Merge.allc.tsv.gz.tbi", sample=sample_prefixes, mcg_context=[mcg_context]), 61 | # output: 62 | # f"{group}.finished" 63 | # shell: 64 | # "date > {output}" 65 | 66 | 67 | # Merge ALLC 68 | rule merge_allc: 69 | input: 70 | "{sample}.pathlist", 71 | output: 72 | allc="{sample}.allc.tsv.gz", 73 | tbi="{sample}.allc.tsv.gz.tbi" 74 | 
threads: 75 | max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1))) 76 | resources: 77 | mem_mb=merge_allc_cpu * 5000 78 | run: 79 | if wildcards.sample in merge_sample_prefixes: 80 | shell("allcools merge-allc " 81 | "--allc_paths {input} " 82 | "--output_path {output.allc} " 83 | "--chrom_size_path {chrom_size_path} " 84 | "--cpu {threads}") 85 | else: 86 | shell("cp $(cat {input}) {output.allc} ;" 87 | "cp $(cat {input}).tbi {output.tbi} ;") 88 | 89 | # Extract mCG ALLC for DMR calling 90 | rule extract_allc_mcg: 91 | input: 92 | "{sample}.allc.tsv.gz" 93 | output: 94 | allc_cg="{sample}.{mcg_context}-Merge.allc.tsv.gz", 95 | allc_cg_tbi="{sample}.{mcg_context}-Merge.allc.tsv.gz.tbi" 96 | threads: 97 | 1 98 | resources: 99 | mem_mb=100 100 | shell: 101 | "allcools extract-allc " 102 | "--allc_path {input} " 103 | "--output_prefix {wildcards.sample} " 104 | "--mc_contexts {mcg_context} " 105 | "--chrom_size_path {chrom_size_path} " 106 | "--strandness merge " 107 | "--output_format allc " 108 | "--cpu {threads}" 109 | ''' 110 | -------------------------------------------------------------------------------- /cemba_data/bulk/mct_bulk.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import pandas as pd 3 | import glob 4 | import subprocess 5 | from concurrent.futures import ProcessPoolExecutor, as_completed 6 | import os 7 | 8 | 9 | def merge_single_bam(bam_path, cell_id_to_cluster, output_prefix, header_dict): 10 | header = pysam.AlignmentHeader.from_dict(header_dict) 11 | clusters = set(cell_id_to_cluster.values()) 12 | cluster_read_counts = {c: 0 for c in clusters} 13 | 14 | # write reads by cluster 15 | with pysam.AlignmentFile(bam_path, "rb") as bam_file: 16 | # open BAM handles for each cluster 17 | cluster_handles = {} 18 | for cluster in clusters: 19 | cluster_handles[cluster] = pysam.AlignmentFile( 20 | f'{output_prefix}_{cluster}.bam', "wb", header=header) 21 | 22 | for read in bam_file: 
def merge_mct_cluster_bam(cell_id_to_cluster_path,
                          bam_list_path,
                          output_prefix,
                          cpu=10):
    """
    Merge per-cell reads from many BAM files into one BAM (+ index) per cluster.

    Stage 1 splits every input BAM into per-cluster chunk files in parallel
    (via ``merge_single_bam``); stage 2 merges each cluster's chunks with
    samtools and indexes the result. Chunk files are removed after a
    successful merge.

    Parameters
    ----------
    cell_id_to_cluster_path
        Two-column table without header: cell id -> cluster name.
    bam_list_path
        Single-column file listing the input BAM paths.
    output_prefix
        Prefix for intermediate chunk BAMs and the final
        ``{output_prefix}_{cluster}.bam`` outputs.
    cpu
        Number of worker processes for both stages.
    """
    # NOTE: read_csv(squeeze=True) was removed in pandas 2.0;
    # DataFrame.squeeze('columns') is the backward/forward-compatible form.
    cell_id_to_cluster = pd.read_csv(
        cell_id_to_cluster_path,
        index_col=0,
        header=None).squeeze('columns').to_dict()
    bam_paths = pd.read_csv(bam_list_path, header=None).squeeze('columns').tolist()

    # get header from the first BAM
    with pysam.AlignmentFile(bam_paths[0]) as bam:
        header_dict = bam.header.as_dict()
        # remove cell specific info
        keys_to_delete = ['PG', 'RG', 'CO']
        for k in keys_to_delete:
            if k in header_dict:
                del header_dict[k]

    clusters = set(cell_id_to_cluster.values())
    total_cluster_read_counts = {c: 0 for c in clusters}

    # stage 1: split each input BAM into per-cluster chunk files
    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        for i, path in enumerate(bam_paths):
            f = exe.submit(merge_single_bam,
                           bam_path=path,
                           cell_id_to_cluster=cell_id_to_cluster,
                           output_prefix=f'{output_prefix}{i:06d}',
                           header_dict=header_dict)
            futures[f] = path

        for f in as_completed(futures):
            cluster_read_counts = f.result()
            for k, v in cluster_read_counts.items():
                total_cluster_read_counts[k] += v

    # stage 2: merge the chunk files of each cluster
    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        for cluster in clusters:
            # Chunks are named {output_prefix}{i:06d}_{cluster}.bam. Match the
            # six-digit chunk id explicitly: the previous '*' pattern also
            # matched the final output {output_prefix}_{cluster}.bam ('*'
            # matches the empty string), so a rerun could merge the stale
            # merged file into itself and then delete it.
            chunk_pattern = f'{output_prefix}[0-9][0-9][0-9][0-9][0-9][0-9]_{cluster}.bam'
            chunk_paths = sorted(glob.glob(chunk_pattern))
            if len(chunk_paths) == 0:
                continue
            # pass the explicit chunk list instead of re-globbing in the shell
            merge_cmd = f'samtools merge --no-PG -c -o {output_prefix}_{cluster}.bam ' \
                        f'{" ".join(chunk_paths)} && ' \
                        f'samtools index {output_prefix}_{cluster}.bam'
            f = exe.submit(subprocess.run,
                           merge_cmd,
                           shell=True,
                           check=True)
            futures[f] = chunk_paths

        for f in as_completed(futures):
            chunk_paths = futures[f]
            f.result()
            # merge succeeded, remove the chunk files
            for path in chunk_paths:
                os.unlink(path)
    return
'L003', 'L004'} 30 | assert read_type in {'R1', 'R2'} 31 | assert plate1 != plate2 32 | except AssertionError: 33 | raise ValueError 34 | except ValueError: 35 | raise ValueError(f'Found unknown name pattern in path {path}') 36 | name_dict = dict(plate1=plate1, 37 | plate2=plate2, 38 | plate_pos=plate_pos, 39 | lane=lane, 40 | read_type=read_type, 41 | fastq_path=path, 42 | uid=f'{plate1}-{plate2}-{plate_pos}') 43 | name_series = pd.Series(name_dict) 44 | return name_series 45 | 46 | 47 | def _parse_v2_fastq_path(path): 48 | """ 49 | UID pattern of V2 {sample_id_prefix}-{plate}-{multiplex_group}-{barcode_name} 50 | FASTQ name pattern of V1: 51 | {sample_id_prefix}-{plate}-{multiplex_group}-{barcode_name}_{internal_info}_{lane}_{read_type}_{internal_info}.fastq.gz 52 | """ 53 | path = pathlib.Path(path) 54 | try: 55 | *_, plate, multiplex_group, multi_field = path.name.split('-') 56 | primer_name, _, lane, read_type, _ = multi_field.split('_') 57 | try: 58 | assert primer_name[0] in 'ABCDEFGHIJKLMNOP' 59 | assert int(primer_name[1:]) in list(range(1, 25)) 60 | assert int(multiplex_group) in list(range(1, 7)) 61 | assert lane in {'L001', 'L002', 'L003', 'L004'} 62 | assert read_type in {'R1', 'R2'} 63 | except AssertionError: 64 | raise ValueError 65 | except ValueError: 66 | raise ValueError(f'Found unknown name pattern in path {path}') 67 | name_dict = dict(plate=plate, 68 | multiplex_group=multiplex_group, 69 | primer_name=primer_name, 70 | lane=lane, 71 | read_type=read_type, 72 | fastq_path=path, 73 | uid=f'{plate}-{multiplex_group}-{primer_name}') 74 | name_series = pd.Series(name_dict) 75 | return name_series 76 | 77 | 78 | def make_fastq_dataframe(file_path, barcode_version, output_path=None): 79 | """ 80 | Generate fastq_dataframe for pipeline input. 81 | 82 | Parameters 83 | ---------- 84 | file_path 85 | Accept 1. path pattern contain wildcard, 2. path list, 3. path of one file contain all the paths. 
def make_fastq_dataframe(file_path, barcode_version, output_path=None):
    """
    Generate fastq_dataframe for pipeline input.

    Parameters
    ----------
    file_path
        Accept 1. path pattern contain wildcard, 2. path list, 3. path of one file contain all the paths.
    barcode_version
        Only accept two options: 1) V1 for 8 random index; 2) V2 for 384 random index.
    output_path
        output path of the fastq dataframe

    Returns
    -------
    fastq_dataframe for pipeline input, or None if no valid FASTQ name is found.

    Raises
    ------
    ValueError
        If barcode_version is not V1/V2, if a FASTQ name does not match the
        expected pattern, or if UIDs are not unique within a (lane, read_type)
        group.
    """
    barcode_version = barcode_version.upper()
    if barcode_version == 'V1':
        parser = _parse_v1_fastq_path
    elif barcode_version == 'V2':
        parser = _parse_v2_fastq_path
    else:
        raise ValueError(f'Primer Version can only be V1 or V2, got {barcode_version}.')

    # normalize file_path into a list of paths
    if isinstance(file_path, str) and ('*' in file_path):
        # wildcard pattern
        file_path = [str(pathlib.Path(p).absolute()) for p in glob.glob(file_path)]
    elif isinstance(file_path, list):
        pass
    else:
        # a plain-text file listing one FASTQ path per line
        with open(file_path) as f:
            file_path = [line.strip() for line in f]
    log.info(f'{len(file_path)} FASTQ file paths in input')

    fastq_df = pd.DataFrame([parser(path) for path in file_path])
    log.info(f'{fastq_df.shape[0]} valid fastq names.')
    if fastq_df.shape[0] == 0:
        log.info('No fastq name remained, check if the name pattern is correct.')
        return None

    # make sure UID is unique within each (lane, read_type) combination
    for (lane, read_type), df in fastq_df.groupby(['lane', 'read_type']):
        if df['uid'].duplicated().any():
            # name the offending group so the failure is actionable
            raise ValueError(
                f'UID column is not unique for lane {lane}, read_type {read_type}.')
    if output_path is not None:
        fastq_df.to_csv(output_path, index=False)
    return fastq_df
-------------------------------------------------------------------------------- 1 | from .TwoGroup import run_dss_two_group 2 | from .MultiGroup import run_dss_multi_group -------------------------------------------------------------------------------- /cemba_data/files/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/__init__.py -------------------------------------------------------------------------------- /cemba_data/files/default_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/default_config/__init__.py -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_4m.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 16 | ; 17 | [mode] 18 | mode = 4m 19 | 20 | 21 | [multiplexIndex] 22 | ; This section is for demultiplex step 23 | ; V1: 8 random index version 24 | ; V2: 384 random index version 25 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 26 | 27 | 28 | [fastqTrim] 29 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 30 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 31 | ; Universal illumina adapter 32 | 33 | overlap = 6 34 | ; least overlap of base and illumina adapter 35 | 36 | r1_left_cut = 10 37 | ; constant length to trim at 5 prime end, apply before quality trim. 
38 | ; Aim to cut random primer part, determined by random primer length. 39 | ; Random primer can impact results https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 40 | 41 | r1_right_cut = 10 42 | ; constant length to trim at 3 prime end, apply before quality trim. 43 | 44 | r2_left_cut = 10 45 | ; constant length to trim at 5 prime end, apply before quality trim. 46 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 47 | 48 | r2_right_cut = 10 49 | ; constant length to trim at 3 prime end, apply before quality trim. 50 | 51 | quality_threshold = 20 52 | ; reads quality score threshold for trimming. 53 | 54 | length_threshold = 30 55 | ; reads length threshold after all trim steps. 56 | 57 | total_read_pairs_min = 1 58 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 59 | 60 | total_read_pairs_max = 6000000 61 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 62 | 63 | 64 | [mapping reference] 65 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 66 | ; reference directory of bismark 67 | 68 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 69 | ; reference prefix for the HISAT-3N DNA mapping 70 | 71 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 72 | ; reference prefix for the HISAT-3N RNA mapping 73 | 74 | hisat3n_repeat_index_type = no-repeat 75 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 76 | ; if "no-repeat", will run hisat-3n in the normal mode. 77 | 78 | 79 | [readSplit] 80 | trim_on_both_end = 5 81 | ; whether trim the unmapped reads before split. 
82 | 83 | split_left_size = 40 84 | ; length of the left part of the split 85 | 86 | split_right_size = 40 87 | ; length of the right part of the split 88 | 89 | split_middle_min_size = 30 90 | ; minimum length of the middle part after the split, middle part shorter than this will not be used. 91 | 92 | split_min_read_length = 30 93 | ; minimum length of the read to perform split, read shorter than this will not be used. 94 | 95 | 96 | [star] 97 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR 98 | ; reference directory of STAR 99 | 100 | 101 | [bamFilter] 102 | mapq_threshold = 10 103 | ; reads MAPQ threshold 104 | 105 | 106 | [DNAReadsFilter] 107 | mc_rate_max_threshold = 0.5 108 | ; if read CH ratio >= mc_rate_max_threshold, skip this read 109 | 110 | dna_cov_min_threshold = 3 111 | ; if read CH sites <= cov_min_threshold, skip this read 112 | 113 | 114 | [RNAReadsFilter] 115 | mc_rate_min_threshold = 0.9 116 | ; if read CH ratio <= mc_rate_min_threshold, skip this read 117 | 118 | rna_cov_min_threshold = 3 119 | ; if read CH sites <= cov_min_threshold, skip this read 120 | 121 | nome_flag_str = --nome 122 | ; if '--nome', will exclude GpC sites from the read-level methylation fraction calculation 123 | 124 | 125 | [callMethylation] 126 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 127 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 128 | 129 | num_upstr_bases = 1 130 | ; number of base to include before mC 131 | ; change this to 1 for NOMe treatment to get GpCNN 132 | 133 | num_downstr_bases = 2 134 | ; number of base to include after mC 135 | 136 | compress_level = 5 137 | ; ALLC file compress level 138 | 139 | mc_stat_feature = HCHN HCYN HCGN HCCC GCYN GCHN 140 | ; mC patterns to check when calculate ALLC summary 141 | 142 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH 143 | ; alias for the above mC patterns in the summary table 144 | 145 | 146 | [featureCount] 147 | gtf_path = 
CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF 148 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference. 149 | 150 | feature_type = gene 151 | ; type of feature to count, pass to featureCount -t parameter 152 | 153 | id_type = gene_id 154 | ; type of feature id to use in the output file, pass to featureCount -g parameter 155 | 156 | 157 | [contact] 158 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 159 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 160 | ; chrom size file has two tab-separated columns and not header 161 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 162 | 163 | min_gap = 2500 164 | ; minimum gap distance for a read pair being considered as cis-long -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_m3c.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 16 | ; 17 | 18 | [mode] 19 | mode = m3c 20 | 21 | 22 | [multiplexIndex] 23 | ; This section is for demultiplex step 24 | ; V1: 8 random index version 25 | ; V2: 384 random index version 26 | ; put V1 or V2 here 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 28 | 29 | 30 | [fastqTrim] 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 33 | ; Universal illumina adapter 34 | 35 | overlap = 6 36 | ; least overlap of base and illumina adapter 37 | 38 | r1_left_cut = 10 39 | ; constant length to trim at 5 prime end, apply before quality trim. 
40 | ; Aim to cut random primer part, determined by random primer length. 41 | ; Random primer can impact results, see bellow: 42 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 43 | 44 | r1_right_cut = 10 45 | ; constant length to trim at 3 prime end, apply before quality trim. 46 | 47 | r2_left_cut = 10 48 | ; constant length to trim at 5 prime end, apply before quality trim. 49 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 50 | 51 | r2_right_cut = 10 52 | ; constant length to trim at 3 prime end, apply before quality trim. 53 | 54 | quality_threshold = 20 55 | ; reads quality score threshold for trimming. 56 | 57 | length_threshold = 30 58 | ; reads length threshold after all trim steps. 59 | 60 | total_read_pairs_min = 1 61 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 62 | 63 | total_read_pairs_max = 6000000 64 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 65 | 66 | 67 | [mapping reference] 68 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 69 | ; reference directory of bismark 70 | 71 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 72 | ; reference prefix for the HISAT-3N DNA mapping 73 | 74 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 75 | ; reference prefix for the HISAT-3N RNA mapping 76 | 77 | hisat3n_repeat_index_type = no-repeat 78 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 79 | ; if "no-repeat", will run hisat-3n in the normal mode. 80 | 81 | 82 | [readSplit] 83 | trim_on_both_end = 5 84 | ; whether trim the unmapped reads before split. 
85 | 86 | split_left_size = 40 87 | ; length of the left part of the split 88 | 89 | split_right_size = 40 90 | ; length of the right part of the split 91 | 92 | split_middle_min_size = 30 93 | ; minimum length of the middle part after the split, middle part shorter than this will not be used. 94 | 95 | split_min_read_length = 30 96 | ; minimum length of the read to perform split, read shorter than this will not be used. 97 | 98 | 99 | [bamFilter] 100 | mapq_threshold = 10 101 | ; reads MAPQ threshold 102 | 103 | 104 | [callMethylation] 105 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 106 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 107 | 108 | num_upstr_bases = 0 109 | ; number of base to include before mC, use 0 for normal snmC, use 1 for NOMe treatment 110 | 111 | num_downstr_bases = 2 112 | ; number of base to include after mC 113 | 114 | compress_level = 5 115 | ; ALLC file compress level 116 | 117 | mc_stat_feature = CHN CGN CCC 118 | ; this is based on the num_upstr_bases and num_downstr_bases 119 | ; mC patterns to check when calculate ALLC summary, separated by space 120 | 121 | mc_stat_alias = mCH mCG mCCC 122 | ; alias for the above mC patterns in the summary table, 123 | ; separated by space and follow the same order as mc_stat_feature 124 | 125 | 126 | [contact] 127 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 128 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 129 | ; chrom size file has two tab-separated columns and not header 130 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 
131 | 132 | min_gap = 2500 133 | ; minimum gap distance for a read pair being considered as cis-long -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_mc.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 16 | ; 17 | 18 | [mode] 19 | mode = mc 20 | 21 | 22 | [multiplexIndex] 23 | ; This section is for demultiplex step 24 | ; V1: 8 random index version 25 | ; V2: 384 random index version 26 | ; put V1 or V2 here 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 28 | 29 | 30 | [fastqTrim] 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 33 | ; Universal illumina adapter 34 | 35 | overlap = 6 36 | ; least overlap of base and illumina adapter 37 | 38 | r1_left_cut = 10 39 | ; constant length to trim at 5 prime end, apply before quality trim. 40 | ; Aim to cut random primer part, determined by random primer length. 41 | ; Random primer can impact results, see bellow: 42 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 43 | 44 | r1_right_cut = 10 45 | ; constant length to trim at 3 prime end, apply before quality trim. 46 | 47 | r2_left_cut = 10 48 | ; constant length to trim at 5 prime end, apply before quality trim. 49 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 50 | 51 | r2_right_cut = 10 52 | ; constant length to trim at 3 prime end, apply before quality trim. 53 | 54 | quality_threshold = 20 55 | ; reads quality score threshold for trimming. 
56 | 57 | length_threshold = 30 58 | ; reads length threshold after all trim steps. 59 | 60 | total_read_pairs_min = 1 61 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 62 | 63 | total_read_pairs_max = 6000000 64 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 65 | 66 | 67 | [mapping reference] 68 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 69 | ; reference directory of bismark 70 | 71 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 72 | ; reference prefix for the HISAT-3N DNA mapping 73 | 74 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 75 | ; reference prefix for the HISAT-3N RNA mapping 76 | 77 | hisat3n_repeat_index_type = no-repeat 78 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 79 | ; if "no-repeat", will run hisat-3n in the normal mode. 80 | 81 | unmapped_fastq = False 82 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose. 
83 | 84 | [bamFilter] 85 | mapq_threshold = 10 86 | ; reads MAPQ threshold 87 | 88 | 89 | [callMethylation] 90 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 91 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 92 | 93 | num_upstr_bases = 0 94 | ; number of base to include before mC, use 0 for normal snmC, use 1 for NOMe treatment 95 | 96 | num_downstr_bases = 2 97 | ; number of base to include after mC 98 | 99 | compress_level = 5 100 | ; ALLC file compress level 101 | 102 | mc_stat_feature = CHN CGN CCC 103 | ; this is based on the num_upstr_bases and num_downstr_bases 104 | ; mC patterns to check when calculate ALLC summary, separated by space 105 | 106 | mc_stat_alias = mCH mCG mCCC 107 | ; alias for the above mC patterns in the summary table, 108 | ; separated by space and follow the same order as mc_stat_feature 109 | 110 | [allcPostprocessing] 111 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 112 | ; This file is needed when extract mCG sites from ALLC file. 113 | ; The UCSC chrom sizes file contain two tab separated columns 114 | ; the 1st column is the names of chromosomes, the names should be the same as your reference_fasta 115 | ; the 2nd column is the length of chromosomes. 116 | -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_mct-nome.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 
16 | ; 17 | [mode] 18 | # for mCAT, we still using mCT mode for simplicity, 19 | # the two differences specifically changed in this file for NOMe treatment are: 20 | # 1. [callMethylation] num_upstr_bases = 1 21 | # 2. [callMethylation] mc_stat_feature and mc_stat_alias changed 22 | mode = mct 23 | 24 | 25 | [multiplexIndex] 26 | ; This section is for demultiplex step 27 | ; V1: 8 random index version 28 | ; V2: 384 random index version 29 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 30 | 31 | 32 | [fastqTrim] 33 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 34 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 35 | ; Universal illumina adapter 36 | 37 | overlap = 6 38 | ; least overlap of base and illumina adapter 39 | 40 | r1_left_cut = 10 41 | ; constant length to trim at 5 prime end, apply before quality trim. 42 | ; Aim to cut random primer part, determined by random primer length. 43 | ; Random primer can impact results, see bellow 44 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 45 | 46 | r1_right_cut = 10 47 | ; constant length to trim at 3 prime end, apply before quality trim. 48 | 49 | r2_left_cut = 10 50 | ; constant length to trim at 5 prime end, apply before quality trim. 51 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 52 | 53 | r2_right_cut = 10 54 | ; constant length to trim at 3 prime end, apply before quality trim. 55 | 56 | quality_threshold = 20 57 | ; reads quality score threshold for trimming. 58 | 59 | length_threshold = 30 60 | ; reads length threshold after all trim steps. 61 | 62 | total_read_pairs_min = 1 63 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 64 | 65 | total_read_pairs_max = 6000000 66 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 
67 | 68 | 69 | [mapping reference] 70 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 71 | ; reference directory of bismark 72 | 73 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 74 | ; reference prefix for the HISAT-3N DNA mapping 75 | 76 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 77 | ; reference prefix for the HISAT-3N RNA mapping 78 | 79 | hisat3n_repeat_index_type = no-repeat 80 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 81 | ; if "no-repeat", will run hisat-3n in the normal mode. 82 | 83 | unmapped_fastq = False 84 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose. 85 | 86 | 87 | [star] 88 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR 89 | ; reference directory of STAR 90 | 91 | 92 | [bamFilter] 93 | mapq_threshold = 10 94 | ; reads MAPQ threshold 95 | 96 | 97 | [DNAReadsFilter] 98 | mc_rate_max_threshold = 0.5 99 | ; if read CH ratio >= mc_rate_max_threshold, skip this read 100 | 101 | dna_cov_min_threshold = 3 102 | ; if read CH sites <= cov_min_threshold, skip this read 103 | 104 | [RNAReadsFilter] 105 | mc_rate_min_threshold = 0.9 106 | ; if read CH ratio <= mc_rate_min_threshold, skip this read 107 | 108 | rna_cov_min_threshold = 3 109 | ; if read CH sites <= cov_min_threshold, skip this read 110 | 111 | nome_flag_str = 112 | ; if '--nome', will exclude GpC sites from the read-level methylation fraction calculation 113 | 114 | 115 | [callMethylation] 116 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 117 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 118 | 119 | num_upstr_bases = 1 120 | ; number of base to include before mC 121 | ; change this to 1 for NOMe treatment to get GpCNN 122 | 123 | num_downstr_bases = 2 124 | ; number of base to include after mC 125 | 126 | compress_level = 5 127 | ; ALLC file compress level 128 | 129 | mc_stat_feature = HCHN 
HCYN HCGN HCCC GCYN GCHN 130 | ; mC patterns to check when calculate ALLC summary 131 | 132 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH 133 | ; alias for the above mC patterns in the summary table 134 | 135 | [featureCount] 136 | gtf_path = CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF 137 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference. 138 | 139 | feature_type = gene 140 | ; type of feature to count, pass to featureCount -t parameter 141 | 142 | id_type = gene_id 143 | ; type of feature id to use in the output file, pass to featureCount -g parameter 144 | 145 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 146 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 147 | ; chrom size file has two tab-separated columns and not header 148 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 149 | -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_mct.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 
16 | ; 17 | 18 | [mode] 19 | mode = mct 20 | 21 | 22 | [multiplexIndex] 23 | ; This section is for demultiplex step 24 | ; V1: 8 random index version 25 | ; V2: 384 random index version 26 | ; put V1 or V2 here 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 28 | 29 | 30 | [fastqTrim] 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 33 | ; Universal illumina adapter 34 | 35 | overlap = 6 36 | ; least overlap of base and illumina adapter 37 | 38 | r1_left_cut = 10 39 | ; constant length to trim at 5 prime end, apply before quality trim. 40 | ; Aim to cut random primer part, determined by random primer length. 41 | ; Random primer can impact results https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 42 | 43 | r1_right_cut = 10 44 | ; constant length to trim at 3 prime end, apply before quality trim. 45 | 46 | r2_left_cut = 10 47 | ; constant length to trim at 5 prime end, apply before quality trim. 48 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 49 | 50 | r2_right_cut = 10 51 | ; constant length to trim at 3 prime end, apply before quality trim. 52 | 53 | quality_threshold = 20 54 | ; reads quality score threshold for trimming. 55 | 56 | length_threshold = 30 57 | ; reads length threshold after all trim steps. 58 | 59 | total_read_pairs_min = 1 60 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 61 | 62 | total_read_pairs_max = 6000000 63 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 
64 | 65 | 66 | [mapping reference] 67 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 68 | ; reference directory of bismark 69 | 70 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 71 | ; reference prefix for the HISAT-3N DNA mapping 72 | 73 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 74 | ; reference prefix for the HISAT-3N RNA mapping 75 | 76 | hisat3n_repeat_index_type = no-repeat 77 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 78 | ; if "no-repeat", will run hisat-3n in the normal mode. 79 | 80 | 81 | unmapped_fastq = False 82 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose. 83 | 84 | 85 | [star] 86 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR 87 | ; reference directory of STAR 88 | 89 | 90 | [bamFilter] 91 | mapq_threshold = 10 92 | ; reads MAPQ threshold 93 | 94 | 95 | [DNAReadsFilter] 96 | mc_rate_max_threshold = 0.5 97 | ; if read CH ratio >= mc_rate_max_threshold, skip this read 98 | 99 | dna_cov_min_threshold = 3 100 | ; if read CH sites <= cov_min_threshold, skip this read 101 | 102 | 103 | [RNAReadsFilter] 104 | mc_rate_min_threshold = 0.9 105 | ; if read CH ratio <= mc_rate_min_threshold, skip this read 106 | 107 | rna_cov_min_threshold = 3 108 | ; if read CH sites <= cov_min_threshold, skip this read 109 | 110 | nome_flag_str = 111 | 112 | [callMethylation] 113 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 114 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 115 | 116 | num_upstr_bases = 0 117 | ; number of base to include before mC 118 | 119 | num_downstr_bases = 2 120 | ; number of base to include after mC 121 | 122 | compress_level = 5 123 | ; ALLC file compress level 124 | 125 | mc_stat_feature = CHN CGN CCC 126 | ; mC patterns to check when calculate ALLC summary 127 | 128 | mc_stat_alias = mCH mCG mCCC 129 | ; alias for the above mC patterns in the 
summary table 130 | 131 | [featureCount] 132 | gtf_path = CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF 133 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference. 134 | 135 | feature_type = gene 136 | ; type of feature to count, pass to featureCount -t parameter 137 | 138 | id_type = gene_id 139 | ; type of feature id to use in the output file, pass to featureCount -g parameter 140 | 141 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 142 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 143 | ; chrom size file has two tab-separated columns and not header 144 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 145 | -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_nome.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 16 | ; 17 | [mode] 18 | # for NOMe treated snmC, we still using mc mode for simplicity, 19 | # the two differences specifically changed in this file for NOMe treatment are: 20 | # 1. [callMethylation] num_upstr_bases = 1 21 | # 2. 
[callMethylation] mc_stat_feature and mc_stat_alias changed 22 | mode = mc 23 | 24 | 25 | [multiplexIndex] 26 | ; This section is for demultiplex step 27 | ; V1: 8 random index version 28 | ; V2: 384 random index version 29 | ; put V1 or V2 here 30 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 31 | 32 | 33 | [fastqTrim] 34 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 35 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 36 | ; Universal illumina adapter 37 | 38 | overlap = 6 39 | ; least overlap of base and illumina adapter 40 | 41 | r1_left_cut = 10 42 | ; constant length to trim at 5 prime end, apply before quality trim. 43 | ; Aim to cut random primer part, determined by random primer length. 44 | ; Random primer can impact results, see bellow: 45 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 46 | 47 | r1_right_cut = 10 48 | ; constant length to trim at 3 prime end, apply before quality trim. 49 | 50 | r2_left_cut = 10 51 | ; constant length to trim at 5 prime end, apply before quality trim. 52 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 53 | 54 | r2_right_cut = 10 55 | ; constant length to trim at 3 prime end, apply before quality trim. 56 | 57 | quality_threshold = 20 58 | ; reads quality score threshold for trimming. 59 | 60 | length_threshold = 30 61 | ; reads length threshold after all trim steps. 62 | 63 | total_read_pairs_min = 1 64 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 65 | 66 | total_read_pairs_max = 6000000 67 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 
68 | 69 | [mapping reference] 70 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 71 | ; reference directory of bismark 72 | 73 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 74 | ; reference prefix for the HISAT-3N DNA mapping 75 | 76 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 77 | ; reference prefix for the HISAT-3N RNA mapping 78 | 79 | hisat3n_repeat_index_type = no-repeat 80 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 81 | ; if "no-repeat", will run hisat-3n in the normal mode. 82 | 83 | unmapped_fastq = False 84 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose. 85 | 86 | [bamFilter] 87 | mapq_threshold = 10 88 | ; reads MAPQ threshold 89 | 90 | 91 | [callMethylation] 92 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 93 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 94 | 95 | num_upstr_bases = 1 96 | ; number of base to include before mC 97 | 98 | num_downstr_bases = 2 99 | ; number of base to include after mC 100 | 101 | compress_level = 5 102 | ; ALLC file compress level 103 | 104 | mc_stat_feature = HCHN HCYN HCGN HCCC GCYN GCHN 105 | ; mC patterns to check when calculate ALLC summary 106 | 107 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH 108 | ; alias for the above mC patterns in the summary table 109 | 110 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 111 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 112 | ; chrom size file has two tab-separated columns and not header 113 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 
114 | -------------------------------------------------------------------------------- /cemba_data/files/mapping_summary_template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/mapping_summary_template/__init__.py -------------------------------------------------------------------------------- /cemba_data/files/plate_info_template_v1.txt: -------------------------------------------------------------------------------- 1 | # .__ 2 | # ___________ _____ ______ | | ____ 3 | # / ___/\__ \ / \\____ \| | _/ __ \ 4 | # \___ \ / __ \| Y Y \ |_> > |_\ ___/ 5 | # /____ >(____ /__|_| / __/|____/\___ > 6 | # \/ \/ \/|__| \/ 7 | # .__ __ ._. 8 | # _____| |__ ____ _____/ |_| | 9 | # / ___/ | \_/ __ \_/ __ \ __\ | 10 | # \___ \| Y \ ___/\ ___/| | \| 11 | # /____ >___| /\___ >\___ >__| __ 12 | # \/ \/ \/ \/ \/ 13 | # 14 | # ____ ________ 15 | # \ \ / /_ | 16 | # \ Y / | | 17 | # \ / | | 18 | # \___/ |___| 19 | # 20 | # 21 | # PlateInfo template of single cell sequencing demultiplex 22 | # 23 | # This file template contain 3 sections. 24 | # 25 | # [CriticalInfo] 26 | # [LibraryInfo] 27 | # [PlateInfo] 28 | # 29 | # The final sample id will be values of each part concatenated by "-" in the following order 30 | # [Values in LibraryInfo] + [Additional values in PlateInfo] + [Sample UID determined by library strategy] 31 | # 32 | # Empty lines and line start with "#" will be ignored. You can remove these if you understand the template. 33 | # 34 | 35 | 36 | # ===================================================================================================== 37 | 38 | [CriticalInfo] 39 | 40 | # ===================================================================================================== 41 | 42 | # Explain: 43 | # Every key=value pairs are required. key name can not be change. 
44 | # Some values have limited options, they are: 45 | # n_random_index choice: 8 (V1), if your n_random_index=384, use V2 template! 46 | # input_plate_size choice: 384 47 | # 48 | # Example: 49 | # n_random_index=8 50 | # input_plate_size=384 51 | # pool_id=Pool_NN 52 | # tube_label=Pool_NN_MM_AA_BB # often times 2 libraries are pooled together on Nova-Seq, but there is no rule on this. 53 | # email=your-email@salk.edu 54 | # 55 | 56 | # if your n_random_index=384, use V2 template! 57 | n_random_index=8 58 | input_plate_size=384 59 | pool_id= 60 | tube_label= 61 | email= 62 | 63 | 64 | # ===================================================================================================== 65 | 66 | [LibraryInfo] 67 | 68 | # ===================================================================================================== 69 | # 70 | # Explain: 71 | # library metadata that applies to all plates 72 | # this whole part is optional, may contain any "key=value" pairs necessary to describe the library. 73 | # All the values will be concatenated by "-" into the sample id and present in file name. Use UNIX path safe characters. 74 | # Any character that does not belong to [a-zA-Z0-9] will be replaced by "_" 75 | # Here are the recommended information to include, you can define your own based on your needs, 76 | # none of this information is actually used in demultiplex or mapping: 77 | # these keys are ALL optional, but better be consistent throughout the project. 
78 | # 79 | # Example: 80 | # lib_comp_date=180101 81 | # project=CEMBA 82 | # organism=mm 83 | # dev_stage_age=P56 84 | # tissue_cell_type=1A 85 | # exp_cond=1 86 | # bio_rep=1 87 | # tech_rep=1 88 | # lib_type=snmC-seq2 89 | # sequencer=NovaSeq 90 | # se_pe=pe 91 | # read_length=150 92 | # 93 | 94 | 95 | 96 | 97 | 98 | # ===================================================================================================== 99 | 100 | [PlateInfo] 101 | 102 | # ===================================================================================================== 103 | 104 | # Explain: 105 | # Plate metadata that specific to certain plates, a tab separated table 106 | # First row must be header start with: plate_id primer_quarter 107 | # First 2 columns are required and must be in the order of: plate_id primer_quarter 108 | # You can add more plate specific info into additional columns, those info will be appended to LibraryInfo as part of sample_id. 109 | # All the values will be concatenate by "-" into the sample id and present in file name. 110 | # So better not to include "-" in value and use UNIX path safe characters. 111 | # 112 | # If your experiment design contain sup-plate difference (e.g. some rows come from 1 sample, some rows come from another), 113 | # you should maintain your own metadata about this and added into the mapping summary table later after mapping by yourself 114 | # Because here the plate info is just for barcode demultiplexing, so that we can get single cell data AND the plate position of each cell 115 | # with the plate position, it should be very convenient for you to add any custom information you designed in your experiment. 
116 | # 117 | # primer_quarter valid values are: 118 | # Set1_Q1, Set1_Q2, Set1_Q3, Set1_Q4 119 | # SetB_Q1, SetB_Q2, SetB_Q3, SetB_Q4 120 | # 121 | # Example: 122 | # plate_id primer_quarter 123 | # CEMBA190530_9C_1 SetB_Q1 124 | # CEMBA190530_9C_2 SetB_Q1 125 | # CEMBA190530_9C_3 SetB_Q2 126 | # CEMBA190530_9C_4 SetB_Q2 127 | # CEMBA190620_9C_1 SetB_Q3 128 | # CEMBA190620_9C_2 SetB_Q3 129 | # CEMBA190620_9C_3 SetB_Q4 130 | # CEMBA190620_9C_4 SetB_Q4 131 | # 132 | # Remember the columns MUST be separate by tab not space 133 | # 134 | 135 | 136 | # ===================================================================================================== 137 | # if your n_random_index=384, use V2 template! 138 | # ===================================================================================================== 139 | 140 | plate_id primer_quarter 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /cemba_data/files/plate_info_template_v2.txt: -------------------------------------------------------------------------------- 1 | # .__ 2 | # ___________ _____ ______ | | ____ 3 | # / ___/\__ \ / \\____ \| | _/ __ \ 4 | # \___ \ / __ \| Y Y \ |_> > |_\ ___/ 5 | # /____ >(____ /__|_| / __/|____/\___ > 6 | # \/ \/ \/|__| \/ 7 | # .__ __ ._. 8 | # _____| |__ ____ _____/ |_| | 9 | # / ___/ | \_/ __ \_/ __ \ __\ | 10 | # \___ \| Y \ ___/\ ___/| | \| 11 | # /____ >___| /\___ >\___ >__| __ 12 | # \/ \/ \/ \/ \/ 13 | # 14 | # ____ ____________ 15 | # \ \ / /\_____ \ 16 | # \ Y / / ____/ 17 | # \ / / \ 18 | # \___/ \_______ \ 19 | # \/ 20 | # 21 | # PlateInfo template of single cell sequencing demultiplex 22 | # 23 | # This file template contain 3 sections. 
24 | # 25 | # [CriticalInfo] 26 | # [LibraryInfo] 27 | # [PlateInfo] 28 | # 29 | # The final sample id will be values of each part concatenated by "-" in the following order 30 | # [Values in LibraryInfo] + [Additional values in PlateInfo] + [Sample UID determined by library strategy] 31 | # 32 | # Empty lines and line start with "#" will be ignored. You can remove these if you understand the template. 33 | # 34 | 35 | 36 | # ===================================================================================================== 37 | 38 | [CriticalInfo] 39 | 40 | # ===================================================================================================== 41 | 42 | # Explain: 43 | # Every key=value pairs are required. key name can not be change. 44 | # Some values have limited options, they are: 45 | # n_random_index choice: 384 (V2), if your n_random_index=8, use V1 template! 46 | # input_plate_size choice: 384 47 | # 48 | # 49 | # Example: 50 | # n_random_index=8 51 | # input_plate_size=384 52 | # pool_id=Pool_73 53 | # tube_label=Pool_72_73_9A_10C # often times 2 library are pooled together on Nova-Seq 54 | # email=your-email@salk.edu 55 | # 56 | 57 | # if your n_random_index=8, use V1 template! 58 | n_random_index=384 59 | input_plate_size=384 60 | pool_id= 61 | tube_label= 62 | email= 63 | 64 | 65 | # ===================================================================================================== 66 | 67 | [LibraryInfo] 68 | 69 | # ===================================================================================================== 70 | # 71 | # Explain: 72 | # library metadata that applies to all plates 73 | # this whole part is optional, may contain any "key=value" pairs necessary to describe the library. 74 | # All the values will be concatenate by "-" into the sample id and present in file name. Use UNIX path safe characters. 
75 | # Any character that does not belong to [a-zA-Z0-9] will be replaced by "_" 76 | # Here are the recommended information to include, you can define your own based on your needs, 77 | # none of this information is actually used in demultiplex or mapping: 78 | # these keys are ALL optional, but better be consistent throughout the project. 79 | # 80 | # Example: 81 | # lib_comp_date=180101 82 | # project=CEMBA 83 | # organism=mm 84 | # dev_stage_age=P56 85 | # tissue_cell_type=1A 86 | # exp_cond=1 87 | # bio_rep=1 88 | # tech_rep=1 89 | # lib_type=snmC-seq2 90 | # sequencer=NovaSeq 91 | # se_pe=pe 92 | # read_length=150 93 | # 94 | # 95 | 96 | 97 | 98 | 99 | 100 | # ===================================================================================================== 101 | 102 | [PlateInfo] 103 | 104 | # ===================================================================================================== 105 | 106 | # Explain: 107 | # Plate metadata that is specific to certain plates, a tab separated table 108 | # First row must be header start with: plate_id multiplex_group primer_name 109 | # First 3 columns are required and must be in the order of: plate_id multiplex_group primer_name 110 | # You can add more plate specific info into additional columns, those info will be appended to LibraryInfo as part of sample_id. 111 | # All the values will be concatenated by "-" into the sample id and present in file name. 112 | # So better not to include "-" in value and use UNIX path safe characters. 113 | # 114 | # If your experiment design contains sub-plate difference (e.g. 
some rows come from 1 sample, some rows come from another), 115 | # you should maintain your own metadata about this and added into the mapping summary table later after mapping by yourself 116 | # Because here the plate info is just for barcode demultiplexing, so that we can get single cell data AND the plate position of each cell 117 | # with the plate position, it should be very convenient for you to add any custom information you designed in your experiment. 118 | # 119 | # primer_name valid values are: 120 | # [A-P][1-24] 121 | # 122 | # Example: 123 | # plate_id multiplex_group primer_name 124 | # Plate_1 1 B1 125 | # Plate_1 2 B3 126 | # Plate_1 3 B5 127 | # Plate_1 4 B7 128 | # Plate_1 5 B9 129 | # Plate_1 6 B11 130 | # 131 | # Remember the columns MUST be separate by tab, not space or comma 132 | # 133 | 134 | 135 | # ===================================================================================================== 136 | # if your n_random_index=8, use V1 template! 137 | # ===================================================================================================== 138 | 139 | plate_id multiplex_group primer_name 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v1.fa: -------------------------------------------------------------------------------- 1 | >AD001 2 | ^ATCACG 3 | >AD002 4 | ^CGATGT 5 | >AD004 6 | ^TGACCA 7 | >AD006 8 | ^GCCAAT 9 | >AD007 10 | ^CAGATC 11 | >AD008 12 | ^ACTTGA 13 | >AD010 14 | ^TAGCTT 15 | >AD012 16 | ^CTTGTA 17 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/random_index_v2/__init__.py -------------------------------------------------------------------------------- 
/cemba_data/files/random_index_v2/random_index_v2.multiplex_group_1.fa: -------------------------------------------------------------------------------- 1 | >A1 2 | ^ACGATCAG 3 | >A13 4 | ^ATCATGCG 5 | >C1 6 | ^GTAGCGTA 7 | >C13 8 | ^GTCCTAAG 9 | >E1 10 | ^GATCAAGG 11 | >E13 12 | ^TACCGGAT 13 | >G1 14 | ^CAGTCACA 15 | >G13 16 | ^ACCTCAGT 17 | >I1 18 | ^TACTGCTC 19 | >I13 20 | ^GTGGTATG 21 | >K1 22 | ^AGCTACCA 23 | >K13 24 | ^CAGACGTT 25 | >M1 26 | ^AGGTCAAC 27 | >M13 28 | ^CAATCAGG 29 | >O1 30 | ^AACAGGTG 31 | >O13 32 | ^CTACAAGG 33 | >A2 34 | ^TGATAGGC 35 | >A14 36 | ^ACAACGTG 37 | >C2 38 | ^CAGGTAAG 39 | >C14 40 | ^AATTCCGG 41 | >E2 42 | ^ACAAGCTC 43 | >E14 44 | ^GTGATCCA 45 | >G2 46 | ^AACCGTGT 47 | >G14 48 | ^GTCCTTGA 49 | >I2 50 | ^ATTCCGCT 51 | >I14 52 | ^ACTGCGAA 53 | >K2 54 | ^CACGCAAT 55 | >K14 56 | ^AAGCGACT 57 | >M2 58 | ^AGAAGGAC 59 | >M14 60 | ^CGAATACG 61 | >O2 62 | ^AGCAGACA 63 | >O14 64 | ^GCCTTAAC 65 | >B1 66 | ^GAACGAAG 67 | >B13 68 | ^GACTACGA 69 | >D1 70 | ^ATACGCAG 71 | >D13 72 | ^CCTGTCAA 73 | >F1 74 | ^GTTGCTGT 75 | >F13 76 | ^CGAATTGC 77 | >H1 78 | ^CCAAGGTT 79 | >H13 80 | ^TCTACGCA 81 | >J1 82 | ^TGCACTTG 83 | >J13 84 | ^AGAGCAGA 85 | >L1 86 | ^GATGCTAC 87 | >L13 88 | ^CGACCTAA 89 | >N1 90 | ^TCAGCCTT 91 | >N13 92 | ^CCGTTATG 93 | >P1 94 | ^TGACCGTT 95 | >P13 96 | ^AGCTAAGC 97 | >B2 98 | ^AGGCAATG 99 | >B14 100 | ^ACGCTTCT 101 | >D2 102 | ^GCGTTAGA 103 | >D14 104 | ^TCAATCCG 105 | >F2 106 | ^CTAGGTTG 107 | >F14 108 | ^GCATAGTC 109 | >H2 110 | ^CTCGGTAA 111 | >H14 112 | ^CAACTTGG 113 | >J2 114 | ^CCTAAGTC 115 | >J14 116 | ^TTCCTCCT 117 | >L2 118 | ^AAGCGTTC 119 | >L14 120 | ^CTTAGGAC 121 | >N2 122 | ^CAACTGAC 123 | >N14 124 | ^CTCACCAA 125 | >P2 126 | ^CTCTATCG 127 | >P14 128 | ^CGCAATGT 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_2.fa: -------------------------------------------------------------------------------- 1 | >A3 2 | 
^TCGAGAGT 3 | >A15 4 | ^TGTTCCGT 5 | >C3 6 | ^AGAGTCCA 7 | >C15 8 | ^TATGGCAC 9 | >E3 10 | ^TCTTCGAC 11 | >E15 12 | ^TTGCAACG 13 | >G3 14 | ^TCGATGAC 15 | >G15 16 | ^CGTCTTCA 17 | >I3 18 | ^GACGAACT 19 | >I15 20 | ^CCAACTTC 21 | >K3 22 | ^AGATTGCG 23 | >K15 24 | ^CTGAACGT 25 | >M3 26 | ^TACACACG 27 | >M15 28 | ^TCGTGCAT 29 | >O3 30 | ^AGTCGAAG 31 | >O15 32 | ^CGATGTTC 33 | >A4 34 | ^CATCCAAG 35 | >A16 36 | ^TGCTGTGA 37 | >C4 38 | ^GTATCGAG 39 | >C16 40 | ^TCTAGGAG 41 | >E4 42 | ^GAACCTTC 43 | >E16 44 | ^ACTGGTGT 45 | >G4 46 | ^CGCGTATT 47 | >G16 48 | ^CAGGTTCA 49 | >I4 50 | ^AAGCTCAC 51 | >I16 52 | ^TCTGTCGT 53 | >K4 54 | ^AGCTTCAG 55 | >K16 56 | ^CCTACCTA 57 | >M4 58 | ^GCGTATCA 59 | >M16 60 | ^TGCTTGCT 61 | >O4 62 | ^GTTAAGCG 63 | >O16 64 | ^GTTGGCAT 65 | >B3 66 | ^ACCTAGAC 67 | >B15 68 | ^TTACGTGC 69 | >D3 70 | ^AAGACCGT 71 | >D15 72 | ^CTATGCCT 73 | >F3 74 | ^AGAACCAG 75 | >F15 76 | ^CAAGAAGC 77 | >H3 78 | ^ACGTATGG 79 | >H15 80 | ^TGGCTCTT 81 | >J3 82 | ^TCACTCGA 83 | >J15 84 | ^CTTCGGTT 85 | >L3 86 | ^AGGAACAC 87 | >L15 88 | ^CTCTCAGA 89 | >N3 90 | ^AAGCATCG 91 | >N15 92 | ^CTAGCAGT 93 | >P3 94 | ^CATCTGCT 95 | >P15 96 | ^GTTCCATG 97 | >B4 98 | ^TCACCTAG 99 | >B16 100 | ^GAGTAGAG 101 | >D4 102 | ^TTGCGAGA 103 | >D16 104 | ^GACTTGTG 105 | >F4 106 | ^GTGTCCTT 107 | >F16 108 | ^CTCCTGAA 109 | >H4 110 | ^TACAGAGC 111 | >H16 112 | ^TCAGTAGG 113 | >J4 114 | ^TTCGTACG 115 | >J16 116 | ^GCTGTAAG 117 | >L4 118 | ^CGATTCTG 119 | >L16 120 | ^ATAGTCGG 121 | >N4 122 | ^TGCTCTAC 123 | >N16 124 | ^CAGAACTG 125 | >P4 126 | ^ACTCTCCA 127 | >P16 128 | ^CCTAGAGA 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_3.fa: -------------------------------------------------------------------------------- 1 | >A5 2 | ^CTAGCTCA 3 | >A17 4 | ^ATTAGCCG 5 | >C5 6 | ^GCTACTCT 7 | >C17 8 | ^TCGGATTC 9 | >E5 10 | ^ATCGTGGT 11 | >E17 12 | ^CACTTCAC 13 | >G5 14 | ^GAAGTGCT 15 | >G17 16 | 
^TGCGTAAC 17 | >I5 18 | ^CTTCGCAA 19 | >I17 20 | ^GACGTCAT 21 | >K5 22 | ^CACACATC 23 | >K17 24 | ^TTGGACTG 25 | >M5 26 | ^CAAGTCGT 27 | >M17 28 | ^TAACGTCG 29 | >O5 30 | ^TGGAAGCA 31 | >O17 32 | ^ACCGGTTA 33 | >A6 34 | ^GTGAGACT 35 | >A18 36 | ^CCAAGTAG 37 | >C6 38 | ^TTCACGGA 39 | >C18 40 | ^ATCCGTTG 41 | >E6 42 | ^AGCGAGAT 43 | >E18 44 | ^CTAACCTG 45 | >G6 46 | ^AGTTCGCA 47 | >G18 48 | ^CCAACACT 49 | >I6 50 | ^TGATCACG 51 | >I18 52 | ^CTCAAGCT 53 | >K6 54 | ^CCTCGTTA 55 | >K18 56 | ^ATCTCCTG 57 | >M6 58 | ^CAACACAG 59 | >M18 60 | ^CTCGAACA 61 | >O6 62 | ^CATGGATC 63 | >O18 64 | ^CAACCTCT 65 | >B5 66 | ^TACGACGT 67 | >B17 68 | ^ACTGCTTG 69 | >D5 70 | ^CTCCAATC 71 | >D17 72 | ^TTCGGCTA 73 | >F5 74 | ^GATGTCGA 75 | >F17 76 | ^CACCAGTT 77 | >H5 78 | ^AAGGACCA 79 | >H17 80 | ^CCTTCCAT 81 | >J5 82 | ^CACTGTAG 83 | >J17 84 | ^ACAACAGC 85 | >L5 86 | ^ACCATCCT 87 | >L17 88 | ^AGGCTGAA 89 | >N5 90 | ^GCCAATAC 91 | >N17 92 | ^GCCAGAAT 93 | >P5 94 | ^CGCTGATA 95 | >P17 96 | ^GCATCCTA 97 | >B6 98 | ^CATACGGA 99 | >B18 100 | ^ATGCCTAG 101 | >D6 102 | ^ACACCGAT 103 | >D18 104 | ^CCGATGTA 105 | >F6 106 | ^TACCTGCA 107 | >F18 108 | ^AACGCACA 109 | >H6 110 | ^GCATAACG 111 | >H18 112 | ^ACAGCAAG 113 | >J6 114 | ^TCCTGGTA 115 | >J18 116 | ^GACATCTC 117 | >L6 118 | ^GCAACCAT 119 | >L18 120 | ^GAGACCAA 121 | >N6 122 | ^CATCACGT 123 | >N18 124 | ^AGAAGCCT 125 | >P6 126 | ^CAGCATAC 127 | >P18 128 | ^TACTAGCG 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_4.fa: -------------------------------------------------------------------------------- 1 | >A7 2 | ^ATCGTCTC 3 | >A19 4 | ^CGATCGAT 5 | >C7 6 | ^CTCTGGAT 7 | >C19 8 | ^AACAGCGA 9 | >E7 10 | ^CGGTAATC 11 | >E19 12 | ^TAGCCATG 13 | >G7 14 | ^CTTCCTTC 15 | >G19 16 | ^AACACGCT 17 | >I7 18 | ^ATGGCGAT 19 | >I19 20 | ^ACGTCCAA 21 | >K7 22 | ^GAGCAATC 23 | >K19 24 | ^GTCTGCAA 25 | >M7 26 | ^AGCTAGTG 27 | >M19 28 | ^AAGGCGTA 29 | >O7 30 | 
^CTCGTTCT 31 | >O19 32 | ^GAACGGTT 33 | >A8 34 | ^CTGATGAG 35 | >A20 36 | ^AACTGAGG 37 | >C8 38 | ^GAGCTCTA 39 | >C20 40 | ^GATAGCCA 41 | >E8 42 | ^CCGTAACT 43 | >E20 44 | ^AGCCAACT 45 | >G8 46 | ^TAGTCAGC 47 | >G20 48 | ^GAGAGTAC 49 | >I8 50 | ^CAATGCGA 51 | >I20 52 | ^AACCACTC 53 | >K8 54 | ^TGAGACGA 55 | >K20 56 | ^TCACGATG 57 | >M8 58 | ^TCCACGTT 59 | >M20 60 | ^ACATGGAG 61 | >O8 62 | ^ACAGAGGT 63 | >O20 64 | ^TGGATGGT 65 | >B7 66 | ^TTGAGCTC 67 | >B19 68 | ^GCCTATGT 69 | >D7 70 | ^TCTGGACA 71 | >D19 72 | ^ACCGACAA 73 | >F7 74 | ^AGGAGGTT 75 | >F19 76 | ^GTATTCCG 77 | >H7 78 | ^TATGCGGT 79 | >H19 80 | ^ATACTGGC 81 | >J7 82 | ^GTACGATC 83 | >J19 84 | ^AGCCGTAA 85 | >L7 86 | ^GAACGTGA 87 | >L19 88 | ^ATCGGAGA 89 | >N7 90 | ^GACACAGT 91 | >N19 92 | ^CGAGAGAA 93 | >P7 94 | ^TCGTCTGA 95 | >P19 96 | ^CCATGAAC 97 | >B8 98 | ^GTCATCGT 99 | >B20 100 | ^CAACTCCA 101 | >D8 102 | ^CGTATCTC 103 | >D20 104 | ^TAGGAGCT 105 | >F8 106 | ^CCTTAGGT 107 | >F20 108 | ^TAGTCTCG 109 | >H8 110 | ^GATCAGAC 111 | >H20 112 | ^GAATGGCA 113 | >J8 114 | ^CATTGACG 115 | >J20 116 | ^CAACCGTA 117 | >L8 118 | ^AATCCAGC 119 | >L20 120 | ^AACAAGGC 121 | >N8 122 | ^GCCACTTA 123 | >N20 124 | ^CACGATTC 125 | >P8 126 | ^TACTCCAG 127 | >P20 128 | ^CGTCCATT 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_5.fa: -------------------------------------------------------------------------------- 1 | >A9 2 | ^TCGACAAG 3 | >A21 4 | ^GATCTTGC 5 | >C9 6 | ^AGATCGTC 7 | >C21 8 | ^CCAACGAA 9 | >E9 10 | ^AGTTGTGC 11 | >E21 12 | ^ACAGGCAT 13 | >G9 14 | ^CGAACAAC 15 | >G21 16 | ^ACTCGATC 17 | >I9 18 | ^ACATGCCA 19 | >I21 20 | ^GATCCACT 21 | >K9 22 | ^ATAGAGCG 23 | >K21 24 | ^CCACATTG 25 | >M9 26 | ^CTCCTAGT 27 | >M21 28 | ^TCTTACGG 29 | >O9 30 | ^ACGAGAAC 31 | >O21 32 | ^CTGTACCA 33 | >A10 34 | ^ACGGTACA 35 | >A22 36 | ^AGGTAGGA 37 | >C10 38 | ^GTCAGTCA 39 | >C22 40 | ^TATGACCG 41 | >E10 42 | ^TCAGACAC 43 | >E22 
44 | ^CCAGTTGA 45 | >G10 46 | ^AACACCAC 47 | >G22 48 | ^AGATACGG 49 | >I10 50 | ^ATGCGTCA 51 | >I22 52 | ^CTTACAGC 53 | >K10 54 | ^CACAGGAA 55 | >K22 56 | ^CCACAACA 57 | >M10 58 | ^ATCGCAAC 59 | >M22 60 | ^ACAAGACG 61 | >O10 62 | ^TAAGTGGC 63 | >O22 64 | ^CTATCCAC 65 | >B9 66 | ^AGTACACG 67 | >B21 68 | ^GTACCACA 69 | >D9 70 | ^AACACTGG 71 | >D21 72 | ^CGTAGATG 73 | >F9 74 | ^AATCGCTG 75 | >F21 76 | ^TTCGAAGC 77 | >H9 78 | ^AAGGAAGG 79 | >H21 80 | ^AACCTACG 81 | >J9 82 | ^TGGTGAAG 83 | >J21 84 | ^CTCTTGTC 85 | >L9 86 | ^TAGAACGC 87 | >L21 88 | ^GATACCTG 89 | >N9 90 | ^AAGAGGCA 91 | >N21 92 | ^AACTCGGA 93 | >P9 94 | ^CACATGGT 95 | >P21 96 | ^ATCCACGA 97 | >B10 98 | ^TTACCGAC 99 | >B22 100 | ^AAGTCCTC 101 | >D10 102 | ^AAGGAGAC 103 | >D22 104 | ^CAACGAGT 105 | >F10 106 | ^CACAGACT 107 | >F22 108 | ^ACTCTGAG 109 | >H10 110 | ^CGCAACTA 111 | >H22 112 | ^CGGATCAA 113 | >J10 114 | ^ACCTCTTC 115 | >J22 116 | ^TGCGATAG 117 | >L10 118 | ^AGTGCATC 119 | >L22 120 | ^CCAGTATC 121 | >N10 122 | ^GCTTCACA 123 | >N22 124 | ^AAGCTGGT 125 | >P10 126 | ^GAGGCATT 127 | >P22 128 | ^TCGCTATC 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_6.fa: -------------------------------------------------------------------------------- 1 | >A11 2 | ^CCTTGGAA 3 | >A23 4 | ^AGGATAGC 5 | >C11 6 | ^GCTCAGTT 7 | >C23 8 | ^CAGTGCTT 9 | >E11 10 | ^AATGACGC 11 | >E23 12 | ^AGGTGTTG 13 | >G11 14 | ^AACAACCG 15 | >G23 16 | ^TGAGCTGT 17 | >I11 18 | ^GTCAACAG 19 | >I23 20 | ^AGCCTATC 21 | >K11 22 | ^GACCGATA 23 | >K23 24 | ^GATGGAGT 25 | >M11 26 | ^ACTCCTAC 27 | >M23 28 | ^CGTGTGAT 29 | >O11 30 | ^AAGCCTGA 31 | >O23 32 | ^GCGCATAT 33 | >A12 34 | ^CTCGACTT 35 | >A24 36 | ^TTCGCCAT 37 | >C12 38 | ^CACGTCTA 39 | >C24 40 | ^CGATTGGA 41 | >E12 42 | ^CGAAGTCA 43 | >E24 44 | ^AAGTGCAG 45 | >G12 46 | ^GTAAGCAC 47 | >G24 48 | ^GTTCTTCG 49 | >I12 50 | ^TACATCGG 51 | >I24 52 | ^AGTCTTGG 53 | >K12 54 | ^ACTCAACG 55 | 
>K24 56 | ^AGGTCTGT 57 | >M12 58 | ^ACGTCGTT 59 | >M24 60 | ^CGCCTTAT 61 | >O12 62 | ^AGTCAGGT 63 | >O24 64 | ^GATCTCAG 65 | >B11 66 | ^TGTCAGTG 67 | >B23 68 | ^TAGTGGTG 69 | >D11 70 | ^TTGGTGCA 71 | >D23 72 | ^CTGTATGC 73 | >F11 74 | ^AGTGACCT 75 | >F23 76 | ^AGACCTTG 77 | >H11 78 | ^AGCGTGTA 79 | >H23 80 | ^CATACTCG 81 | >J11 82 | ^TAGCTGAG 83 | >J23 84 | ^CAGATCCT 85 | >L11 86 | ^AACCAGAG 87 | >L23 88 | ^TCCTGACT 89 | >N11 90 | ^GAAGACTG 91 | >N23 92 | ^ACAGTTCG 93 | >P11 94 | ^CGAGTTAG 95 | >P23 96 | ^GAGAAGGT 97 | >B12 98 | ^ACCTTCGA 99 | >B24 100 | ^GTCGATTG 101 | >D12 102 | ^TGTCGACT 103 | >D24 104 | ^TGTGTCAG 105 | >F12 106 | ^TCGAACCT 107 | >F24 108 | ^GTTATGGC 109 | >H12 110 | ^TCCGATCA 111 | >H24 112 | ^ACTGCACT 113 | >J12 114 | ^CATTCGTC 115 | >J24 116 | ^TGGTTCGA 117 | >L12 118 | ^GCATTGGT 119 | >L24 120 | ^CCTCGAAT 121 | >N12 122 | ^ACCGAATG 123 | >N24 124 | ^GCAATGAG 125 | >P12 126 | ^ACACCTCA 127 | >P24 128 | ^AATGGTCG 129 | -------------------------------------------------------------------------------- /cemba_data/files/sample_sheet_header.txt: -------------------------------------------------------------------------------- 1 | [Header],,,,,,,,,, 2 | IEMFileVersion,4,,,,,,,,, 3 | Date,,,,,,,,,, 4 | Workflow,GenerateFASTQ,,,,,,,,, 5 | Application,HiSeq_FASTQ_Only,,,,,,,,, 6 | Assay,TruSeq_HT,,,,,,,,, 7 | Description,,,,,,,,,, 8 | Chemistry,,,,,,,,,, 9 | ,,,,,,,,,, 10 | [Reads],,,,,,,,,, 11 | 151,,,,,,,,,, 12 | 151,,,,,,,,,, 13 | ,,,,,,,,,, 14 | [Settings],,,,,,,,,, 15 | Adapter,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,,,,,,,,, 16 | AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT,,,,,,,,, 17 | ,,,,,,,,,, 18 | [Data],,,,,,,,,, 19 | -------------------------------------------------------------------------------- /cemba_data/files/sbatch_template_schicluster.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Notes from TACC: 4 | # 5 | # -- Launch this script by executing 6 | # -- Copy/edit this 
script as desired. Launch by executing 7 | # "sbatch knl.openmp.slurm" on a Stampede2 login node. 8 | # 9 | # -- OpenMP codes run on a single node (upper case N = 1). 10 | # OpenMP ignores the value of lower case n, 11 | # but slurm needs a plausible value to schedule the job. 12 | # 13 | # -- Default value of OMP_NUM_THREADS is 1; be sure to change it! 14 | # 15 | # -- Increase thread count gradually while looking for optimal setting. 16 | # If there is sufficient memory available, the optimal setting 17 | # is often 68 (1 thread per core) or 136 (2 threads per core). 18 | # 19 | #---------------------------------------------------- 20 | 21 | #SBATCH -J {job_name} # Job name 22 | #SBATCH -o {log_dir}/{job_name}.o%j # Name of stdout output file 23 | #SBATCH -e {log_dir}/{job_name}.e%j # Name of stderr error file 24 | #SBATCH -p {queue} # Queue (partition) name 25 | #SBATCH -N 1 # Total # of nodes (must be 1 for OpenMP) 26 | #SBATCH -n 1 # Total # of mpi tasks (should be 1 for OpenMP) 27 | #SBATCH -t {time_str} # Run time (hh:mm:ss) 28 | {email_str} 29 | {email_type_str} 30 | 31 | #---------------------------------------------------- 32 | # Clone the whole miniconda into /tmp so the snakemake command do not access $WORK 33 | mkdir /tmp/test_{env_dir_random} 34 | 35 | # use micromamba 36 | export PATH=/work/05622/lhq/stampede2/bin:$PATH 37 | micromamba shell init -s bash -p /tmp/test_{env_dir_random} 38 | source ~/.bashrc 39 | 40 | # activate base environment 41 | micromamba activate 42 | 43 | # create schicluster environment 44 | micromamba create -y -n schicluster python=3.8 numpy scipy scikit-learn h5py \ 45 | joblib cooler pandas statsmodels rpy2 anndata xarray snakemake pybedtools htslib=1.9 pysam=0.18 46 | micromamba activate schicluster 47 | 48 | # export correct PYTHONPATH 49 | export PYTHONPATH=/tmp/test_{env_dir_random}/envs/schicluster/lib/python3.8/site-packages 50 | 51 | # install schicluster 52 | pip install schicluster 53 | which hicluster 54 | 55 | # 
Installation finished 56 | #---------------------------------------------------- 57 | 58 | 59 | # --------------------------------------------------- 60 | # actual command 61 | 62 | # print some info 63 | date 64 | hostname 65 | pwd 66 | # If you want to profile the job (CPU, MEM usage, etc.) 67 | # load remora with 68 | # "module load remora" 69 | # and change the command to 70 | # "remora {command}" 71 | 72 | 73 | # Set thread count (default value is 1)... 74 | export OMP_NUM_THREADS=48 75 | 76 | for i in `seq 1 5` 77 | do 78 | {command} --batch summary=${{i}}/5 79 | done 80 | 81 | # {command} 82 | 83 | # delete everything in /tmp 84 | 85 | rm -rf /tmp/test* 86 | # --------------------------------------------------- 87 | -------------------------------------------------------------------------------- /cemba_data/files/sbatch_template_yap.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Notes from TACC: 4 | # 5 | # -- Launch this script by executing 6 | # -- Copy/edit this script as desired. Launch by executing 7 | # "sbatch knl.openmp.slurm" on a Stampede2 login node. 8 | # 9 | # -- OpenMP codes run on a single node (upper case N = 1). 10 | # OpenMP ignores the value of lower case n, 11 | # but slurm needs a plausible value to schedule the job. 12 | # 13 | # -- Default value of OMP_NUM_THREADS is 1; be sure to change it! 14 | # 15 | # -- Increase thread count gradually while looking for optimal setting. 16 | # If there is sufficient memory available, the optimal setting 17 | # is often 68 (1 thread per core) or 136 (2 threads per core). 
18 | # 19 | #---------------------------------------------------- 20 | 21 | #SBATCH -J {job_name} # Job name 22 | #SBATCH -o {log_dir}/{job_name}.o%j # Name of stdout output file 23 | #SBATCH -e {log_dir}/{job_name}.e%j # Name of stderr error file 24 | #SBATCH -p {queue} # Queue (partition) name 25 | #SBATCH -N 1 # Total # of nodes (must be 1 for OpenMP) 26 | #SBATCH -n 1 # Total # of mpi tasks (should be 1 for OpenMP) 27 | #SBATCH -t {time_str} # Run time (hh:mm:ss) 28 | {email_str} 29 | {email_type_str} 30 | 31 | 32 | #---------------------------------------------------- 33 | # Clone the whole miniconda into /tmp so the snakemake command do not access $WORK 34 | mkdir /tmp/test_{env_dir_random} 35 | tar -xf /work2/05622/lhq/test_conda.tar -C /tmp/test_{env_dir_random} 36 | export CONDA_PREFIX=/tmp/test_{env_dir_random}/test/miniconda3 37 | export CONDA_PYTHON_EXE=/tmp/test_{env_dir_random}/test/miniconda3/bin/python 38 | export CONDA_EXE=/tmp/test_{env_dir_random}/test/miniconda3/bin/conda 39 | export PATH=/dev/shm/bin:/tmp/test_{env_dir_random}/test/miniconda3/envs/mapping/bin:/tmp/test_{env_dir_random}/test/miniconda3/bin:/opt/apps/cmake/3.16.1/bin:/opt/apps/intel18/python2/2.7.15/bin:/opt/apps/autotools/1.1/bin:/opt/apps/git/2.24.1/bin:/opt/apps/libfabric/1.7.0/bin:/opt/apps/intel18/impi/18.0.2/bin:/opt/intel/compilers_and_libraries_2018.2.199/linux/mpi/intel64/bin:/opt/intel/compilers_and_libraries_2018.2.199/linux/bin/intel64:/opt/apps/gcc/6.3.0/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/opt/dell/srvadmin/bin:. 
40 | find /tmp/test_{env_dir_random}/test/miniconda3/ -type f -print0 | sed 's/ /\\ /g; s/(/\\(/g; s/)/\\)/g' | xargs -0 -P 30 -I % sh -c '/bin/sed -i "s/\/tmp\/test\/miniconda3\/envs\/mapping\/bin\/python/\/tmp\/test_{env_dir_random}\/test\/miniconda3\/envs\/mapping\/bin\/python/" %' 41 | 42 | pip install cemba_data --upgrade 43 | pip install schicluster --upgrade 44 | 45 | # Check the path 46 | which python 47 | which snakemake 48 | which yap 49 | which allcools 50 | which bismark 51 | 52 | # Installation finished 53 | #---------------------------------------------------- 54 | 55 | 56 | # --------------------------------------------------- 57 | # actual command 58 | 59 | # print some info 60 | date 61 | hostname 62 | pwd 63 | # If you want to profile the job (CPU, MEM usage, etc.) 64 | # load remora with 65 | # "module load remora" 66 | # and change the command to 67 | # "remora {command}" 68 | 69 | 70 | # Set thread count (default value is 1)... 71 | export OMP_NUM_THREADS=48 72 | 73 | {command} 74 | 75 | # delete everything in /tmp 76 | 77 | rm -rf /tmp/test* 78 | # --------------------------------------------------- 79 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/__init__.py: -------------------------------------------------------------------------------- 1 | from .hisat3n_general import \ 2 | separate_unique_and_multi_align_reads, \ 3 | convert_hisat_bam_strandness, \ 4 | make_snakefile_hisat3n 5 | from .utilities import validate_cwd_fastq_paths, read_mapping_config 6 | from .hisat3n_mct import select_mct_reads, aggregate_feature_counts 7 | from .summary import snmc_summary, snmct_summary, snm3c_summary 8 | from .hisat3n_m3c import \ 9 | split_hisat3n_unmapped_reads, \ 10 | call_chromatin_contacts, \ 11 | remove_overlap_read_parts 12 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/cli.py: 
import click

from .hisat3n_m3c import remove_overlap_read_parts


@click.command('remove_overlap_read_parts')
@click.argument('in_bam_path')
@click.argument('out_bam_path')
def _remove_overlap_cmd(in_bam_path, out_bam_path):
    """Strip overlapping read parts from IN_BAM_PATH and write to OUT_BAM_PATH."""
    remove_overlap_read_parts(in_bam_path, out_bam_path)


@click.group()
def _group():
    """Internal hisat-3n helper commands."""


def main():
    """CLI entry point: register sub-commands and dispatch."""
    # commands are attached here (not at import time) to mirror the
    # original registration order
    _group.add_command(_remove_overlap_cmd)
    _group()
https://github.com/DaehwanKimLab/hisat2.git hisat-3n 28 | cd hisat-3n 29 | git checkout hisat-3n-dev-directional-mapping-reverse 30 | make 31 | # put hisat-3n in the PATH 32 | echo 'export PATH=$HOME/pkg/hisat-3n:$PATH' >> ~/.bashrc 33 | source ~/.bashrc 34 | echo 'export PATH=$HOME/pkg/hisat-3n:$PATH' >> ~/.zshrc 35 | source ~/.zshrc 36 | 37 | # make sure allcools and yap is upto date 38 | cd ~/pkg 39 | git clone https://github.com/lhqing/cemba_data.git 40 | cd cemba_data 41 | pip install -e . 42 | 43 | cd ~/pkg 44 | git clone https://github.com/lhqing/ALLCools.git 45 | cd ALLCoools 46 | pip install -e . 47 | 48 | ## Create genome reference 49 | 50 | # add genome reference file 51 | # prepare and copy specific genome reference file to $HOME 52 | 53 | # prepare a $HOME/mapping.yaml file the records the path of required genome reference files 54 | 55 | # clean unnecessary cache files 56 | mamba clean -y -a 57 | ``` 58 | 59 | ## Actual mapping 60 | 61 | ```bash 62 | mkdir -p ~/mapping 63 | cd ~/mapping 64 | gsutil cp gs://PATH/TO/FASTQ_DIR/fastq ./ 65 | cp ~/pkg/cemba_data/hisat3n/snakefile/SNAKEFILE_YOU_WANT_TO_USE ./Snakefile 66 | 67 | # run snakemake 68 | snakemake --configfile ~/mapping.yaml -j 69 | ``` 70 | 71 | ## Build hisat-3n index 72 | ```bash 73 | # non-repeat index 74 | hisat-3n-build --base-change C,T genome.fa genome 75 | # repeat index 76 | hisat-3n-build --base-change T,C --repeat-index genome.fa genome 77 | # Build the repeat HISAT-3N integrated index with splice site information 78 | hisat-3n-build --base-change C,T --repeat-index --ss genome.ss --exon genome.exon genome.fa genome 79 | ``` 80 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/config/hisat-3n-build.sh: -------------------------------------------------------------------------------- 1 | # normal index 2 | hisat-3n-build --base-change C,T -p THREAD \ 3 | ~/ref/hg38/fasta/with_chrl/hg38_with_chrl.fa \ 4 | 
def bam_read_to_fastq_read(read, read_type=None):
    """Format one BAM alignment as a 4-line FASTQ record.

    Parameters
    ----------
    read
        A pysam aligned segment (anything exposing ``qname``,
        ``query_sequence``, ``qual`` and, when ``read_type`` is None,
        ``is_read1``).
    read_type
        '1' or '2'; when None it is derived from ``read.is_read1``.

    Returns
    -------
    str
        FASTQ record text; the read type is appended to the read name
        so R1/R2 stay distinguishable after conversion.
    """
    if read_type is None:
        read_type = '1' if read.is_read1 else '2'

    fastq_record = f"@{read.qname}_{read_type}\n" \
                   f"{read.query_sequence}\n" \
                   f"+\n" \
                   f"{read.qual}\n"
    return fastq_record


def separate_unique_and_multi_align_reads(in_bam_path,
                                          out_unique_path,
                                          out_multi_path,
                                          out_unmappable_path=None,
                                          unmappable_format='auto',
                                          mapq_cutoff=10,
                                          qlen_cutoff=30,
                                          primary_only=True,
                                          read_type=None):
    """
    Separate unique aligned, multi-aligned, and unaligned reads from hisat-3n bam file.

    Parameters
    ----------
    in_bam_path
        Path to hisat-3n bam file.
    out_unique_path
        Path to output unique aligned bam file.
    out_multi_path
        Path to output multi-aligned bam file.
    out_unmappable_path
        Path to output unmappable file. None disables unmappable output.
    unmappable_format
        Format of unmappable file, only "bam" and "fastq" supported;
        "auto" infers the format from the file suffix.
    mapq_cutoff
        MAPQ cutoff for uniquely aligned reads,
        note that for hisat-3n, unique aligned reads always have MAPQ=60
    qlen_cutoff
        read length cutoff for any reads
    primary_only
        If True, only primary alignments (FLAG 256) are considered for multi-aligned reads.
    read_type
        read type, only None, "1" and "2" supported. If the BAM file is paired-end, use None.

    Returns
    -------
    None
    """
    if out_unmappable_path is not None:
        # Generalization: accept pathlib.Path as well as str
        # (str.endswith below used to fail on Path input).
        out_unmappable_path = str(out_unmappable_path)
        if unmappable_format == 'auto':
            if out_unmappable_path.endswith('.bam'):
                unmappable_format = 'bam'
            elif out_unmappable_path.endswith('.fastq'):
                unmappable_format = 'fastq'
            else:
                raise ValueError(f'Unmappable format {unmappable_format} not supported.')
        else:
            if unmappable_format not in ['bam', 'fastq']:
                raise ValueError(f'Unmappable format {unmappable_format} not supported.')

    with pysam.AlignmentFile(in_bam_path, index_filename=None) as bam:
        header = bam.header
        with pysam.AlignmentFile(out_unique_path, header=header, mode='wb') as unique_bam, \
                pysam.AlignmentFile(out_multi_path, header=header, mode='wb') as multi_bam:
            if out_unmappable_path is not None:
                if unmappable_format == 'bam':
                    unmappable_file = pysam.AlignmentFile(out_unmappable_path, header=header, mode='wb')
                else:
                    unmappable_file = open(out_unmappable_path, 'w')
            else:
                unmappable_file = None

            try:
                for read in bam:
                    # skip reads that are too short
                    if read.qlen < qlen_cutoff:
                        continue

                    if read.mapq > mapq_cutoff:
                        unique_bam.write(read)
                    elif read.mapq > 0:
                        if primary_only and read.is_secondary:
                            # skip secondary alignments if primary_only is True,
                            # read.is_secondary is True when FLAG contains 256.
                            continue
                        multi_bam.write(read)
                    else:
                        # unmappable reads
                        if unmappable_file is not None:
                            if unmappable_format == 'bam':
                                unmappable_file.write(read)
                            else:
                                unmappable_file.write(bam_read_to_fastq_read(read, read_type=read_type))
            finally:
                # BUGFIX: previously the unmappable output stayed open when an
                # error occurred while iterating the input BAM.
                if unmappable_file is not None:
                    unmappable_file.close()
    return


def convert_hisat_bam_strandness(in_bam_path, out_bam_path):
    """Rewrite read orientation flags from the hisat-3n YZ strand tag.

    Reads tagged ``YZ == '+'`` (and their mates, if paired) are marked
    forward; all other reads are marked reverse.

    NOTE(review): the is_forward / mate_is_forward setters require a
    recent pysam — confirm the pinned pysam version supports them.
    """
    with pysam.AlignmentFile(in_bam_path) as in_bam, \
            pysam.AlignmentFile(out_bam_path, header=in_bam.header, mode='wb') as out_bam:
        for read in in_bam:
            if read.get_tag('YZ') == '+':
                read.is_forward = True
                if read.is_paired:
                    read.mate_is_forward = True
            else:
                read.is_forward = False
                if read.is_paired:
                    read.mate_is_forward = False
            out_bam.write(read)
    return


def make_snakefile_hisat3n(output_dir):
    """Distribute the mode-specific hisat-3n Snakefile and mapping config
    into every mapping job directory under ``output_dir``.

    Raises
    ------
    FileNotFoundError
        If no mapping_config.* file exists in output_dir, or the mode has
        no matching Snakefile template.
    KeyError
        If the config file lacks the 'mode' key.
    """
    output_dir = pathlib.Path(output_dir)

    config_paths = list(output_dir.glob('mapping_config.*'))
    if len(config_paths) == 0:
        # BUGFIX: previously raised an opaque IndexError on the empty list.
        raise FileNotFoundError(f'No mapping_config.* file found in {output_dir}.')
    mapping_config_name = config_paths[0].name

    config = get_configuration(output_dir / mapping_config_name)
    try:
        mode = config['mode']
    except KeyError:
        raise KeyError('mode not found in the config file.')

    # every sub-directory except these bookkeeping dirs is a mapping job
    skip_dirs = ['stats', 'snakemake', 'scool']
    mapping_job_dirs = [p for p in output_dir.glob('*')
                        if p.is_dir() and (p.name not in skip_dirs)]

    snakemake_dir = output_dir / 'snakemake'
    snakemake_dir.mkdir(exist_ok=True)
    stats_dir = output_dir / 'stats'
    stats_dir.mkdir(exist_ok=True)

    package_dir = cemba_data.__path__[0]
    snakefile_path = f'{package_dir}/hisat3n/snakefile/{mode.lower()}.smk'
    if not pathlib.Path(snakefile_path).exists():
        print('Possible snakefile templates:')
        for p in pathlib.Path(f'{package_dir}/hisat3n/snakefile/').glob('*.smk'):
            print(p)
        raise ValueError(f'Mode {mode} not supported, '
                         f'because Snakefile {snakefile_path} not found.')

    for p in mapping_job_dirs:
        subprocess.run(['cp', f'{output_dir}/{mapping_config_name}',
                        f'{p}/{mapping_config_name}'], check=True)
        subprocess.run(['cp', snakefile_path, f'{p}/Snakefile'], check=True)

    # leave a flag to indicate using hisat-3n pipeline
    subprocess.run(['touch', f'{output_dir}/snakemake/hisat3n'], check=True)
    return
'qualtrim2_bp'): 'R2QualTrimBP', 19 | ('cell_parser_cutadapt_trim_stats', 'out2_bp'): 'R2TrimmedReadsBP', 20 | ('cell_parser_hisat_summary', 'ReadPairsMappedInPE'): 'DELETE', 21 | ('cell_parser_hisat_summary', 'PEUnmappableReadPairs'): 'DELETE', 22 | ('cell_parser_hisat_summary', 'PEUniqueMappedReadPairs'): 'DELETE', 23 | ('cell_parser_hisat_summary', 'PEMultiMappedReadPairs'): 'DELETE', 24 | ('cell_parser_hisat_summary', 'PEDiscordantlyUniqueMappedReadPairs'): 'DELETE', 25 | ('cell_parser_hisat_summary', 'ReadsMappedInSE'): 'DELETE', 26 | ('cell_parser_hisat_summary', 'SEUnmappableReads'): 'DELETE', 27 | ('cell_parser_hisat_summary', 'SEUniqueMappedReads'): 'DELETE', 28 | ('cell_parser_hisat_summary', 'SEMultiMappedReads'): 'DELETE', 29 | ('cell_parser_hisat_summary', 'UniqueMappedReads'): 'UniqueMappedReads', 30 | ('cell_parser_hisat_summary', 'MultiMappedReads'): 'MultiMappedReads', 31 | ('cell_parser_hisat_summary', 'UniqueMappingRate'): 'UniqueMappingRate', 32 | ('cell_parser_hisat_summary', 'MultiMappingRate'): 'MultiMappingRate', 33 | ('cell_parser_hisat_summary', 'OverallMappingRate'): 'OverallMappingRate', 34 | ('cell_parser_picard_dedup_stat', 'LIBRARY'): 'DELETE', 35 | ('cell_parser_picard_dedup_stat', 'UNPAIRED_READS_EXAMINED'): 'DELETE', 36 | ('cell_parser_picard_dedup_stat', 'READ_PAIRS_EXAMINED'): 'DELETE', 37 | ('cell_parser_picard_dedup_stat', 'SECONDARY_OR_SUPPLEMENTARY_RDS'): 'DELETE', 38 | ('cell_parser_picard_dedup_stat', 'UNMAPPED_READS'): 'DELETE', 39 | ('cell_parser_picard_dedup_stat', 'UNPAIRED_READ_DUPLICATES'): 'DELETE', 40 | ('cell_parser_picard_dedup_stat', 'READ_PAIR_DUPLICATES'): 'DELETE', 41 | ('cell_parser_picard_dedup_stat', 'READ_PAIR_OPTICAL_DUPLICATES'): 'DELETE', 42 | ('cell_parser_picard_dedup_stat', 'PERCENT_DUPLICATION'): 'DELETE', 43 | ('cell_parser_picard_dedup_stat', 'ESTIMATED_LIBRARY_SIZE'): 'DELETE', 44 | ('cell_parser_picard_dedup_stat', 'FinalReads'): '', 45 | ('cell_parser_picard_dedup_stat', 'DuplicatedReads'): '', 
46 | ('cell_parser_picard_dedup_stat', 'PCRDuplicationRate'): '', 47 | ('cell_parser_feature_count_summary', 'Assigned'): 'AssignedRNAReads', 48 | ('cell_parser_feature_count_summary', 'Unassigned_Unmapped'): 'DELETE', 49 | ('cell_parser_feature_count_summary', 'Unassigned_Read_Type'): 'DELETE', 50 | ('cell_parser_feature_count_summary', 'Unassigned_Singleton'): 'DELETE', 51 | ('cell_parser_feature_count_summary', 'Unassigned_MappingQuality'): 'DELETE', 52 | ('cell_parser_feature_count_summary', 'Unassigned_Chimera'): 'DELETE', 53 | ('cell_parser_feature_count_summary', 'Unassigned_FragmentLength'): 'DELETE', 54 | ('cell_parser_feature_count_summary', 'Unassigned_Duplicate'): 'DELETE', 55 | ('cell_parser_feature_count_summary', 'Unassigned_MultiMapping'): 'DELETE', 56 | ('cell_parser_feature_count_summary', 'Unassigned_Secondary'): 'DELETE', 57 | ('cell_parser_feature_count_summary', 'Unassigned_NonSplit'): 'DELETE', 58 | ('cell_parser_feature_count_summary', 'Unassigned_NoFeatures'): 'DELETE', 59 | ('cell_parser_feature_count_summary', 'Unassigned_Overlapping_Length'): 'DELETE', 60 | ('cell_parser_feature_count_summary', 'Unassigned_Ambiguity'): 'DELETE', 61 | ('cell_parser_feature_count_summary', 'Unassigned_Total'): 'UnassignedRNAReads', 62 | ('cell_parser_feature_count_summary', 'AssignedRNAReadsRate'): 'AssignedRNAReadsRate', 63 | ('cell_parser_call_chromatin_contacts', 'cis'): 'CisContacts', 64 | ('cell_parser_call_chromatin_contacts', 'ciscut'): 'CisCutContacts', 65 | ('cell_parser_call_chromatin_contacts', 'cis_multi'): 'CisMultiContacts', 66 | ('cell_parser_call_chromatin_contacts', 'ciscut_multi'): 'CisCutMultiContacts', 67 | ('cell_parser_call_chromatin_contacts', 'trans'): 'TransContacts', 68 | ('cell_parser_call_chromatin_contacts', 'transcut',): 'TransCutContacts', 69 | ('cell_parser_call_chromatin_contacts', 'trans_multi'): 'TransMultiContacts', 70 | ('cell_parser_call_chromatin_contacts', 'transcut_multi'): 'TransCutMultiContacts', 71 | 
def _finalize_summary(stats_dfs):
    """Concatenate per-metric stats DataFrames column-wise, name the index
    'cell', and write the combined table to MappingSummary.csv.gz in cwd.

    Factored out because every pipeline summary ended with the same
    three statements.
    """
    summary = pd.concat(stats_dfs, axis=1)
    summary.index.name = 'cell'
    summary.to_csv('MappingSummary.csv.gz')
    return summary


def snmc_summary():
    """
    Generate snmC pipeline MappingSummary.csv.gz and save into cwd

    Returns
    -------
    pd.DataFrame
    """
    all_stats = []

    # fastq trimming stats
    all_stats.append(parse_single_stats_set('fastq/*.trimmed.stats.txt',
                                            cell_parser_cutadapt_trim_stats))

    # hisat-3n mapping
    all_stats.append(parse_single_stats_set('bam/*.hisat3n_dna_summary.txt',
                                            cell_parser_hisat_summary))

    # uniquely mapped reads dedup
    all_stats.append(parse_single_stats_set('bam/*.unique_align.deduped.matrix.txt',
                                            cell_parser_picard_dedup_stat, prefix='UniqueAlign'))

    # multi mapped reads dedup
    all_stats.append(parse_single_stats_set('bam/*.multi_align.deduped.matrix.txt',
                                            cell_parser_picard_dedup_stat, prefix='MultiAlign'))

    # allc count
    all_stats.append(parse_single_stats_set('allc/*.allc.tsv.gz.count.csv',
                                            cell_parser_allc_count))

    return _finalize_summary(all_stats)


def snmct_summary():
    """
    Generate snmCT pipeline MappingSummary.csv.gz and save into cwd

    Returns
    -------
    pd.DataFrame
    """
    all_stats = []

    # fastq trimming stats
    all_stats.append(parse_single_stats_set('fastq/*.trimmed.stats.txt',
                                            cell_parser_cutadapt_trim_stats))

    # hisat-3n DNA mapping
    all_stats.append(parse_single_stats_set('bam/*.hisat3n_dna_summary.txt',
                                            cell_parser_hisat_summary, prefix='DNA'))

    # hisat-3n RNA mapping
    all_stats.append(parse_single_stats_set('rna_bam/*.hisat3n_rna_summary.txt',
                                            cell_parser_hisat_summary, prefix='RNA'))

    # uniquely mapped reads dedup
    all_stats.append(parse_single_stats_set('bam/*.unique_align.deduped.matrix.txt',
                                            cell_parser_picard_dedup_stat, prefix='DNAUniqueAlign'))

    # multi mapped reads dedup
    all_stats.append(parse_single_stats_set('bam/*.multi_align.deduped.matrix.txt',
                                            cell_parser_picard_dedup_stat, prefix='DNAMultiAlign'))

    # uniquely mapped dna reads selection
    all_stats.append(parse_single_stats_set('bam/*.hisat3n_dna.unique_align.deduped.dna_reads.reads_mch_frac.csv',
                                            cell_parser_reads_mc_frac_profile, prefix='UniqueAlign'))

    # multi mapped dna reads selection
    all_stats.append(parse_single_stats_set('bam/*.hisat3n_dna.multi_align.deduped.dna_reads.reads_mch_frac.csv',
                                            cell_parser_reads_mc_frac_profile, prefix='MultiAlign'))

    # uniquely mapped rna reads selection
    all_stats.append(parse_single_stats_set('rna_bam/*.hisat3n_rna.unique_align.rna_reads.reads_mch_frac.csv',
                                            cell_parser_reads_mc_frac_profile))

    # allc count
    all_stats.append(parse_single_stats_set('allc/*.allc.tsv.gz.count.csv',
                                            cell_parser_allc_count))

    # feature count
    all_stats.append(parse_single_stats_set('rna_bam/*.feature_count.tsv.summary',
                                            cell_parser_feature_count_summary))

    return _finalize_summary(all_stats)
def _read_yaml_config(config_path):
    """Load a YAML mapping config file into a dict."""
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config


def _read_ini_config(config_path):
    """Load an INI mapping config file via the shared config reader."""
    return get_configuration(config_path)


def read_mapping_config(cwd: str = '.'):
    """Locate and load the mapping config for a mapping run.

    Candidate files are config.{yaml,yml,ini} and mapping_config.{yaml,yml,ini}
    in ``cwd`` and its parent; when several candidates exist, the last one
    found in search order wins, and a YAML config takes precedence over INI.
    ~/mapping_config.yaml, when present, overrides all cwd-relative YAMLs.

    Returns
    -------
    dict
        Parsed config, or an empty dict when no config file is found.
    """
    yaml_path = None
    for name in ['config', 'mapping_config']:
        for config_dir in [cwd, f'{cwd}/..']:
            for suffix in ['yaml', 'yml']:
                path = f'{config_dir}/{name}.{suffix}'
                if pathlib.Path(path).exists():
                    yaml_path = path
    # BUGFIX: pathlib does not expand '~', so the home-directory fallback
    # could never be found before (it looked for a literal './~' path).
    default_path = pathlib.Path('~/mapping_config.yaml').expanduser()
    if default_path.exists():
        yaml_path = str(default_path)

    ini_path = None
    for name in ['config', 'mapping_config']:
        for config_dir in [cwd, f'{cwd}/..']:
            path = f'{config_dir}/{name}.ini'
            if pathlib.Path(path).exists():
                ini_path = path

    if yaml_path is not None:
        config = _read_yaml_config(yaml_path)
    elif ini_path is not None:
        config = _read_ini_config(ini_path)
    else:
        config = {}
    return config


def validate_cwd_fastq_paths(cwd: str = '.'):
    """
    Validate fastq paths in the fastq subdirectory of cwd.
    Parameters
    ----------
    cwd :
        Path of the current working directory.

    Returns
    -------
    fastq_table : pandas.DataFrame
        Indexed by cell id with 'R1' and 'R2' columns of absolute-ish paths.

    Raises
    ------
    ValueError
        If no fastq file matches the expected name pattern, or R1/R2 missing.
    FileNotFoundError
        If any cell lacks one of its two mates.
    """
    fastq_dir = pathlib.Path(f'{cwd}/fastq/')
    # BUGFIX: the old glob '*.[fq.gz][fastq.gz]' used character classes,
    # not alternation, and silently missed plain '*.fastq' files.
    fastq_paths = [p
                   for suffix in ('*.fq', '*.fq.gz', '*.fastq', '*.fastq.gz')
                   for p in fastq_dir.glob(suffix)
                   if 'trim' not in p.name]

    # parse cell id and match fastq pairs
    # BUGFIX: dots are now escaped and the pattern anchored, so names like
    # 'x-R1.fq.gzX' no longer slip through.
    fastq_pattern = re.compile(
        r'(?P<cell_id>.+)[-_](?P<read_type>[Rr][12])\.(fastq|fq)(\.gz)?$')
    fastq_records = {}
    for p in fastq_paths:
        match = fastq_pattern.match(p.name)
        if match is None:
            # FASTQ-suffixed file that does not follow the naming scheme
            continue
        cell_id = match.group('cell_id')
        read_type = match.group('read_type')
        fastq_records[cell_id, read_type.upper()] = str(p)

    if len(fastq_records) == 0:
        raise ValueError('No fastq files found in fastq folder, '
                         'or no fastq files match expected file name pattern')

    fastq_table = pd.Series(fastq_records).unstack()
    if 'R1' not in fastq_table.columns or 'R2' not in fastq_table.columns:
        raise ValueError('No R1 or R2 fastq files found')
    fastq_table = fastq_table[['R1', 'R2']].copy()

    # raise error if fastq file not paired
    missing_file = fastq_table.isna().sum(axis=1) > 0
    if missing_file.sum() > 0:
        for cell in missing_file[missing_file].index:
            print(f'{cell} missing R1 or R2 FASTQ file.')
        raise FileNotFoundError(f'FASTQ files in {fastq_dir.absolute()} is not all paired.')
    return fastq_table
# Snakemake rules below
# suitable for snmC-seq2, snmC-seq3, NOMe-seq
#
# NOTE(review): this file is appended after a generated header that defines
# CELL_IDS and the mapping parameters (adapters, cut lengths, references);
# all bare {placeholders} in shell commands resolve from that header —
# confirm against the pipeline generator.

# use diff mcg_context for normal mC or NOMe
mcg_context = 'CGN' if num_upstr_bases == 0 else 'HCGN'

# the summary rule is the final target
rule summary:
    input:
        expand("allc/{cell_id}.allc.tsv.gz", cell_id=CELL_IDS),
        expand("allc-{mcg_context}/{cell_id}.{mcg_context}-Merge.allc.tsv.gz", cell_id=CELL_IDS,
               mcg_context=mcg_context),
        # also add all the stats path here,
        # once summary is generated, snakemake will delete these stats
        expand("allc/{cell_id}.allc.tsv.gz.count.csv", cell_id=CELL_IDS),
        expand("fastq/{cell_id}-R1.trimmed.stats.tsv", cell_id=CELL_IDS),
        expand("fastq/{cell_id}-R2.trimmed.stats.tsv", cell_id=CELL_IDS),
        expand("bam/{cell_id}-R1.trimmed_bismark_bt2.deduped.matrix.txt", cell_id=CELL_IDS),
        expand("bam/{cell_id}-R2.trimmed_bismark_bt2.deduped.matrix.txt", cell_id=CELL_IDS),
        expand("bam/{cell_id}-R1.trimmed_bismark_bt2_SE_report.txt", cell_id=CELL_IDS),
        expand("bam/{cell_id}-R2.trimmed_bismark_bt2_SE_report.txt", cell_id=CELL_IDS),
    output:
        "MappingSummary.csv.gz"
    shell:
        "yap-internal summary --output_dir ./"

# Trim reads
# Two chained cutadapt passes: adapter removal first, then quality
# trimming plus fixed-length cuts from both read ends (-u/-u -).
rule trim_r1:
    input:
        "fastq/{cell_id}-R1.fq.gz"
    output:
        fq=temp("fastq/{cell_id}-R1.trimmed.fq.gz"),
        stats=temp("fastq/{cell_id}-R1.trimmed.stats.tsv")
    threads:
        2
    shell:
        "cutadapt --report=minimal -a {r1_adapter} {input} 2> {output.stats} | "
        "cutadapt --report=minimal -O 6 -q 20 -u {r1_left_cut} -u -{r1_right_cut} -m 30 "
        "-o {output.fq} - >> {output.stats}"

rule trim_r2:
    input:
        "fastq/{cell_id}-R2.fq.gz"
    output:
        fq=temp("fastq/{cell_id}-R2.trimmed.fq.gz"),
        stats=temp("fastq/{cell_id}-R2.trimmed.stats.tsv")
    threads:
        2
    shell:
        "cutadapt --report=minimal -a {r2_adapter} {input} 2> {output.stats} | "
        "cutadapt --report=minimal -O 6 -q 20 -u {r2_left_cut} -u -{r2_right_cut} -m 30 "
        "-o {output.fq} - >> {output.stats}"

# bismark mapping, R1 and R2 separately
rule bismark_r1:
    input:
        "fastq/{cell_id}-R1.trimmed.fq.gz"
    output:
        bam=temp("bam/{cell_id}-R1.trimmed_bismark_bt2.bam"),
        stats=temp("bam/{cell_id}-R1.trimmed_bismark_bt2_SE_report.txt")
    threads:
        3
    resources:
        mem_mb=14000
    shell:
        # map R1 with --pbat mode
        "bismark {bismark_reference} {unmapped_param_str} --bowtie2 {input} "
        "--pbat -o bam/ --temp_dir bam/"

rule bismark_r2:
    input:
        "fastq/{cell_id}-R2.trimmed.fq.gz"
    output:
        bam=temp("bam/{cell_id}-R2.trimmed_bismark_bt2.bam"),
        stats=temp("bam/{cell_id}-R2.trimmed_bismark_bt2_SE_report.txt")
    threads:
        3
    resources:
        mem_mb=14000
    shell:
        # map R2 with normal SE mode
        "bismark {bismark_reference} {unmapped_param_str} --bowtie2 {input} "
        "-o bam/ --temp_dir bam/"

# filter bam
# keep only alignments with MAPQ >= 10
rule filter_r1_bam:
    input:
        "bam/{cell_id}-R1.trimmed_bismark_bt2.bam"
    output:
        temp("bam/{cell_id}-R1.trimmed_bismark_bt2.filter.bam")
    shell:
        "samtools view -b -h -q 10 -o {output} {input}"

rule filter_r2_bam:
    input:
        "bam/{cell_id}-R2.trimmed_bismark_bt2.bam"
    output:
        temp("bam/{cell_id}-R2.trimmed_bismark_bt2.filter.bam")
    shell:
        "samtools view -b -h -q 10 -o {output} {input}"

# sort bam
# coordinate sort is required by picard MarkDuplicates below
rule sort_r1_bam:
    input:
        "bam/{cell_id}-R1.trimmed_bismark_bt2.filter.bam"
    output:
        temp("bam/{cell_id}-R1.trimmed_bismark_bt2.sorted.bam")
    resources:
        mem_mb=1000
    shell:
        "samtools sort -o {output} {input}"

rule sort_r2_bam:
    input:
        "bam/{cell_id}-R2.trimmed_bismark_bt2.filter.bam"
    output:
        temp("bam/{cell_id}-R2.trimmed_bismark_bt2.sorted.bam")
    resources:
        mem_mb=1000
    shell:
        "samtools sort -o {output} {input}"

# remove PCR duplicates
rule dedup_r1_bam:
    input:
        "bam/{cell_id}-R1.trimmed_bismark_bt2.sorted.bam"
    output:
        bam=temp("bam/{cell_id}-R1.trimmed_bismark_bt2.deduped.bam"),
        stats=temp("bam/{cell_id}-R1.trimmed_bismark_bt2.deduped.matrix.txt")
    resources:
        mem_mb=1000
    shell:
        "picard MarkDuplicates I={input} O={output.bam} M={output.stats} "
        "REMOVE_DUPLICATES=true TMP_DIR=bam/temp/"

rule dedup_r2_bam:
    input:
        "bam/{cell_id}-R2.trimmed_bismark_bt2.sorted.bam"
    output:
        bam=temp("bam/{cell_id}-R2.trimmed_bismark_bt2.deduped.bam"),
        stats=temp("bam/{cell_id}-R2.trimmed_bismark_bt2.deduped.matrix.txt")
    resources:
        mem_mb=1000
    shell:
        "picard MarkDuplicates I={input} O={output.bam} M={output.stats} "
        "REMOVE_DUPLICATES=true TMP_DIR=bam/temp/"

# merge R1 and R2, get final bam
rule merge_bam:
    input:
        "bam/{cell_id}-R1.trimmed_bismark_bt2.deduped.bam",
        "bam/{cell_id}-R2.trimmed_bismark_bt2.deduped.bam"
    output:
        "bam/{cell_id}.final.bam"
    shell:
        "samtools merge -f {output} {input}"

# generate ALLC
# per-cell methylation table from the merged bam
rule allc:
    input:
        "bam/{cell_id}.final.bam"
    output:
        allc="allc/{cell_id}.allc.tsv.gz",
        stats=temp("allc/{cell_id}.allc.tsv.gz.count.csv")
    threads:
        2
    resources:
        mem_mb=500
    shell:
        'allcools bam-to-allc '
        '--bam_path {input} '
        '--reference_fasta {reference_fasta} '
        '--output_path {output.allc} '
        '--cpu 1 '
        '--num_upstr_bases {num_upstr_bases} '
        '--num_downstr_bases {num_downstr_bases} '
        '--compress_level {compress_level} '
        '--save_count_df'


# CGN extraction from ALLC
# strand-merged CG (or HCG for NOMe) context table per cell
rule cgn_extraction:
    input:
        "allc/{cell_id}.allc.tsv.gz",
    output:
        "allc-{mcg_context}/{cell_id}.{mcg_context}-Merge.allc.tsv.gz",
    params:
        prefix="allc-{mcg_context}/{cell_id}",
    threads:
        1
    resources:
        mem_mb=100
    shell:
        'allcools extract-allc '
        '--strandness merge '
        '--allc_path {input} '
        '--output_prefix {params.prefix} '
        '--mc_contexts {mcg_context} '
        '--chrom_size_path {chrom_size_path} '
hisat3n_dna_ref are not specified.') 33 | hisat3n_rna_ref = pathlib.Path(hisat3n_rna_ref).absolute() 34 | hisat3n_dna_ref = pathlib.Path(hisat3n_dna_ref).absolute() 35 | 36 | if mode == 'mct': 37 | if star_ref is None: 38 | if hisat3n_rna_ref is None: 39 | raise ValueError('star_ref or hisat3n_rna_ref is required if mode is mct.') 40 | else: 41 | star_ref = pathlib.Path(star_ref).absolute() 42 | if gtf is None: 43 | raise ValueError('gtf must be provided when mode is mct.') 44 | gtf = pathlib.Path(gtf).absolute() 45 | 46 | if chrom_size_path is None: 47 | raise ValueError('chrom_size_path must be provided.') 48 | chrom_size_path = pathlib.Path(chrom_size_path).absolute() 49 | 50 | if mode == 'm3c': 51 | pass 52 | 53 | if mode == '4m': 54 | if (star_ref is None) and (hisat3n_rna_ref is None): 55 | raise ValueError('star_ref or hisat3n_rna_ref is required if mode is mct.') 56 | star_ref = pathlib.Path(star_ref).absolute() 57 | 58 | if gtf is None: 59 | raise ValueError('gtf must be provided when mode is mct.') 60 | gtf = pathlib.Path(gtf).absolute() 61 | 62 | genome_fasta = pathlib.Path(genome_fasta).absolute() 63 | 64 | if mode == 'mc': 65 | if nome: 66 | config_path = PACKAGE_DIR / 'files/default_config/mapping_config_nome.ini' 67 | else: 68 | config_path = PACKAGE_DIR / 'files/default_config/mapping_config_mc.ini' 69 | with open(config_path) as f: 70 | config_content = f.read() 71 | elif mode == 'mct': 72 | if nome: 73 | config_path = PACKAGE_DIR / 'files/default_config/mapping_config_mct-nome.ini' 74 | else: 75 | config_path = PACKAGE_DIR / 'files/default_config/mapping_config_mct.ini' 76 | with open(config_path) as f: 77 | config_content = f.read() 78 | if hisat3n_rna_ref is None: 79 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR', str(star_ref)) 80 | else: 81 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE', 82 | str(hisat3n_rna_ref)) 83 | config_content = 
config_content.replace('CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF', str(gtf)) 84 | elif mode == 'm3c': 85 | config_path = PACKAGE_DIR / 'files/default_config/mapping_config_m3c.ini' 86 | with open(config_path) as f: 87 | config_content = f.read() 88 | elif mode == '4m': 89 | config_path = PACKAGE_DIR / 'files/default_config/mapping_config_4m.ini' 90 | with open(config_path) as f: 91 | config_content = f.read() 92 | if hisat3n_rna_ref is None: 93 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR', str(star_ref)) 94 | else: 95 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE', 96 | str(hisat3n_rna_ref)) 97 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF', str(gtf)) 98 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH', str(chrom_size_path)) 99 | else: 100 | raise 101 | 102 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH', str(chrom_size_path)) 103 | config_content = config_content.replace('USE_CORRECT_BARCODE_VERSION_HERE', barcode_version) 104 | if hisat3n_dna_ref is None: 105 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR', str(bismark_ref)) 106 | else: 107 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE', str(hisat3n_dna_ref)) 108 | config_content = config_content.replace('CHANGE_THIS_TO_YOUR_REFERENCE_FASTA', str(genome_fasta)) 109 | print(config_content) 110 | return 111 | -------------------------------------------------------------------------------- /cemba_data/mapping/mct/__init__.py: -------------------------------------------------------------------------------- 1 | from .mct_bismark_bam_filter import select_dna_reads 2 | from .mct_star_bam_filter import select_rna_reads 3 | -------------------------------------------------------------------------------- /cemba_data/mapping/mct/mct_bismark_bam_filter.py: 
-------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import re 3 | import pysam 4 | import pandas as pd 5 | 6 | 7 | def read_mc_level(read, frac=True, nome=False): 8 | bismark_tag = read.get_tag('XM') 9 | if nome: 10 | m_c = 0 11 | normal_c = 0 12 | seq = read.seq.upper() 13 | read_length = len(seq) 14 | for pos, xm_base in enumerate(bismark_tag): 15 | if xm_base in '.ZzUu': 16 | # skip unrelated base (.), CpG (Zz), CpUnknown (Uu) 17 | continue 18 | # Skip GpC 19 | try: 20 | if read.is_reverse: 21 | if (pos == read_length) or (read.seq[pos + 1] == 'C'): 22 | continue 23 | else: 24 | if (pos == 0) or (read.seq[pos - 1] == 'G'): 25 | continue 26 | except IndexError: 27 | # start or end of the read 28 | continue 29 | if xm_base in 'xh': 30 | normal_c += 1 31 | elif xm_base in 'XH': 32 | m_c += 1 33 | else: 34 | pass 35 | else: 36 | m_c = bismark_tag.count('X') + bismark_tag.count('H') 37 | normal_c = bismark_tag.count('x') + bismark_tag.count('h') 38 | 39 | total_c = m_c + normal_c 40 | if total_c == 0: 41 | return 0, 0 42 | else: 43 | if frac: 44 | read_mc_rate = m_c / total_c 45 | return read_mc_rate, total_c 46 | else: 47 | return m_c, total_c 48 | 49 | 50 | def select_dna_reads_normal(input_bam, 51 | output_bam, 52 | mc_rate_max_threshold=0.5, 53 | cov_min_threshold=3, 54 | nome=False): 55 | read_profile_dict = defaultdict(int) 56 | # init dict to make sure the series has something 57 | read_profile_dict[(50, 50)] = 0 58 | with pysam.AlignmentFile(input_bam) as f: 59 | with pysam.AlignmentFile(output_bam, header=f.header, 60 | mode='wb') as out_f: 61 | for read in f: 62 | mc_frac, cov = read_mc_level(read, nome=nome) 63 | read_profile_dict[(int(100 * mc_frac), cov)] += 1 64 | 65 | # split reads 66 | if (mc_frac > mc_rate_max_threshold) or (cov < 67 | cov_min_threshold): 68 | continue 69 | out_f.write(read) 70 | with open(str(output_bam) + '.reads_profile.csv', 'w') as stat_f: 71 | 
stat_f.write('mc_frac,cov,count\n') 72 | for (mc_frac, cov), count in read_profile_dict.items(): 73 | stat_f.write(f'{mc_frac},{cov},{count}\n') 74 | return 75 | 76 | 77 | def select_dna_reads_split_reads(input_bam, 78 | output_bam, 79 | mc_rate_max_threshold=0.5, 80 | cov_min_threshold=3, 81 | nome=False): 82 | splited_read_name_pattern = re.compile('.+-[lrm]$') 83 | 84 | # first pass: determine read methylation level 85 | read_level_mcs = defaultdict(int) 86 | read_level_covs = defaultdict(int) 87 | with pysam.AlignmentFile(input_bam) as f: 88 | for read in f: 89 | mc, cov = read_mc_level(read, frac=False, nome=nome) 90 | read_name = read.qname 91 | if splited_read_name_pattern.search(read_name): 92 | read_level_mcs[read_name[:-2]] += mc 93 | read_level_covs[read_name[:-2]] += cov 94 | else: 95 | read_level_mcs[read_name] += mc 96 | read_level_covs[read_name] += cov 97 | read_level_data = pd.DataFrame({ 98 | 'mc': read_level_mcs, 99 | 'cov': read_level_covs 100 | }) 101 | read_level_data['mc_frac'] = read_level_data['mc'] / (read_level_data['cov'] + 102 | 0.001) 103 | read_level_data['mc_frac'] = (read_level_data['mc_frac'] * 100).astype(int) 104 | if read_level_data.shape[0] == 0: 105 | # in case there is no read at all: 106 | with open(f'{output_bam}.reads_profile.csv', 'w') as f: 107 | f.write('mc_frac,cov,count\n') 108 | f.write('0,1,0\n') 109 | else: 110 | profile = read_level_data.groupby('mc_frac')['cov'].value_counts() 111 | profile.name = 'count' 112 | profile = profile.reset_index() 113 | profile.to_csv(f'{output_bam}.reads_profile.csv', index=None) 114 | 115 | # filter reads 116 | use_reads = read_level_data[ 117 | (read_level_data['mc_frac'] < mc_rate_max_threshold) 118 | & (read_level_data['cov'] >= cov_min_threshold)].index.tolist() 119 | use_reads = set(use_reads) 120 | del read_level_data 121 | 122 | # second pass: write passed reads 123 | with pysam.AlignmentFile(input_bam) as f: 124 | with pysam.AlignmentFile(output_bam, header=f.header, 125 | 
mode='wb') as out_f: 126 | for read in f: 127 | read_name = read.qname 128 | if (read_name in use_reads) or (read_name[:-2] in use_reads): 129 | # read name or read name without suffix 130 | out_f.write(read) 131 | return 132 | 133 | 134 | def select_dna_reads(input_bam, 135 | output_bam, 136 | mc_rate_max_threshold=0.5, 137 | cov_min_threshold=3, 138 | nome=False, 139 | assay_type='mc'): 140 | if assay_type == 'mc': 141 | select_dna_reads_normal(input_bam, 142 | output_bam, 143 | mc_rate_max_threshold=mc_rate_max_threshold, 144 | cov_min_threshold=cov_min_threshold, 145 | nome=nome) 146 | elif assay_type == 'm3c': 147 | select_dna_reads_split_reads(input_bam, 148 | output_bam, 149 | mc_rate_max_threshold=mc_rate_max_threshold, 150 | cov_min_threshold=cov_min_threshold, 151 | nome=nome) 152 | else: 153 | raise ValueError(f'Unknown assay_type {assay_type}.') 154 | return 155 | -------------------------------------------------------------------------------- /cemba_data/mapping/pipelines/_4m.py: -------------------------------------------------------------------------------- 1 | def _4m_config_str(config): 2 | """Change the dtype of parameters and make a appropriate string""" 3 | int_parameters = { 4 | 'overlap': 6, 5 | 'r1_left_cut': 10, 6 | 'r1_right_cut': 10, 7 | 'r2_left_cut': 10, 8 | 'r2_right_cut': 10, 9 | 'quality_threshold': 20, 10 | 'length_threshold': 30, 11 | 'total_read_pairs_min': 1, 12 | 'total_read_pairs_max': 6000000, 13 | 'mapq_threshold': 10, 14 | 'num_upstr_bases': 0, 15 | 'num_downstr_bases': 2, 16 | 'compress_level': 5, 17 | 'dna_cov_min_threshold': 3, 18 | 'rna_cov_min_threshold': 3, 19 | 'split_left_size': 40, 20 | 'split_right_size': 40, 21 | 'split_middle_min_size': 30, 22 | 'min_gap': 2500, 23 | 'trim_on_both_end': 5 24 | } 25 | 26 | float_parameters = { 27 | 'mc_rate_max_threshold': 0.5, 28 | 'mc_rate_min_threshold': 0.9 29 | } 30 | 31 | str_parameters = { 32 | 'mode': 'mc', 33 | 'barcode_version': 'required', 34 | 'r1_adapter': 
'AGATCGGAAGAGCACACGTCTGAAC', 35 | 'r2_adapter': 'AGATCGGAAGAGCGTCGTGTAGGGA', 36 | 'bismark_reference': 'required', 37 | 'reference_fasta': 'required', 38 | 'star_reference': 'required', 39 | 'hisat3n_dna_reference': 'required', 40 | 'hisat3n_rna_reference': 'required', 41 | 'hisat3n_repeat_index_type': 'no-repeat', 42 | 'gtf_path': 'required', 43 | 'feature_type': 'gene', 44 | 'id_type': 'gene_id', 45 | 'mc_stat_feature': 'CHN CGN CCC', 46 | 'mc_stat_alias': 'mCH mCG mCCC', 47 | 'chrom_size_path': 'required', 48 | 'nome_flag_str': '--nome' 49 | } 50 | if 'hisat3n_dna_reference' in config: 51 | del str_parameters['bismark_reference'] 52 | del str_parameters['star_reference'] 53 | 54 | typed_config = {} 55 | for k, default in int_parameters.items(): 56 | if k in config: 57 | typed_config[k] = int(config[k]) 58 | else: 59 | if default != 'required': 60 | typed_config[k] = default 61 | else: 62 | raise ValueError(f'Required parameter {k} not found in config. ' 63 | f'You can print the newest mapping config template via "yap default-mapping-config".') 64 | 65 | for k, default in float_parameters.items(): 66 | if k in config: 67 | typed_config[k] = float(config[k]) 68 | else: 69 | if default != 'required': 70 | typed_config[k] = default 71 | else: 72 | raise ValueError(f'Required parameter {k} not found in config.') 73 | 74 | for k, default in str_parameters.items(): 75 | if k in config: 76 | typed_config[k] = f"'{config[k]}'" 77 | else: 78 | if default != 'required': 79 | typed_config[k] = f"'{default}'" 80 | else: 81 | raise ValueError(f'Required parameter {k} not found in config. 
' 82 | f'You can print the newest mapping config template via "yap default-mapping-config".') 83 | 84 | config_str = "" 85 | for k, v in typed_config.items(): 86 | config_str += f"{k} = {v}\n" 87 | return config_str 88 | -------------------------------------------------------------------------------- /cemba_data/mapping/pipelines/m3c.py: -------------------------------------------------------------------------------- 1 | def m3c_config_str(config): 2 | """Change the dtype of parameters and make a appropriate string""" 3 | int_parameters = { 4 | 'overlap': 6, 5 | 'r1_left_cut': 10, 6 | 'r1_right_cut': 10, 7 | 'r2_left_cut': 10, 8 | 'r2_right_cut': 10, 9 | 'quality_threshold': 20, 10 | 'length_threshold': 30, 11 | 'total_read_pairs_min': 1, 12 | 'total_read_pairs_max': 6000000, 13 | 'mapq_threshold': 10, 14 | 'num_upstr_bases': 0, 15 | 'num_downstr_bases': 2, 16 | 'compress_level': 5, 17 | 'split_left_size': 40, 18 | 'split_right_size': 40, 19 | 'split_middle_min_size': 30, 20 | 'min_gap': 2500, 21 | 'trim_on_both_end': 5 22 | } 23 | 24 | str_parameters = { 25 | 'mode': 'mc', 26 | 'barcode_version': 'required', 27 | 'r1_adapter': 'AGATCGGAAGAGCACACGTCTGAAC', 28 | 'r2_adapter': 'AGATCGGAAGAGCGTCGTGTAGGGA', 29 | 'bismark_reference': 'required', 30 | 'hisat3n_dna_reference': 'required', 31 | 'hisat3n_repeat_index_type': 'no-repeat', 32 | 'reference_fasta': 'required', 33 | 'mc_stat_feature': 'CHN CGN CCC', 34 | 'mc_stat_alias': 'mCH mCG mCCC', 35 | 'chrom_size_path': 'required' 36 | } 37 | if 'hisat3n_dna_reference' in config: 38 | del str_parameters['bismark_reference'] 39 | else: 40 | del str_parameters['hisat3n_dna_reference'] 41 | del str_parameters['hisat3n_repeat_index_type'] 42 | 43 | typed_config = {} 44 | for k, default in int_parameters.items(): 45 | if k in config: 46 | typed_config[k] = int(config[k]) 47 | else: 48 | if default != 'required': 49 | typed_config[k] = default 50 | else: 51 | raise ValueError(f'Required parameter {k} not found in config. 
# ======================================================================
# cemba_data/mapping/pipelines/mc.py
# ======================================================================
def mc_config_str(config):
    """Render the snmC mapping parameters as a ``key = value`` config string.

    Each known parameter is coerced to its expected type; missing keys fall
    back to the defaults below, and keys marked 'required' raise ValueError
    when absent from ``config``.
    """
    int_defaults = {
        'overlap': 6,
        'r1_left_cut': 10,
        'r1_right_cut': 10,
        'r2_left_cut': 10,
        'r2_right_cut': 10,
        'quality_threshold': 20,
        'length_threshold': 30,
        'total_read_pairs_min': 1,
        'total_read_pairs_max': 6000000,
        'mapq_threshold': 10,
        'num_upstr_bases': 0,
        'num_downstr_bases': 2,
        'compress_level': 5
    }

    bool_defaults = {'unmapped_fastq': False}

    str_defaults = {
        'mode': 'mc',
        'barcode_version': 'required',
        'r1_adapter': 'AGATCGGAAGAGCACACGTCTGAAC',
        'r2_adapter': 'AGATCGGAAGAGCGTCGTGTAGGGA',
        'bismark_reference': 'required',
        'hisat3n_dna_reference': 'required',
        'hisat3n_repeat_index_type': 'no-repeat',
        'reference_fasta': 'required',
        'chrom_size_path': 'required',
        'mc_stat_feature': 'CHN CGN CCC',
        'mc_stat_alias': 'mCH mCG mCCC'
    }
    # Keep only the aligner family that the config actually uses.
    if 'hisat3n_dna_reference' in config:
        str_defaults.pop('bismark_reference')
    else:
        str_defaults.pop('hisat3n_dna_reference')
        str_defaults.pop('hisat3n_repeat_index_type')

    typed = {}
    for key, default in int_defaults.items():
        if key in config:
            typed[key] = int(config[key])
        elif default != 'required':
            typed[key] = default
        else:
            raise ValueError(f'Required parameter {key} not found in config.')

    for key, default in bool_defaults.items():
        if key in config:
            # ini values are strings; anything starting with 't'/'T' is True
            typed[key] = config[key].lower().startswith('t')
        elif default != 'required':
            typed[key] = default
        else:
            raise ValueError(f'Required parameter {key} not found in config. '
                             f'You can print the newest mapping config template via "yap default-mapping-config".')
    # judge unmapped_fastq specifically: it becomes a bismark CLI flag
    typed['unmapped_param_str'] = "'--un'" if typed['unmapped_fastq'] else "''"

    for key, default in str_defaults.items():
        if key in config:
            typed[key] = f"'{config[key]}'"
        elif default != 'required':
            typed[key] = f"'{default}'"
        else:
            raise ValueError(f'Required parameter {key} not found in config. '
                             f'You can print the newest mapping config template via "yap default-mapping-config".')

    return ''.join(f'{key} = {value}\n' for key, value in typed.items())

# ======================================================================
# cemba_data/mapping/pipelines/mct.py
# ======================================================================
def mct_config_str(config):
    """Render the snmCT mapping parameters as a ``key = value`` config string.

    Same handling pattern as ``mc_config_str`` with extra RNA-related
    (float/str) parameters for the transcriptome part of the assay.
    """
    int_defaults = {
        'overlap': 6,
        'r1_left_cut': 10,
        'r1_right_cut': 10,
        'r2_left_cut': 10,
        'r2_right_cut': 10,
        'quality_threshold': 20,
        'length_threshold': 30,
        'total_read_pairs_min': 1,
        'total_read_pairs_max': 6000000,
        'mapq_threshold': 10,
        'num_upstr_bases': 0,
        'num_downstr_bases': 2,
        'compress_level': 5,
        'dna_cov_min_threshold': 3,
        'rna_cov_min_threshold': 3
    }

    float_defaults = {
        'mc_rate_max_threshold': 0.5,
        'mc_rate_min_threshold': 0.9
    }
    bool_defaults = {'unmapped_fastq': False}

    str_defaults = {
        'mode': 'mc',
        'barcode_version': 'required',
        'r1_adapter': 'AGATCGGAAGAGCACACGTCTGAAC',
        'r2_adapter': 'AGATCGGAAGAGCGTCGTGTAGGGA',
        'bismark_reference': 'required',
        'hisat3n_dna_reference': 'required',
        'hisat3n_rna_reference': 'required',
        'hisat3n_repeat_index_type': 'no-repeat',
        'reference_fasta': 'required',
        'star_reference': 'required',
        'gtf_path': 'required',
        'feature_type': 'gene',
        'id_type': 'gene_id',
        'nome_flag_str': 'required'
    }
    # Keep only the aligner family that the config actually uses.
    if 'hisat3n_dna_reference' in config:
        str_defaults.pop('bismark_reference')
        str_defaults.pop('star_reference')
    else:
        str_defaults.pop('hisat3n_dna_reference')
        str_defaults.pop('hisat3n_rna_reference')
        str_defaults.pop('hisat3n_repeat_index_type')

    typed = {}
    for key, default in int_defaults.items():
        if key in config:
            typed[key] = int(config[key])
        elif default != 'required':
            typed[key] = default
        else:
            raise ValueError(f'Required parameter {key} not found in config.')

    for key, default in float_defaults.items():
        if key in config:
            typed[key] = float(config[key])
        elif default != 'required':
            typed[key] = default
        else:
            raise ValueError(f'Required parameter {key} not found in config.')

    for key, default in bool_defaults.items():
        if key in config:
            # ini values are strings; anything starting with 't'/'T' is True
            typed[key] = config[key].lower().startswith('t')
        elif default != 'required':
            typed[key] = default
        else:
            raise ValueError(f'Required parameter {key} not found in config. '
                             f'You can print the newest mapping config template via "yap default-mapping-config".')
    # judge unmapped_fastq specifically: it becomes a bismark CLI flag
    typed['unmapped_param_str'] = "'--un'" if typed['unmapped_fastq'] else "''"

    for key, default in str_defaults.items():
        if key in config:
            typed[key] = f"'{config[key]}'"
        elif default != 'required':
            typed[key] = f"'{default}'"
        else:
            raise ValueError(f'Required parameter {key} not found in config. '
                             f'You can print the newest mapping config template via "yap default-mapping-config".')

    return ''.join(f'{key} = {value}\n' for key, value in typed.items())
# ======================================================================
# cemba_data/mapping/stats/_4m.py
# ======================================================================
import pathlib

import pandas as pd
import pysam

from .mc import mc_mapping_stats
from .mct import _count_reads_by_rg_in_star_bam, \
    summary_rna_mapping, \
    summarize_select_dna_reads, \
    aggregate_feature_counts
from .m3c import m3c_mapping_stats


def _4m_mapping_stats(output_dir, config):
    """this may apply to single UID dir, so config is provided as parameter"""
    # 4m = snm3C stats + DNA-read selection stats + RNA mapping stats,
    # concatenated cell-wise (axis=1).
    parts = [
        m3c_mapping_stats(output_dir, config),
        summarize_select_dna_reads(output_dir, config),
        summary_rna_mapping(output_dir),
    ]
    return pd.concat(parts, axis=1)


def _4m_additional_cols(final_df, output_dir):
    """Derive per-cell summary columns (rates, ratios, yields) from the raw
    4m mapping counts; returns a new DataFrame, input is not mutated."""
    final_df = final_df.copy()
    final_df['CellInputReadPairs'] = final_df['R1InputReads'].astype(int)
    # plate info might not exist if the cell name is abnormal
    if 'PCRIndex' in final_df.columns:
        per_index = [
            grp['CellInputReadPairs'] / grp['CellInputReadPairs'].sum()
            for _, grp in final_df.groupby('PCRIndex')
        ]
        final_df['CellBarcodeRatio'] = pd.concat(per_index)

    # snm3C part
    final_df['FinalmCReads'] = final_df['R1DeduppedReads'] + final_df['R2DeduppedReads']
    # use % to be consistent with others
    for rt in ('R1', 'R2'):
        final_df[f'{rt}MappingRate'] = \
            final_df[f'{rt}UniqueMappedReads'] / final_df[f'{rt}TrimmedReads'] * 100
    for rt in ('R1', 'R2'):
        final_df[f'{rt}DuplicationRate'] = \
            (1 - final_df[f'{rt}DeduppedReads'] / final_df[f'{rt}UniqueMappedReads']) * 100
    contact_cols = ['CisShortContact', 'CisLongContact', 'TransContact']
    final_df['TotalContacts'] = final_df[contact_cols].sum(axis=1)
    final_df['CisShortRatio'] = final_df['CisShortContact'] / final_df['TotalContacts']
    final_df['CisLongRatio'] = final_df['CisLongContact'] / final_df['TotalContacts']
    final_df['TransRatio'] = final_df['TransContact'] / final_df['TotalContacts']

    # snmCT part
    stats = pd.read_hdf(output_dir / 'TotalRNAData.h5', key='stats')
    final_df['GenesDetected'] = stats['GenesDetected']
    # calculate some mCT specific ratios
    final_df['DNAReadsYield'] = final_df['FinalDNAReads'] / (
            final_df['CellInputReadPairs'] * 2)
    final_df['RNAReadsYield'] = final_df['FinalRNAReads'] / final_df[
        'CellInputReadPairs']
    final_df['RNA/(DNA+RNA)'] = final_df['FinalRNAReads'].fillna(0) / (
            final_df['R1DeduppedReads'].fillna(0) + 1)
    return final_df
# ======================================================================
# cemba_data/mapping/stats/__init__.py
# ======================================================================
import pathlib
import subprocess

import pandas as pd
from papermill import execute_notebook, PapermillExecutionError

from .m3c import m3c_mapping_stats, m3c_additional_cols
from .mc import mc_mapping_stats, mc_additional_cols
from .mct import mct_mapping_stats, mct_additional_cols
from ._4m import _4m_mapping_stats, _4m_additional_cols
from .plate_info import get_plate_info
from ..pipelines import PACKAGE_DIR
from ...utilities import get_configuration


def mapping_stats(output_dir):
    """This is UID level mapping summary, the config file is in parent dir"""
    output_dir = pathlib.Path(output_dir).absolute()
    config = get_configuration(output_dir.parent / 'mapping_config.ini')
    mode = config['mode']

    # pick the per-technology stats collector
    dispatch = {'mc': mc_mapping_stats,
                'mct': mct_mapping_stats,
                'm3c': m3c_mapping_stats,
                '4m': _4m_mapping_stats}
    if mode not in dispatch:
        raise ValueError
    final_df = dispatch[mode](output_dir, config)

    # plate info, which is tech independent.
    _plate_info = get_plate_info(final_df.index, barcode_version=config['barcode_version'])
    final_df = pd.concat([_plate_info, final_df], axis=1)

    # save
    final_df.to_csv(output_dir / 'MappingSummary.csv.gz')
    return


def final_summary(output_dir, cleanup=True, notebook=None):
    """Aggregate per-UID mapping summaries into one table, execute the
    summary plotting notebook and (optionally) delete snakemake leftovers.

    ``output_dir`` is the pipeline root containing one sub dir per UID.
    """
    output_dir = pathlib.Path(output_dir).absolute()
    mode = get_configuration(output_dir / 'mapping_config.ini')['mode']
    paths_to_delete = []

    # Before running summary,
    # first make sure all the UID dir having Snakefile also has mapping
    # summary (means successful)
    summary_paths = []
    missing_summary_dirs = []
    for snakefile_path in output_dir.glob('*/Snakefile'):
        candidate = snakefile_path.parent / 'MappingSummary.csv.gz'
        if candidate.exists():
            summary_paths.append(candidate)
        else:
            missing_summary_dirs.append(snakefile_path.parent)

    if missing_summary_dirs:
        print('These sub dir missing MappingSummary files:')
        for p in missing_summary_dirs:
            print(p)
        raise FileNotFoundError(f'Note that all sub dir should be successfully mapped '
                                f'before generating final summary. \n'
                                f'The MappingSummary.csv.gz is the final target file of snakefile in {snakefile_path}. \n'
                                f'Run the corresponding snakemake command again to retry mapping.\n'
                                f'The snakemake commands can be found in output_dir/snakemake/*/snakemake_cmd.txt')

    # aggregate mapping summaries
    total_mapping_summary = pd.concat(
        pd.read_csv(p, index_col=0) for p in summary_paths)
    total_mapping_summary_path = output_dir / 'stats/MappingSummary.csv.gz'

    # if this is mct, aggregate all the gene counts
    if mode in ['mct', '4m']:
        from ..stats.mct import aggregate_feature_counts
        aggregate_feature_counts(output_dir)

    # add additional columns based on some calculation
    extra_cols = {
        'mc': lambda df: mc_additional_cols(df),
        'mct': lambda df: mct_additional_cols(df, output_dir=output_dir),
        'm3c': lambda df: m3c_additional_cols(df),
        '4m': lambda df: _4m_additional_cols(df, output_dir=output_dir),
    }
    if mode in extra_cols:
        total_mapping_summary = extra_cols[mode](total_mapping_summary)
    else:
        raise

    # save total mapping summary
    total_mapping_summary.to_csv(total_mapping_summary_path)

    # add .snakemake files and bam temp dirs to deletion
    paths_to_delete.extend(output_dir.glob('*/.snakemake'))
    paths_to_delete.extend(output_dir.glob('*/bam/temp'))

    # write a ALLC path file for generating MCDS
    allc_paths = pd.Series({p.name.split('.')[0]: str(p)
                            for p in output_dir.glob('*/allc/*tsv.gz')})
    allc_paths.to_csv(output_dir / 'stats/AllcPaths.tsv', sep='\t', header=False)

    if 'Plate' in total_mapping_summary.columns:  # only run notebook when plate info exist
        # run summary notebook
        nb_path = output_dir / 'stats/MappingSummary.ipynb'
        try:
            mode = get_configuration(output_dir / 'mapping_config.ini')['mode']
            if notebook is None:
                template_notebook = PACKAGE_DIR / f'files/mapping_summary_template/{mode}_template.ipynb'
            else:
                template_notebook = str(notebook)
            print(f'Using notebook template from {template_notebook}')
            print('Executing summary plotting notebook...')
            execute_notebook(
                input_path=str(template_notebook),
                output_path=str(nb_path),
                parameters=dict(output_dir=str(output_dir))
            )
            print('Summary notebook successfully executed. Exporting HTML...')
            subprocess.run(['jupyter', 'nbconvert', '--to', 'html', str(nb_path)])
            print(f'See the summary plots here: {str(nb_path)[:-5]}html')
            print(f'Or customize the summary plots here: {nb_path}')
        except PapermillExecutionError:
            print(f'Ops, summary plotting notebook got some error, check the information in {nb_path}')
            cleanup = False

    # delete
    if cleanup:
        print('Clean up snakemake log (might take several minutes) ...')
        for p in paths_to_delete:
            subprocess.run(['rm', '-rf', str(p)], check=True)
    return

# ======================================================================
# cemba_data/mapping/stats/m3c.py
# ======================================================================
import pathlib

import pandas as pd
from pysam import AlignmentFile

from .utilities import parse_trim_fastq_stats, parse_trim_fastq_stats_mct, generate_allc_stats


def m3c_bam_unique_read_counts(bam_path, read_type_int):
    """Count distinct original read names in a BAM, collapsing the
    '_{read_type_int}:N:0:' suffix appended during demultiplexing."""
    separator = f'_{read_type_int}:N:0:'
    names = set()
    with AlignmentFile(bam_path) as bam:
        for aln in bam:
            names.add(aln.query_name.split(separator)[0])
    return len(names)


def m3c_count_bams(bam_dir, cell_id, read_type):
    """Return a Series (named by cell) with unique-mapped and dedupped read
    counts for one cell/read-type from the two_mapping BAMs."""
    bam_names = {
        f'{read_type}UniqueMappedReads': f'{cell_id}-{read_type}.two_mapping.filter.bam',
        f'{read_type}DeduppedReads': f'{cell_id}-{read_type}.two_mapping.deduped.bam',
    }
    type_int = 1 if read_type == 'R1' else 2
    read_counts = {name: m3c_bam_unique_read_counts(bam_dir / file_name, type_int)
                   for name, file_name in bam_names.items()}
    return pd.Series(read_counts, name=cell_id)
f'{read_type}DeduppedReads': bam_dir / f'{cell_id}-{read_type}.two_mapping.deduped.bam', 21 | } 22 | read_counts = {name: m3c_bam_unique_read_counts(path, 1 if read_type == 'R1' else 2) 23 | for name, path in bam_path_dict.items()} 24 | return pd.Series(read_counts, name=cell_id) 25 | 26 | 27 | def m3c_mapping_stats(output_dir, config): 28 | """this may apply to single UID dir, so config is provided as parameter""" 29 | output_dir = pathlib.Path(output_dir).absolute() 30 | fastq_dir = output_dir / 'fastq' 31 | bam_dir = output_dir / 'bam' 32 | hic_dir = output_dir / 'hic' 33 | cell_stats = [] 34 | cell_ids = [path.name.split('.')[0] 35 | for path in bam_dir.glob('*.3C.sorted.bam')] 36 | 37 | for cell_id in cell_ids: 38 | total_stats = [] # list of series 39 | for read_type in ['R1', 'R2']: 40 | # fastq reads 41 | if config['mode'] in ['4m', 'mct']: 42 | total_stats.append( 43 | parse_trim_fastq_stats_mct( 44 | fastq_dir / f'{cell_id}-{read_type}.trimmed.stats.txt')) 45 | else: 46 | total_stats.append( 47 | parse_trim_fastq_stats( 48 | fastq_dir / f'{cell_id}-{read_type}.trimmed.stats.tsv')) 49 | # bam reads 50 | total_stats.append( 51 | m3c_count_bams(bam_dir, cell_id, read_type) 52 | ) 53 | # contacts 54 | contact_counts = pd.read_csv(hic_dir / f'{cell_id}.3C.contact.tsv.counts.txt', 55 | header=None, index_col=0, squeeze=True) 56 | contact_counts.name = cell_id 57 | total_stats.append(contact_counts) 58 | 59 | cell_stats.append(pd.concat(total_stats)) 60 | total_df = pd.DataFrame(cell_stats) 61 | 62 | # add allc stats 63 | allc_df = generate_allc_stats(output_dir, config) 64 | final_df = pd.concat([total_df, allc_df], sort=True, axis=1) 65 | return final_df 66 | 67 | 68 | def m3c_additional_cols(final_df): 69 | final_df['FinalmCReads'] = final_df['R1DeduppedReads'] + final_df['R2DeduppedReads'] 70 | final_df['CellInputReadPairs'] = final_df['R1InputReads'] 71 | # use % to be consistent with others 72 | final_df['R1MappingRate'] = final_df['R1UniqueMappedReads'] / 
def mc_mapping_stats(output_dir, config):
    """Collect per-cell snmC mapping statistics for a single UID directory.

    This may apply to a single UID dir, so config is provided as a parameter.

    Parameters
    ----------
    output_dir
        UID-level output directory containing fastq/, bam/ and allc/ sub-dirs.
    config
        Mapping config dict; config['mode'] selects which trimming-stats
        parser applies ('4m'/'mct' write a different stats file format).

    Returns
    -------
    pd.DataFrame
        One row per cell (index name 'cell_id') combining trimming, bismark,
        deduplication and ALLC statistics.
    """
    output_dir = pathlib.Path(output_dir).absolute()
    fastq_dir = output_dir / 'fastq'
    bam_dir = output_dir / 'bam'
    allc_dir = output_dir / 'allc'
    cell_stats = []
    # cells are identified by their final ALLC files
    # (plain string pattern: the old f-string had no placeholder)
    cell_ids = [path.name.split('.')[0]
                for path in allc_dir.glob('*.allc.tsv.gz')]

    for cell_id in cell_ids:
        print(f'Parsing stats of {cell_id}.')
        total_stats = []
        for read_type in ['R1', 'R2']:
            # fastq trimming stats
            if config['mode'] in ['4m', 'mct']:
                total_stats.append(
                    parse_trim_fastq_stats_mct(
                        fastq_dir / f'{cell_id}-{read_type}.trimmed.stats.txt'))
            else:
                total_stats.append(
                    parse_trim_fastq_stats(
                        fastq_dir / f'{cell_id}-{read_type}.trimmed.stats.tsv'))
            # bismark single-end mapping report
            total_stats.append(
                parse_bismark_report(
                    bam_dir / f'{cell_id}-{read_type}.trimmed_bismark_bt2_SE_report.txt'))
            # picard deduplication metrics
            total_stats.append(
                parse_deduplicate_stat(
                    bam_dir / f'{cell_id}-{read_type}.trimmed_bismark_bt2.deduped.matrix.txt'
                ))
        cell_stats.append(pd.concat(total_stats))
    mapping_df = pd.DataFrame(cell_stats)
    mapping_df.index.name = 'cell_id'

    # add allc stats
    allc_df = generate_allc_stats(output_dir, config)
    final_df = pd.concat([mapping_df, allc_df], sort=True, axis=1)
    return final_df
def summary_rna_mapping(output_dir):
    """Summarize per-cell RNA mapping and feature-counting results.

    Combines the per-cell STAR-mapped read counts (from read groups in the
    filtered bam) with the featureCounts summary table into one stats frame
    indexed by cell_id.
    """
    output_dir = pathlib.Path(output_dir)

    # per-cell read counts from STAR output, before mC-rate filtering
    star_reads = _count_reads_by_rg_in_star_bam(
        output_dir / 'rna_bam/TotalRNAAligned.filtered.bam')

    # featureCounts summary: one column per cell; transpose so rows are cells
    summary_path = output_dir / 'rna_bam/TotalRNAAligned.rna_reads.feature_count.tsv.summary'
    count_summary = pd.read_csv(summary_path, sep='\t', index_col=0).T
    # column headers look like '<prefix>:<cell_id>'; keep only the cell id
    count_summary.index = count_summary.index.map(lambda name: name.split(':')[-1])

    rna_stat = count_summary[['Assigned']].copy()
    rna_stat['FinalRNAReads'] = count_summary.sum(axis=1)
    rna_stat.columns = ['FinalCountedReads', 'FinalRNAReads']

    rna_stat['RNAUniqueMappedReads'] = star_reads
    rna_stat['SelectedRNAReadsRatio'] = (
        rna_stat['FinalRNAReads'] / rna_stat['RNAUniqueMappedReads'])
    rna_stat.index.name = 'cell_id'
    return rna_stat
def mct_mapping_stats(output_dir, config):
    """Assemble the full snmCT stats table for one UID directory.

    This may apply to a single UID dir, so config is provided as a parameter.
    Concatenates the methylation mapping stats, the selected-DNA-read stats
    and the RNA mapping summary column-wise into one per-cell DataFrame.
    """
    parts = [
        mc_mapping_stats(output_dir, config),            # mC mapping stats
        summarize_select_dna_reads(output_dir, config),  # DNA read selection
        summary_rna_mapping(output_dir),                 # RNA mapping summary
    ]
    return pd.concat(parts, axis=1)
def mct_additional_cols(final_df, output_dir):
    """Add mCT-specific summary columns to the final stats table.

    Parameters
    ----------
    final_df
        Per-cell mapping summary (rows are cell ids).
    output_dir
        Directory containing TotalRNAData.h5 (written by
        aggregate_feature_counts). str or Path.

    Returns
    -------
    pd.DataFrame
        Copy of final_df with barcode-ratio, gene-count and yield columns.
    """
    final_df = final_df.copy()
    # robustness: accept str as well as Path for output_dir
    output_dir = pathlib.Path(output_dir)

    final_df['CellInputReadPairs'] = final_df['R1InputReads'].astype(int)  # == final_df['R2InputReads']
    if 'PCRIndex' in final_df.columns:  # plate info might not exist if the cell name is abnormal
        # each cell's share of input reads within its PCR index group
        cell_barcode_ratio = pd.concat([(i['CellInputReadPairs'] / i['CellInputReadPairs'].sum())
                                        for _, i in final_df.groupby('PCRIndex')])
        final_df['CellBarcodeRatio'] = cell_barcode_ratio

    stats = pd.read_hdf(output_dir / 'TotalRNAData.h5', key='stats')
    final_df['GenesDetected'] = stats['GenesDetected']

    # calculate some mCT specific ratios
    final_df['DNAReadsYield'] = final_df['FinalDNAReads'] / (
            final_df['CellInputReadPairs'] * 2)
    final_df['RNAReadsYield'] = final_df['FinalRNAReads'] / final_df[
        'CellInputReadPairs']
    # NOTE(review): denominator is R1FinalBismarkReads + 1, apparently a
    # proxy for DNA reads that also avoids division by zero — confirm this
    # matches the intended 'RNA/(DNA+RNA)' definition
    final_df['RNA/(DNA+RNA)'] = final_df['FinalRNAReads'].fillna(0) / (
            final_df['R1FinalBismarkReads'].fillna(0) + 1)
    return final_df
col96 + ad_index_384_dict[random_index][0] 25 | row384 = 2 * row96 + ad_index_384_dict[random_index][1] 26 | record = pd.Series({ 27 | 'Plate': plate, 28 | 'PCRIndex': pcr_index, 29 | 'RandomIndex': random_index, 30 | 'Col384': col384, 31 | 'Row384': row384 32 | }) 33 | return record 34 | 35 | 36 | def _parse_cell_id_v2(cell_id): 37 | plate, multiplex_group, pcr_index, random_index = cell_id.split('-') 38 | # 384 pos 39 | col384 = int(random_index[1:]) - 1 40 | row384 = ord(random_index[0]) - 65 # convert A-P to 0-23 41 | record = pd.Series({ 42 | 'Plate': plate, 43 | 'PCRIndex': pcr_index, 44 | 'MultiplexGroup': multiplex_group, 45 | 'RandomIndex': random_index, 46 | 'Col384': col384, 47 | 'Row384': row384 48 | }) 49 | return record 50 | 51 | 52 | def get_plate_info(cell_ids, barcode_version): 53 | if barcode_version == 'V1': 54 | func = _parse_cell_id_v1 55 | else: 56 | func = _parse_cell_id_v2 57 | try: 58 | plate_info = pd.DataFrame([func(cell_id) for cell_id in cell_ids], 59 | index=cell_ids) 60 | except Exception: 61 | print('Errors occur during parsing the plate info, this happens ' 62 | 'when the input FASTQ file name is not generated by yap. ' 63 | 'The `yap summary` also can not generate html report due to missing the plate info. ' 64 | 'In this case, you need to add the plateinfo by yourself in order to make the plate view plots. 
def plot_on_plate(data,
                  hue,
                  groupby,
                  ncols=4,
                  plate_base=384,
                  figsize_scale=1,
                  row='Row384',
                  col='Col384',
                  vmin=0,
                  vmax=1,
                  aggregation_func=None):
    """
    Plot metadata into 384 or 96 plate view (heatmap)

    Parameters
    ----------
    data
        dataframe contain plate position and metric used for color
    hue
        int/float column name used as hue
    groupby
        groupby column, typically groupby plate id column(s) to plot each plate separately
    ncols
        number of column for axes, nrows will be calculated accordingly
    plate_base
        {384, 96} size of the plate view
    figsize_scale
        scale of figure size
    row
        column name for rows
    col
        column name for columns
    vmin
        cmap vmin
    vmax
        cmap vmax
    aggregation_func
        apply to reduce rows after groupby if the row is not unique

    Returns
    -------
    (figure, list of plate names, list of per-plate plot data frames)
    """
    if plate_base == 384:
        plate_nrows, plate_ncols = 16, 24
    elif plate_base == 96:
        plate_nrows, plate_ncols = 8, 12
    else:
        raise ValueError(f'Plate base {plate_base} unknown')

    plot_data_list = []
    plate_names = []
    for plate, sub_df in data.groupby(groupby):
        # check if plate position is duplicated within this plate
        duplicated = sub_df[[row, col]].duplicated().sum() != 0
        if duplicated:
            if aggregation_func is None:
                raise ValueError(
                    'Row after groupby is not unique, aggregation_func can not be None'
                )
            plot_data = sub_df.groupby([row,
                                        col])[[hue]].apply(aggregation_func)
        else:
            plot_data = sub_df.set_index([row, col])[[hue]]
        # reindex to the full plate; missing wells keep NA
        full_index = pd.MultiIndex.from_tuples([(i, j)
                                                for i in range(plate_nrows)
                                                for j in range(plate_ncols)],
                                               names=[row, col])
        plot_data = plot_data.reindex(full_index).reset_index()
        plot_data_list.append(plot_data)
        if isinstance(plate, str):
            plate_names.append(plate)
        else:
            plate_names.append('\n'.join(plate))

    ncols = min(len(plot_data_list), ncols)
    nrows = int(np.ceil(len(plot_data_list) / ncols))
    cbar_frac = 0.06

    fig = plt.figure(figsize=((6.2 * ncols) * (1 + cbar_frac) * figsize_scale,
                              4 * nrows * figsize_scale))
    gs = fig.add_gridspec(nrows, ncols, wspace=0.1)
    cmap = copy.copy(mpl.cm.get_cmap("viridis"))
    cmap.set_under(color='#440154')
    cmap.set_over(color='#FDE725')
    cmap.set_bad(color='#FFFFFF')
    cnorm = Normalize(vmin, vmax)

    # note: loop variable renamed from `data` to avoid shadowing the parameter
    for ax_index, (plate_name, plot_data) in enumerate(
            zip(plate_names, plot_data_list)):
        ax = fig.add_subplot(gs[ax_index // ncols, ax_index % ncols])
        ax.scatter(
            x=plot_data[col],
            y=plot_data[row],
            # have to do this, otherwise NaN is skipped.
            c=[cmap(cnorm(v)) for v in plot_data[hue]],
            s=100,
            linewidth=1,
            edgecolor='lightgray')
        # axis extents and tick labels follow the selected plate size
        # (previously hardcoded to the 384-well 16x24 layout, which drew
        # wrong row/column labels for plate_base=96)
        ax.set(title=plate_name,
               ylabel='',
               ylim=(plate_nrows, -1),
               yticks=list(range(plate_nrows)),
               yticklabels=[chr(r + 65) for r in range(plate_nrows)],
               xlabel='',
               xticks=range(plate_ncols),
               xticklabels=range(1, plate_ncols + 1))
        ax.xaxis.set_tick_params(labelsize=8)
        ax.yaxis.set_tick_params(labelsize=8)
        ax.xaxis.tick_top()
    fig.colorbar(mpl.cm.ScalarMappable(norm=cnorm, cmap=cmap),
                 ax=fig.axes,
                 shrink=0.6,
                 fraction=cbar_frac,
                 label=hue)
    return fig, plate_names, plot_data_list
def testing_mapping_installation(mct=False):
    """Verify that the external mapping tools are available on PATH.

    Runs each tool's version command and raises if any fails. When ``mct``
    is True, STAR is additionally required. A missing ALLCools install is
    only reported, not raised.
    """
    for cmd in COMMAND_TO_TEST:
        testing_cmd(cmd)

    # picard always return 1...
    testing_cmd('picard MarkDuplicates --version', 1)

    if mct:
        # STAR is only needed for the transcriptome (mCT) workflow
        testing_cmd('STAR --version')

    # test ALLCools
    try:
        testing_cmd('allcools -h')
    except subprocess.CalledProcessError:
        print('"allcools -h" return error, see if allcools is installed. \n'
              'https://github.com/lhqing/ALLCools')
dataset_dir = scool_dir / 'dataset' 17 | dataset_dir.mkdir(exist_ok=True) 18 | 19 | # Calculate compartment at 100Kb resolution 20 | compartment_input_dir = impute_dir / '100K' 21 | compartment_cell_table = pd.Series({ 22 | path.name.split('.')[0]: str(path) 23 | for path in compartment_input_dir.glob('*/*.cool') 24 | }) 25 | compartment_cell_table_path = compartment_input_dir / 'cell_table.tsv' 26 | compartment_cell_table.to_csv(compartment_cell_table_path, sep='\t', header=None) 27 | # prepare a whole genome CpG ratio profile 28 | cpg_path = compartment_input_dir / 'cpg_ratio.hdf' 29 | cpg_ratio_cmd = f'hicluster cpg-ratio --cell_url {compartment_cell_table.iloc[0]} ' \ 30 | f'--fasta_path {fasta_path} --hdf_output_path {cpg_path}' 31 | execute_command(cpg_ratio_cmd) 32 | # compartment command 33 | compartment_cmd = f'hicluster compartment ' \ 34 | f'--cell_table_path {compartment_cell_table_path} ' \ 35 | f'--output_prefix {dataset_dir / project_name} ' \ 36 | f'--cpg_profile_path {cpg_path} ' \ 37 | f'--cpu {cpu}' 38 | 39 | # Calculate domain at 25Kb resolution 40 | domain_input_dir = impute_dir / '25K' 41 | domain_cell_table = pd.Series({ 42 | path.name.split('.')[0]: str(path) 43 | for path in domain_input_dir.glob('*/*.cool') 44 | }) 45 | domain_cell_table_path = domain_input_dir / 'cell_table.tsv' 46 | domain_cell_table.to_csv(domain_cell_table_path, sep='\t', header=None) 47 | domain_cmd = f'hicluster domain ' \ 48 | f'--cell_table_path {domain_cell_table_path} ' \ 49 | f'--output_prefix {dataset_dir / project_name} ' \ 50 | f'--resolution 25000 ' \ 51 | f'--window_size 10 ' \ 52 | f'--cpu {cpu}' 53 | 54 | # Calculate cell embedding/decomposition at 100Kb resolution 55 | embedding_dir = dataset_dir / 'embedding' 56 | embedding_dir.mkdir(exist_ok=True) 57 | embedding_cmd = f'hicluster embedding ' \ 58 | f'--cell_table_path {compartment_cell_table_path} ' \ 59 | f'--output_dir {embedding_dir} ' \ 60 | f'--dim 50 ' \ 61 | f'--dist 1000000 ' \ 62 | 
def get_configuration(config_path):
    """
    Read a .ini config file and flatten it into one {option: value} dict.

    A ConfigParser instance passed instead of a path is returned unchanged.
    Section names are discarded; an option present in several sections keeps
    the value from the last section read.
    """
    if isinstance(config_path, configparser.ConfigParser):
        return config_path

    parser = configparser.ConfigParser()
    parser.read(config_path)

    # flatten every section into a single namespace
    return {
        option: value
        for section in parser.values()
        for option, value in section.items()
    }
stderr=subprocess.PIPE, 35 | encoding='utf8', 36 | check=True) 37 | except subprocess.CalledProcessError as e: 38 | log.error(f'Test {tool_name} got non-zero return code {e.returncode}') 39 | log.error(e.stderr) 40 | raise 41 | return 42 | 43 | 44 | def valid_environments(config): 45 | log.info('Test mapping environments') 46 | 47 | # test cutadapt 48 | test_cmd(tool_name='cutadapt', cmd_list=['cutadapt', '--version']) 49 | # test samtools 50 | test_cmd(tool_name='samtools', cmd_list=['samtools', '--version']) 51 | # test picard, picard always have return code 1... 52 | test_cmd(tool_name='picard', cmd_list=['which', 'picard']) 53 | # test bismark_mapping 54 | test_cmd(tool_name='bismark_mapping', cmd_list=['bismark_mapping', '--version']) 55 | if config['mode'] != 'm3c': 56 | # test bowtie2 57 | test_cmd(tool_name='bowtie2', cmd_list=['bowtie2', '--version']) 58 | else: 59 | # test bowtie1 60 | test_cmd(tool_name='bowtie', cmd_list=['bowtie', '--version']) 61 | # test pigz 62 | test_cmd(tool_name='pigz', cmd_list=['pigz', '-V']) 63 | 64 | bismark_dir = pathlib.Path(config['bismark_reference']) 65 | if not bismark_dir.is_dir(): 66 | raise TypeError(f"Bismark reference must be a directory contain a sub-dir named Bisulfite_Genome, " 67 | f"generated by bismark_genome_preparation. Got a file path") 68 | if not bismark_dir.exists(): 69 | raise FileNotFoundError(f"Bismark reference directory not found. " 70 | f"Path in the config.ini is {bismark_dir}") 71 | 72 | allc_ref_fasta = pathlib.Path(config['reference_fasta']) 73 | allc_ref_fai = pathlib.Path(config['reference_fasta'] + '.fai') 74 | if not allc_ref_fasta.exists(): 75 | raise FileNotFoundError(f"Reference fasta for ALLC generation not found. " 76 | f"Path in the config.ini is {allc_ref_fasta}") 77 | if not allc_ref_fai.exists(): 78 | raise FileNotFoundError(f".fai index for reference fasta not found. " 79 | f"Path of fadix should be {allc_ref_fai}. 
" 80 | f"You can use 'samtools fadix {allc_ref_fasta}' to generate.") 81 | return 82 | 83 | 84 | def parse_index_fasta(fasta_path): 85 | records = {} 86 | with open(fasta_path) as f: 87 | key_line = True 88 | for line in f: 89 | if key_line: 90 | key = line.lstrip('>').rstrip('\n') 91 | key_line = False 92 | else: 93 | value = line.lstrip('^').rstrip('\n') 94 | records[key] = value 95 | key_line = True 96 | return records 97 | 98 | 99 | def command_runner(commands, runner=None, cpu=1): 100 | if runner is None: 101 | from functools import partial 102 | runner = partial(subprocess.run, 103 | stdout=subprocess.PIPE, 104 | stderr=subprocess.PIPE, 105 | encoding='utf8', 106 | shell=True, 107 | check=True) 108 | if cpu <= 1: 109 | for command in commands: 110 | runner(command) 111 | else: 112 | with ProcessPoolExecutor(cpu) as pool: 113 | futures = [] 114 | for command in commands: 115 | future = pool.submit(runner, command) 116 | futures.append(future) 117 | 118 | for future in as_completed(futures): 119 | try: 120 | future.result() 121 | except subprocess.CalledProcessError as e: 122 | print("Got error in fastq_qc, command was:") 123 | print(command) 124 | print(e.stdout) 125 | print(e.stderr) 126 | raise e 127 | return 128 | 129 | 130 | def snakemake(workdir, snakefile, cores): 131 | try: 132 | subprocess.run([ 133 | 'snakemake', '-d', str(workdir), '--snakefile', 134 | str(snakefile), '--cores', 135 | str(cores) 136 | ], 137 | check=True, 138 | stdin=subprocess.PIPE, 139 | stdout=subprocess.PIPE, 140 | encoding='utf8') 141 | except subprocess.CalledProcessError as e: 142 | print(e.stdout) 143 | print(e.stderr) 144 | raise e 145 | return 146 | 147 | 148 | def get_barcode_version(output_dir): 149 | fastq_dir = pathlib.Path(output_dir) / 'fastq' 150 | with open(fastq_dir / '.barcode_version') as f: 151 | return f.read() 152 | 153 | 154 | def get_mode(output_dir): 155 | fastq_dir = pathlib.Path(output_dir) / 'fastq' 156 | with open(fastq_dir / '.mode') as f: 157 | return 
MAPPING_MODE_CHOICES = ['mct', 'mc', 'm3c', '4m']

# IUPAC DNA ambiguity codes -> the concrete bases each code matches
IUPAC_TABLE = {
    'A': 'A',
    'T': 'T',
    'C': 'C',
    'G': 'G',
    'R': 'AG',
    'Y': 'CT',
    'S': 'GC',
    'W': 'AT',
    'K': 'GT',
    'M': 'AC',
    'B': 'CGT',
    'D': 'AGT',
    'H': 'ATC',
    'V': 'ACG',
    'N': 'ATCGN'  # N also expands to itself so 'N'-containing contexts are kept
}


@functools.lru_cache(maxsize=100)
def parse_mc_pattern(pattern: str) -> set:
    """
    Expand an IUPAC mC context pattern into the set of concrete contexts.

    For example 'CGN' -> {'CGA', 'CGC', 'CGG', 'CGT', 'CGN'}. The input is
    case-insensitive; results are cached.

    Raises
    ------
    KeyError
        if the pattern contains a character not in the IUPAC table
    """
    pattern = pattern.upper()
    all_pos_list = []
    for base in pattern:
        try:
            all_pos_list.append(IUPAC_TABLE[base])
        except KeyError:
            raise KeyError(f'Base {base} is not in IUPAC table.')
    # cartesian product of the per-position base sets
    context_set = {''.join(bases) for bases in itertools.product(*all_pos_list)}
    return context_set
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/TODO_GenerateMCDS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.7.3" 28 | }, 29 | "toc": { 30 | "base_numbering": 1, 31 | "nav_menu": {}, 32 | "number_sections": true, 33 | "sideBar": true, 34 | "skip_h1_title": false, 35 | "title_cell": "Table of Contents", 36 | "title_sidebar": "Contents", 37 | "toc_cell": false, 38 | "toc_position": {}, 39 | "toc_section_display": true, 40 | "toc_window_display": false 41 | } 42 | }, 43 | "nbformat": 4, 44 | "nbformat_minor": 2 45 | } 46 | -------------------------------------------------------------------------------- /doc/TODO_overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "hide_input": false, 13 | "kernelspec": { 14 | "display_name": "Python 3", 15 | "language": "python", 16 | "name": "python3" 17 | }, 18 | "language_info": { 19 | "codemirror_mode": { 20 | "name": "ipython", 21 | "version": 3 22 | }, 23 | "file_extension": ".py", 24 | "mimetype": "text/x-python", 25 | "name": "python", 26 | "nbconvert_exporter": "python", 27 | 
"pygments_lexer": "ipython3", 28 | "version": "3.7.3" 29 | }, 30 | "toc": { 31 | "base_numbering": 1, 32 | "nav_menu": {}, 33 | "number_sections": true, 34 | "sideBar": true, 35 | "skip_h1_title": true, 36 | "title_cell": "Table of Contents", 37 | "title_sidebar": "Contents", 38 | "toc_cell": false, 39 | "toc_position": {}, 40 | "toc_section_display": true, 41 | "toc_window_display": true 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 2 46 | } 47 | -------------------------------------------------------------------------------- /doc/TechBasic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Technology Basics\n", 8 | "\n", 9 | "## What does yap do\n", 10 | "\n", 11 | "### All sequencing technologies are methylation based\n", 12 | "\n", 13 | "All the technologies covered by yap is based on snmC-seq2, here I visualized the steps and barcoding strategies we used currently in Ecker Lab for snmC-seq2. This is basically all yap mapping is based on.\n", 14 | "\n", 15 | "### Multiplex cell in preparing library\n", 16 | "\n", 17 | "When preparing library, the most important part related to mapping is the cell multiplexing:\n", 18 | "\n", 19 | "1. use random primer (inside pipeline, **index_name** corresponding to each random primer)\n", 20 | "2. use illumina P5/P7 primer pair (inside pipeline, **primer_name** and **uid** corresponding to each illumina P5/P7 primer pair)\n", 21 | " \n", 22 | "### Demultiplex cell in mapping\n", 23 | "\n", 24 | "When mapping use yap (**notice the reverse order**):\n", 25 | "\n", 26 | "1. prepare samplesheet for bcl2fastq, use bcl2fastq to demultiplex illumina P5/P7 primer pair. Each result file set got a **uid**, that **uid** corresponding to the illumina primer pair throughout the pipeline.\n", 27 | "2. use cutadapt to demultiplex random primer. 
Each result file set got a **index_name**, that **index_name** corresponding to the random primer throughout the pipeline.\n", 28 | "3. **uid** + **index_name** uniquely determine a cell within the same pool on MiSeq or NovaSeq.\n", 29 | "4. After getting single cell files, yap just do mapping steps for each individual cells, and then summarize all the mapping stats for the whole library.\n", 30 | "\n", 31 | "\n", 32 | "## Important Reference\n", 33 | "\n", 34 | "- **snmC-seq original publication**: [Luo, Chongyuan, Christopher L. Keown, Laurie Kurihara, Jingtian Zhou, Yupeng He, Junhao Li, Rosa Castanon, et al. 2017. “Single-Cell Methylomes Identify Neuronal Subtypes and Regulatory Elements in Mammalian Cortex.” Science 357 (6351): 600–604.](http://dx.doi.org/10.1126/science.aan3351)\n", 35 | "- **snmC-seq2**: [Luo, Chongyuan, Angeline Rivkin, Jingtian Zhou, Justin P. Sandoval, Laurie Kurihara, Jacinta Lucero, Rosa Castanon, et al. 2018. “Robust Single-Cell DNA Methylome Profiling with snmC-seq2.” Nature Communications 9 (1): 3824.](http://dx.doi.org/10.1038/s41467-018-06355-2)\n", 36 | "- **snmCT-seq**: [Luo, Chongyuan, Hanqing Liu, Bang-An Wang, Anna Bartlett, Angeline Rivkin, Joseph R. Nery, and Joseph R. Ecker. 2018. “Multi-Omic Profiling of Transcriptome and DNA Methylome in Single Nuclei with Molecular Partitioning.” bioRxiv. 
https://doi.org/10.1101/434845.](http://dx.doi.org/10.1101/434845)\n", 37 | "\n", 38 | "\n", 39 | "## snmC-seq2 Library\n", 40 | "\n", 41 | "### Molecular steps\n", 42 | "![molecularsteps](files/molecularsteps.png)\n", 43 | "\n", 44 | "### Reads and Primer Structure\n", 45 | "\n", 46 | "![primerstructure](files/primerstructure.png)\n", 47 | "\n", 48 | "## Cell Multiplexing\n", 49 | "\n", 50 | "### V1 (8-random-index)\n", 51 | "\n", 52 | "![v1barcode](files/v1barcode.png)\n", 53 | "\n", 54 | "\n", 55 | "### V2 (384-random-index)\n", 56 | "![v2barcode](files/v2barcode.png)\n" 57 | ] 58 | } 59 | ], 60 | "metadata": { 61 | "hide_input": false, 62 | "kernel_info": { 63 | "name": "python3" 64 | }, 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.7.3" 81 | }, 82 | "nteract": { 83 | "version": "0.12.3" 84 | }, 85 | "toc": { 86 | "base_numbering": 1, 87 | "nav_menu": {}, 88 | "number_sections": true, 89 | "sideBar": true, 90 | "skip_h1_title": true, 91 | "title_cell": "Table of Contents", 92 | "title_sidebar": "Contents", 93 | "toc_cell": true, 94 | "toc_position": {}, 95 | "toc_section_display": true, 96 | "toc_window_display": false 97 | }, 98 | "varInspector": { 99 | "cols": { 100 | "lenName": 16, 101 | "lenType": 16, 102 | "lenVar": 40 103 | }, 104 | "kernels_config": { 105 | "python": { 106 | "delete_cmd_postfix": "", 107 | "delete_cmd_prefix": "del ", 108 | "library": "var_list.py", 109 | "varRefreshCmd": "print(var_dic_list())" 110 | }, 111 | "r": { 112 | "delete_cmd_postfix": ") ", 113 | "delete_cmd_prefix": "rm(", 114 | "library": "var_list.r", 115 | "varRefreshCmd": "cat(var_dic_list()) " 116 | } 117 | }, 118 | 
"types_to_exclude": [ 119 | "module", 120 | "function", 121 | "builtin_function_or_method", 122 | "instance", 123 | "_Feature" 124 | ], 125 | "window_display": false 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 2 130 | } 131 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'yap' 21 | copyright = '2019, Hanqing Liu' 22 | author = 'Hanqing Liu' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'nbsphinx', 'sphinx.ext.mathjax'] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ['_templates'] 35 | 36 | # List of patterns, relative to source directory, that match files and 37 | # directories to ignore when looking for source files. 38 | # This pattern also affects html_static_path and html_extra_path. 
39 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 40 | 41 | 42 | # -- Options for HTML output ------------------------------------------------- 43 | 44 | # The theme to use for HTML and HTML Help pages. See the documentation for 45 | # a list of builtin themes. 46 | # 47 | html_theme = 'default' 48 | 49 | # Add any paths that contain custom static files (such as style sheets) here, 50 | # relative to this directory. They are copied after the builtin static files, 51 | # so a file named "default.css" will overwrite the builtin "default.css". 52 | html_static_path = ['_static'] 53 | master_doc = 'index' -------------------------------------------------------------------------------- /doc/demultiplex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Demultiplex (start from sequencing)\n", 8 | "\n", 9 | "## Related Commands\n", 10 | "```shell\n", 11 | "# Demultiplex\n", 12 | "yap demultiplex\n", 13 | "```" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Input of this step\n", 21 | "In the previous step, we generated sample sheet based on plate information file, and then used illumina bcl2fastq to demultiplex the sequencing results into **raw FASTQ file sets**. This step only demultiplexed the barcode on the illumina primers, therefore, each set of FASTQ file still contain reads mixed from multiple cells. \n", 22 | "\n", 23 | "Depending on the number of random index used in each barcode version, in V1, each set contain reads from eight cells; in V2, each set contain reads from 384 cells." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Output of this step\n", 31 | "\n", 32 | "- This step demultiplex raw FASTQ files into single cell raw FASTQ files.\n", 33 | "- The random index sequence will be removed from the reads\n", 34 | "- Each cell will have two fastq files in the output directory, with fixed name pattern:\n", 35 | " - `{cell_id}-R1.fq.gz` for R1\n", 36 | " - `{cell_id}-R2.fq.gz` for R2" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Usage" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 12, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "usage: yap demultiplex [-h] --fastq_pattern FASTQ_PATTERN --output_dir\r\n", 56 | " OUTPUT_DIR --barcode_version {V1,V2} --mode\r\n", 57 | " {mc,mct,mc2t} --cpu CPU\r\n", 58 | "\r\n", 59 | "optional arguments:\r\n", 60 | " -h, --help show this help message and exit\r\n", 61 | "\r\n", 62 | "Required inputs:\r\n", 63 | " --fastq_pattern FASTQ_PATTERN\r\n", 64 | " FASTQ files with wildcard to match all bcl2fastq\r\n", 65 | " results, pattern with wildcard must be quoted.\r\n", 66 | " (default: None)\r\n", 67 | " --output_dir OUTPUT_DIR\r\n", 68 | " Pipeline output directory, will be created\r\n", 69 | " recursively. (default: None)\r\n", 70 | " --barcode_version {V1,V2}\r\n", 71 | " Barcode version of this library, V1 for the 8 random\r\n", 72 | " index, V2 for the 384 random index. (default: None)\r\n", 73 | " --mode {mc,mct,mc2t} Technology used in this library. (default: None)\r\n", 74 | " --cpu CPU Number of cores to use. Max is 12. 
(default: None)\r\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "!yap demultiplex -h" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "### Notes\n", 87 | "- **Remember to use \"\" to quote the fastq pattern like this:\n", 88 | " `--fastq_pattern` \"path/pattern/to/your/bcl2fastq/results/*fastq.gz\"**\n", 89 | "- An error will occur if `output_dir` already exists." 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Runtime notes for NovaSeq\n", 97 | "\n", 98 | "- This command runs demultiplex directly, the runtime is roughly ~8 Gb per CPU per hour. For a typical eight-plate NovaSeq library (500GB), the runtime using 12 CPU is ~5-7 hours depending on the file system loads.\n", 99 | "- This command creates lots of files simultaneously, in order to prevent too much burden on the file system, I set default and max CPU = 12" 100 | ] 101 | } 102 | ], 103 | "metadata": { 104 | "kernelspec": { 105 | "display_name": "Python 3", 106 | "language": "python", 107 | "name": "python3" 108 | }, 109 | "language_info": { 110 | "codemirror_mode": { 111 | "name": "ipython", 112 | "version": 3 113 | }, 114 | "file_extension": ".py", 115 | "mimetype": "text/x-python", 116 | "name": "python", 117 | "nbconvert_exporter": "python", 118 | "pygments_lexer": "ipython3", 119 | "version": "3.7.6" 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 4 124 | } 125 | -------------------------------------------------------------------------------- /doc/files/MappingPipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/MappingPipeline.png -------------------------------------------------------------------------------- /doc/files/molecularsteps.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/molecularsteps.png -------------------------------------------------------------------------------- /doc/files/primerstructure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/primerstructure.png -------------------------------------------------------------------------------- /doc/files/v1barcode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/v1barcode.png -------------------------------------------------------------------------------- /doc/files/v2barcode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/doc/files/v2barcode.png -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. yap documentation master file, created by 2 | sphinx-quickstart on Fri Sep 13 16:24:00 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | YAP documentation 7 | =============================== 8 | Please read the new documentation of YAP here: 9 | 10 | https://hq-1.gitbook.io/mc/ 11 | 12 | - Code: https://github.com/lhqing/cemba_data 13 | - Author: Hanqing Liu, hanliu@salk.edu 14 | -------------------------------------------------------------------------------- /doc/installation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Installation\n", 7 | "## Setup Conda and Mapping Environment\n", 8 | "### check if conda is installed\n", 9 | "```shell\n", 10 | "conda info\n", 11 | "```\n", 12 | "\n", 13 | "### if not installed, install either miniconda or anaconda.\n", 14 | "- IMPORTANT: select python 3\n", 15 | "- miniconda (recommend if you don't use python a lot): https://conda.io/miniconda.html\n", 16 | "- anaconda (larger): https://www.anaconda.com/download/\n", 17 | "\n", 18 | "\n", 19 | "### Set up bioconda\n", 20 | "[bioconda](https://bioconda.github.io/) is a package manager for most popular biological tools, it's wonderful!\n", 21 | "```shell\n", 22 | "# run these commands to add bioconda into your conda channel, the order of these 3 lines matters\n", 23 | "conda config --add channels defaults\n", 24 | "conda config --add channels bioconda\n", 25 | "conda config --add channels conda-forge\n", 26 | "```\n", 27 | "\n", 28 | "### Create Mapping Environment \n", 29 | "you can change the name into any desired name, but python version needs to be 3.7\n", 30 | "```shell\n", 31 | "conda create --name mapping python==3.7\n", 32 | "```\n", 33 | "\n", 34 | "### why using stand alone conda environment?\n", 35 | "- Using environment makes sure all the mapping related packages are handled by conda and pip in a stand alone place\n", 36 | "- It will not impact any of your other installed packages and vice versa.\n", 37 | "- This makes sure the stability of the pipeline.\n", 38 | "- The only drawback of 
using environment is **you need to activate environment every time**, because everything is only installed for that environment.\n", 39 | "- See [here](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) for more info about conda environment.\n", 40 | "\n", 41 | "### activate new environment\n", 42 | "**remember to run this command EVERY TIME before using the pipeline.**\n", 43 | "\n", 44 | "```shell\n", 45 | "source activate mapping\n", 46 | "```" 47 | ], 48 | "metadata": { 49 | "collapsed": false 50 | } 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Install packages\n", 57 | "\n", 58 | "### install packages into new environment\n", 59 | "```shell\n", 60 | "conda install -n mapping bedtools=2.27 bismark=0.20 bowtie2=2.3 cutadapt=1.18 fastqc=0.11 picard=2.18 samtools=1.9 htslib=1.9 pysam=0.15\n", 61 | "# for mCT mapping\n", 62 | "conda install -n mapping STAR=2.7\n", 63 | "\n", 64 | "# for generating ALLC files (single cell base level methylation table)\n", 65 | "# ALLCools is still in developing, right now only support install via github.\n", 66 | "git clone https://github.com/lhqing/ALLCools.git\n", 67 | "cd ALLCools\n", 68 | "pip install .\n", 69 | "```\n", 70 | "\n", 71 | "### clone cemba-data repo and install it\n", 72 | "this step will take some time, a few packages will be installed into this environment\n", 73 | "```shell\n", 74 | "git clone https://github.com/lhqing/cemba_data.git\n", 75 | "cd cemba_data\n", 76 | "pip install .\n", 77 | "```\n", 78 | "\n", 79 | "### test if installed correctly\n", 80 | "```shell\n", 81 | "yap -h\n", 82 | "```\n", 83 | "\n", 84 | "## update the package\n", 85 | "**Again, remember you should do this in mapping environment**\n", 86 | "\n", 87 | "```shell\n", 88 | "source activate mapping\n", 89 | "# or source activate your_environment_name\n", 90 | "\n", 91 | "cd /path/to/original/dir/you/clone/from/github/cemba_data\n", 92 | "git pull\n", 93 | 
"pip install .\n", 94 | "```" 95 | ] 96 | } 97 | ], 98 | "metadata": { 99 | "hide_input": false, 100 | "kernelspec": { 101 | "display_name": "Python 3", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.7.3" 116 | }, 117 | "toc": { 118 | "base_numbering": 1, 119 | "nav_menu": {}, 120 | "number_sections": true, 121 | "sideBar": true, 122 | "skip_h1_title": true, 123 | "title_cell": "Table of Contents", 124 | "title_sidebar": "Contents", 125 | "toc_cell": true, 126 | "toc_position": {}, 127 | "toc_section_display": true, 128 | "toc_window_display": true 129 | }, 130 | "varInspector": { 131 | "cols": { 132 | "lenName": 16, 133 | "lenType": 16, 134 | "lenVar": 40 135 | }, 136 | "kernels_config": { 137 | "python": { 138 | "delete_cmd_postfix": "", 139 | "delete_cmd_prefix": "del ", 140 | "library": "var_list.py", 141 | "varRefreshCmd": "print(var_dic_list())" 142 | }, 143 | "r": { 144 | "delete_cmd_postfix": ") ", 145 | "delete_cmd_prefix": "rm(", 146 | "library": "var_list.r", 147 | "varRefreshCmd": "cat(var_dic_list()) " 148 | } 149 | }, 150 | "types_to_exclude": [ 151 | "module", 152 | "function", 153 | "builtin_function_or_method", 154 | "instance", 155 | "_Feature" 156 | ], 157 | "window_display": false 158 | }, 159 | "pycharm": { 160 | "stem_cell": { 161 | "cell_type": "raw", 162 | "source": [], 163 | "metadata": { 164 | "collapsed": false 165 | } 166 | } 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 2 171 | } -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx 
documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /env.yaml: -------------------------------------------------------------------------------- 1 | name: base 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.8 8 | - pip 9 | - jupyter 10 | - snakemake 11 | - pytables 12 | - seaborn 13 | - xarray 14 | - dask 15 | - mamba 16 | - natsort 17 | - netCDF4 18 | - networkx 19 | - opentsne 20 | - plotly 21 | - pynndescent 22 | - leidenalg 23 | - anndata 24 | - scanpy 25 | - scikit-learn 26 | - statsmodels 27 | - xarray 28 | - yaml 29 | - zarr 30 | - biopython 31 | - cutadapt 32 | - bismark=0.20 33 | - bowtie2 34 | - bowtie 35 | - samtools 36 | - picard 37 | - bedtools 38 | - htslib>=1.9 39 | - pysam 40 | - pybedtools 41 | - pyBigWig 42 | - star=2.7.3a 43 | - subread=2.0 44 | - rpy2 45 | - pip: 46 | - papermill 47 | - imblearn 48 | - allcools 49 | - schicluster 50 | - cemba_data 51 | -------------------------------------------------------------------------------- /hisat3n_env.yml: -------------------------------------------------------------------------------- 1 | name: base 
2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.9 8 | - pip 9 | - jupyter 10 | - snakemake 11 | - pytables 12 | - seaborn 13 | - yaml 14 | - cutadapt 15 | - samtools 16 | - picard 17 | - bedtools 18 | - htslib=1.15 19 | - pysam 20 | - pybedtools 21 | - pyBigWig 22 | - pip: 23 | - papermill 24 | - allcools 25 | - cemba_data 26 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"] 3 | 4 | [tool.setuptools_scm] 5 | write_to = 'cemba_data/_version.py' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | papermill 2 | ipykernel 3 | nbsphinx 4 | sphinx>=3 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='cemba-data', 5 | use_scm_version=True, 6 | setup_requires=['setuptools_scm'], 7 | author='Hanqing Liu', 8 | author_email='hanliu@salk.edu', 9 | description='Pipelines for single nucleus methylome and multi-omic dataset.', 10 | long_description=open('README.md').read(), 11 | long_description_content_type='text/markdown', 12 | url='https://github.com/lhqing/cemba_data', 13 | license='MIT', 14 | classifiers=[ 15 | "License :: OSI Approved :: MIT License", 16 | "Programming Language :: Python :: 3", 17 | "Programming Language :: Python :: 3.7", 18 | ], 19 | packages=find_packages(exclude=('doc',)), 20 | include_package_data=True, 21 | package_data={ 22 | '': ['*.txt', '*.tsv', '*.csv', '*.fa', '*Snakefile', '*ipynb'] 23 | }, 24 | install_requires=['pandas>=1.0', 25 | 'numpy', 26 | 
'seaborn', 27 | 'matplotlib', 28 | 'papermill', 29 | 'dnaio', 30 | 'pysam'], 31 | entry_points={ 32 | 'console_scripts': ['yap=cemba_data.__main__:main', 33 | 'yap-internal=cemba_data._yap_internal_cli_:internal_main', 34 | 'yap-hisat3n=cemba_data.hisat3n.cli:main'], 35 | } 36 | ) 37 | --------------------------------------------------------------------------------