├── .github └── workflows │ └── publish.yaml ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── cemba_data ├── __init__.py ├── __main__.py ├── _yap_internal_cli_.py ├── bulk │ ├── Snakefile_template │ │ ├── __init__.py │ │ └── mc_bulk.Snakefile │ ├── __init__.py │ ├── atac_bulk.py │ ├── mc_bulk.py │ ├── mc_bulk_multigroup │ │ ├── __init__.py │ │ ├── mc_bulk_multigroup.py │ │ └── mc_bulk_multigroup_template.py │ └── mct_bulk.py ├── demultiplex │ ├── __init__.py │ ├── demultiplex.py │ ├── fastq_dataframe.py │ └── plateinfo_and_samplesheet.py ├── dmr │ ├── __init__.py │ ├── dmrseq │ │ ├── DMRseq.ipynb │ │ └── __init__.py │ └── dss │ │ ├── DSS.MultiGroup.SingleRegionDML.ipynb │ │ ├── DSS.TwoGroup.SingleRegionDML.ipynb │ │ ├── MultiGroup.py │ │ ├── TwoGroup.py │ │ └── __init__.py ├── files │ ├── V1_i7_i5_index.tsv │ ├── V2_i7_i5_index.tsv │ ├── __init__.py │ ├── default_config │ │ ├── __init__.py │ │ ├── mapping_config_4m.ini │ │ ├── mapping_config_m3c.ini │ │ ├── mapping_config_mc.ini │ │ ├── mapping_config_mct-nome.ini │ │ ├── mapping_config_mct.ini │ │ └── mapping_config_nome.ini │ ├── mapping_summary_template │ │ ├── 4m_template.ipynb │ │ ├── __init__.py │ │ ├── m3c_template.ipynb │ │ ├── mc_template.ipynb │ │ └── mct_template.ipynb │ ├── plate_info_template_v1.txt │ ├── plate_info_template_v2.txt │ ├── random_index_v1.fa │ ├── random_index_v2 │ │ ├── __init__.py │ │ ├── random_index_v2.fa │ │ ├── random_index_v2.multiplex_group_1.fa │ │ ├── random_index_v2.multiplex_group_2.fa │ │ ├── random_index_v2.multiplex_group_3.fa │ │ ├── random_index_v2.multiplex_group_4.fa │ │ ├── random_index_v2.multiplex_group_5.fa │ │ └── random_index_v2.multiplex_group_6.fa │ ├── sample_sheet_header.txt │ ├── sbatch_template_schicluster.txt │ └── sbatch_template_yap.txt ├── hisat3n │ ├── __init__.py │ ├── cli.py │ ├── config │ │ ├── __init__.py │ │ ├── gcp.md │ │ ├── hisat-3n-build.sh │ │ ├── hisat3n_mapping_env.yaml │ │ └── vm_init.sh │ ├── 
hisat3n_general.py │ ├── hisat3n_m3c.py │ ├── hisat3n_mct.py │ ├── snakefile │ │ ├── __init__.py │ │ ├── m3c.smk │ │ ├── mc-multi.smk │ │ ├── mc-multi_sort_input.smk │ │ ├── mc.smk │ │ ├── mct-multi.smk │ │ └── mct.smk │ ├── stats_col_names.py │ ├── stats_parser.py │ ├── summary.py │ └── utilities.py ├── mapping │ ├── Snakefile_template │ │ ├── 4m.Snakefile │ │ ├── __init__.py │ │ ├── m3c.Snakefile │ │ ├── mc.Snakefile │ │ └── mct.Snakefile │ ├── __init__.py │ ├── config.py │ ├── m3c │ │ └── __init__.py │ ├── mct │ │ ├── __init__.py │ │ ├── mct_bismark_bam_filter.py │ │ └── mct_star_bam_filter.py │ ├── pipelines │ │ ├── _4m.py │ │ ├── __init__.py │ │ ├── m3c.py │ │ ├── mc.py │ │ └── mct.py │ ├── stats │ │ ├── _4m.py │ │ ├── __init__.py │ │ ├── m3c.py │ │ ├── mc.py │ │ ├── mct.py │ │ ├── plate_info.py │ │ ├── plot.py │ │ └── utilities.py │ └── test_environment.py ├── qsub.py ├── sbatch.py ├── snm3C │ ├── __init__.py │ ├── prepare_dataset.py │ └── prepare_impute.py └── utilities.py ├── doc ├── Makefile ├── Mapping.ipynb ├── MappingSummary.ipynb ├── PipelineInput.ipynb ├── PlateInfoAndSampleSheet.ipynb ├── TODO_GenerateMCDS.ipynb ├── TODO_overview.ipynb ├── TechBasic.ipynb ├── archive │ └── MakeFastqDataframe.ipynb ├── conf.py ├── demultiplex.ipynb ├── files │ ├── MappingPipeline.png │ ├── molecularsteps.png │ ├── primerstructure.png │ ├── v1barcode.png │ └── v2barcode.png ├── index.rst ├── installation.ipynb └── make.bat ├── env.yaml ├── hisat3n_env.yml ├── pyproject.toml ├── requirements.txt └── setup.py /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 
5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | push: 13 | # Sequence of patterns matched against refs/tags 14 | tags: 15 | - "v*" # Push events to matching v*, i.e. v1.0, v20.15.10 16 | 17 | permissions: 18 | contents: read 19 | 20 | jobs: 21 | deploy: 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | # build python package and deploy to pypi 26 | - uses: actions/checkout@v3 27 | - name: Set up Python 28 | uses: actions/setup-python@v3 29 | with: 30 | python-version: "3.8" 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip wheel twine build 34 | pip install build 35 | - name: Build package 36 | run: python -m build 37 | - name: Publish package 38 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 39 | with: 40 | user: __token__ 41 | password: ${{ secrets.PYPI_API_TOKEN_CEMBA_DATA }} 42 | 43 | # # build docker image and push to GCR 44 | # - uses: actions/checkout@v3 45 | # - uses: google-github-actions/setup-gcloud@v0 46 | # - name: Get the version 47 | # id: get_tag_name 48 | # run: echo ::set-output name=GIT_TAG_NAME::${GITHUB_REF/refs\/tags\//} 49 | # - uses: RafikFarhad/push-to-gcr-github-action@v4 50 | # with: 51 | # gcloud_service_key: ${{ secrets.GCLOUD_SERVICE_KEY }} 52 | # registry: gcr.io 53 | # project_id: prod-635e 54 | # image_name: wmb 55 | # image_tag: ${{ steps.get_tag_name.outputs.GIT_TAG_NAME}},latest 56 | # dockerfile: ./Dockerfile 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .idea/ 6 | .DS_Store 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 
15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | cemba_data/_version.py 107 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.7' 4 | script: true 5 | deploy: 6 | provider: pypi 7 | username: __token__ 8 | on: 9 | tags: true 10 | password: 11 | secure: 
KGUWzdUpJgJGR4wOz8w3o16zscEFlRWF5Kdm3PSLTDuDY6OpNK0L4lywyMuhh178hCRZ72/5FmzXYoBXI1g2IODEvpWxmvbFe3kF8FPPD3BfgYsIsF0i4pNHPpdmIxZeuBaymf+SctVNY4o81mup7n3T05P9l8mATDOnSgP+5WLoHAk+ie7D9/H386xueGxfcKuUmzyZRlqUsjs7COXgDiG9VoyZi4KvUwlZz8+jriYjs9qL/t1rN2Mg0ZCDCzGghNDo36tnvRAX+TqGACj4xURXydCJGPx6hUPTJkbydIhGlvaVblCO8FYDsLuedUIblU5SMAUklkhh48VoR1k5+l2mxCkAOLCPYodZ2AS+wNhF5yMXbOhd4zmabw0uxfpfEVZOjcDi08YzbsRFyz5f8BuFkXwjWeaUpiNG8oj/6xZBpWzGNg5cQ+ZzqHXuavf5mzgrt+K0TxBGLfQ4san0EgbBYESkUaVWRaYt0LEhmkk58Wx27Um+C7lrl2Wxs6C0rnNXzho8jiAe2ZTHva8EhG1fJuUiLZ6YA2xobZVmZlFj/J/eEoZYRvLN1dEGhWwhcgenc/1rY1NW1mllGkGVzfvB/YqZEbk9Mo9PvNej5KLg63aoYJ0/tgL/fTdBE1S1LlisZPgFHdZ2RwkB6NxazXY2qWZQkLPqJ02aEuSDb1k= 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mambaorg/micromamba:0.23.0 2 | COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml 3 | RUN micromamba install -y -f /tmp/env.yaml && \ 4 | micromamba clean --all --yes 5 | 6 | ARG MAMBA_DOCKERFILE_ACTIVATE=1 7 | 8 | RUN yap --version 9 | RUN allcools --version 10 | 11 | USER root 12 | # default argument when not provided in the --build-arg 13 | # to build the image with gcp, use 14 | # docker build --build-arg gcp=true -t mapping-gcp:tag . 
15 | ARG gcp 16 | RUN if [ "$gcp" = "true" ] ; then \ 17 | apt-get update && \ 18 | apt-get install -y curl gnupg && \ 19 | echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | \ 20 | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 21 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ 22 | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ 23 | apt-get update -y && \ 24 | apt-get install google-cloud-sdk -y; \ 25 | else echo 'no gcp install'; \ 26 | fi 27 | 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 - 2020 Hanqing Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include cemba_data *.ini 2 | include LICENSE.txt 3 | recursive-exclude * __pycache__ 4 | recursive-include cemba_data *.txt *.tsv *.csv *.fa *Snakefile *ipynb 5 | exclude doc 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](http://www.network-science.de/ascii/) 2 |
3 | ** ** ** ******* 4 | //** ** **** /**////** 5 | //**** **//** /** /** 6 | //** ** //** /******* 7 | /** ********** /**//// 8 | /** /**//////** /** 9 | /** /** /** /** 10 | // // // // 11 |12 | 13 | # YAP (Yet Another Pipeline) 14 | Pipeline(s) for mapping and cluster-level aggregation of single nucleus methylome and multi-omic datasets. 15 | Technologies supported: 16 | - snmC-seq(1/2/3) 17 | - snmCT-seq (mC + RNA) 18 | - snmC2T-seq (mC + RNA + Chromatin Accessibility) 19 | - snm3C-seq (mC + Chromatin Conformation) 20 | - any NOMe treated version of the above 21 | 22 | [See Documentation](https://hq-1.gitbook.io/mc/) 23 | -------------------------------------------------------------------------------- /cemba_data/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import version as __version__ 2 | -------------------------------------------------------------------------------- /cemba_data/bulk/Snakefile_template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/bulk/Snakefile_template/__init__.py -------------------------------------------------------------------------------- /cemba_data/bulk/Snakefile_template/mc_bulk.Snakefile: -------------------------------------------------------------------------------- 1 | 2 | # Example (required) parameters 3 | # merge_allc_cpu = 10 4 | # mcg_context = 'CGN' 5 | # mch_context = 'CHN' 6 | # bigwig_mch_bin_size = 50 7 | # bigwig_mcg_bin_size = 1 8 | # chrom_size_path = 'PATH_TO_CHROM_SIZE_FILE' 9 | # group = 'GROUP_NAME' 10 | 11 | # the main rule is the final target 12 | rule main: 13 | input: 14 | f"{group}.{mcg_context}-both.frac.bw", 15 | f"{group}.{mcg_context}-both.cov.bw", 16 | f"{group}.{mch_context}-both.frac.bw", 17 | f"{group}.{mch_context}-both.cov.bw", 18 | f"{group}.{mcg_context}-Merge.allc.tsv.gz" 19 | 
20 | 21 | # Merge ALLC 22 | rule merge_allc: 23 | input: 24 | f"{group}.allc_paths.txt" 25 | output: 26 | allc=f"{group}.allc.tsv.gz", 27 | tbi=f"{group}.allc.tsv.gz.tbi" 28 | threads: 29 | max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1))) 30 | resources: 31 | mem_mb=merge_allc_cpu * 5000 32 | shell: 33 | "allcools merge-allc " 34 | "--allc_paths {input} " 35 | "--output_path {output.allc} " 36 | "--chrom_size_path {chrom_size_path} " 37 | "--cpu {threads}" 38 | 39 | 40 | # Extract mCG ALLC for DMR calling 41 | rule extract_allc_mcg: 42 | input: 43 | f"{group}.allc.tsv.gz" 44 | output: 45 | allc_cg=f"{group}.{mcg_context}-Merge.allc.tsv.gz", 46 | allc_cg_tbi=f"{group}.{mcg_context}-Merge.allc.tsv.gz.tbi" 47 | threads: 48 | 1 49 | resources: 50 | mem_mb=100 51 | shell: 52 | "allcools extract-allc " 53 | "--allc_path {input} " 54 | "--output_prefix {group} " 55 | "--mc_contexts {mcg_context} " 56 | "--chrom_size_path {chrom_size_path} " 57 | "--strandness merge " 58 | "--output_format allc " 59 | "--cpu {threads}" 60 | 61 | 62 | # Generate mCH BigWig files 63 | rule bigwig_ch: 64 | input: 65 | f"{group}.allc.tsv.gz" 66 | output: 67 | f"{group}.{mch_context}-both.cov.bw", 68 | f"{group}.{mch_context}-both.frac.bw" 69 | threads: 70 | 1 71 | resources: 72 | mem_mb=100 73 | shell: 74 | "allcools allc-to-bigwig " 75 | "--allc_path {input} " 76 | "--output_prefix {group} " 77 | "--bin_size {bigwig_mch_bin_size} " 78 | "--mc_contexts {mch_context} " 79 | "--chrom_size_path {chrom_size_path}" 80 | 81 | 82 | # Generate mCG BigWig files 83 | rule bigwig_cg: 84 | input: 85 | f"{group}.allc.tsv.gz" 86 | output: 87 | f"{group}.{mcg_context}-both.cov.bw", 88 | f"{group}.{mcg_context}-both.frac.bw" 89 | threads: 90 | 1 91 | resources: 92 | mem_mb=100 93 | shell: 94 | "allcools allc-to-bigwig " 95 | "--allc_path {input} " 96 | "--output_prefix {group} " 97 | "--bin_size {bigwig_mcg_bin_size} " 98 | "--mc_contexts {mcg_context} " 99 | "--chrom_size_path 
def prepare_mc_bulk(allc_table,
                    output_dir,
                    chrom_size_path,
                    mch_context='CHN',
                    mcg_context='CGN',
                    bigwig_mch_bin_size=50,
                    bigwig_mcg_bin_size=1,
                    cpu_per_job=12,
                    total_cpu=60):
    """
    Prepare the snakefile for merging single-cell ALLC files into pseudo-bulk

    Parameters
    ----------
    allc_table
        Path of the allc table. The allc table is a two column tsv (or csv) file.
        The first column is the absolute ALLC file paths;
        the second column is the group name of each file.
    output_dir
        Path of the output directory, will be created if not exist.
    chrom_size_path
        Path of the chromosome size file path
    mch_context
        mCH contexts for generating the bigwig tracks
    mcg_context
        mCG contexts for generating the bigwig tracks and merge strand
    bigwig_mch_bin_size
        Bin size used to generate mCH bigwig
    bigwig_mcg_bin_size
        Bin size used to generate mCG bigwig
    cpu_per_job
        Number of CPUs to use in individual merge-allc job
    total_cpu
        Number of CPUs to use in total

    Returns
    -------
    None
    """
    snakemake_template_path = PACKAGE_DIR / 'bulk/Snakefile_template/mc_bulk.Snakefile'
    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir(parents=True, exist_ok=True)

    # give merge-allc slightly fewer CPUs than the job so the template's
    # 1.1x thread inflation stays within cpu_per_job
    merge_allc_cpu = int(cpu_per_job / 1.1)
    total_mem_mb = cpu_per_job * 5000

    # prepare ALLC path -> group mapping.
    # NOTE: the `squeeze=True` read_csv keyword was removed in pandas 2.0;
    # read a one-column frame and squeeze the column axis explicitly instead.
    sep = ',' if str(allc_table).endswith('csv') else '\t'
    allc_path = pd.read_csv(allc_table, sep=sep, index_col=0,
                            header=None).squeeze(axis='columns')
    # fail early with an explicit message if any listed ALLC file is missing
    file_not_exist = allc_path[allc_path.index.map(
        lambda i: not pathlib.Path(i).exists())]
    if file_not_exist.size != 0:
        path_str = "\n".join(file_not_exist.index.tolist())
        raise FileNotFoundError(f'{file_not_exist.size} files do not exist:'
                                f'\n{path_str}')
    allc_dict = {group: paths.index.tolist()
                 for group, paths in allc_path.groupby(allc_path)}

    # Prepare one standalone Snakefile (parameters + template) per group
    snakemake_cmds = []
    for group, paths in allc_dict.items():
        # each group has a separate snakemake file
        group_dir = output_dir / group
        group_dir.mkdir(exist_ok=True)
        allc_list_path = group_dir / f'{group}.allc_paths.txt'
        with open(allc_list_path, 'w') as f:
            f.write('\n'.join(paths))
        # parameter assignments must stay at column 0 (valid Snakefile python)
        snakemake_parameters = f"""
merge_allc_cpu = {merge_allc_cpu}
mch_context = '{mch_context}'
mcg_context = '{mcg_context}'
bigwig_mch_bin_size = {bigwig_mch_bin_size}
bigwig_mcg_bin_size = {bigwig_mcg_bin_size}
chrom_size_path = '{chrom_size_path}'
group = '{group}'

"""
        with open(snakemake_template_path) as f:
            snakemake_template = f.read()
        snakemake_str = snakemake_parameters + snakemake_template
        with open(group_dir / 'Snakefile', 'w') as f:
            f.write(snakemake_str)
        snakemake_cmd = f'snakemake ' \
                        f'-d {group_dir.absolute()} ' \
                        f'--snakefile {group_dir.absolute()}/Snakefile ' \
                        f'-j {cpu_per_job} ' \
                        f'--default-resources mem_mb=100 ' \
                        f'--resources mem_mb={total_mem_mb} ' \
                        f'--rerun-incomplete'
        snakemake_cmds.append(snakemake_cmd)

    # write the per-group commands plus a qsub wrapper that fans them out
    qsub_dir = output_dir / 'qsub'
    qsub_dir.mkdir(exist_ok=True)
    with open(qsub_dir / 'snakemake_cmds.txt', 'w') as f:
        f.write('\n'.join(snakemake_cmds))
    with open(qsub_dir / 'qsub.sh', 'w') as f:
        qsub_str = f"""
yap qsub \
--command_file_path {qsub_dir / 'snakemake_cmds.txt'} \
--working_dir {qsub_dir} \
--project_name merge \
--total_cpu {total_cpu} \
--qsub_global_parms "-pe smp={cpu_per_job};-l h_vmem=5G"
"""
        f.write(qsub_str)
    print(f'Execute this command to start pipeline:\nnohup sh {qsub_dir / "qsub.sh"} &')
    return
def merge_bulk_multigroup(group_path, output_path, chrom_size_path,
                          n_cpu=10, elem_snakegroup_num = 50,
                          cate_snakegroup_num = 10, ):
    """
    Prepare snakemake workflows that merge single-cell ALLC files into
    pseudo-bulk ALLC files for several grouping columns at once.

    ``group_path`` is a CSV with a header: the first column holds ALLC file
    paths, each remaining column is one grouping of the cells. Two command
    files are written into ``output_path``: ``run_snakemake_cmds_1.txt``
    (merge each unique label combination, the "_elem" level) and
    ``run_snakemake_cmds_2.txt`` (merge _elem outputs per category column);
    the second batch reads the first batch's output files.

    Parameters
    ----------
    group_path
        Path of the headered group CSV described above.
    output_path
        Output directory; created if missing.
    chrom_size_path
        Chromosome size file path, written into every generated snakefile.
    n_cpu
        CPU count written into each snakefile and snakemake command.
    elem_snakegroup_num
        Number of _elem samples bundled into one snakefile.
    cate_snakegroup_num
        Number of category samples bundled into one snakefile.
    """

    outdir = Path(output_path)
    outdir.mkdir(parents=True, exist_ok=True)
    # keep a copy of the grouping table next to the outputs for provenance
    shutil.copyfile(group_path, outdir/'GROUP.csv')

    df = pd.read_csv(group_path)
    # normalize the first (path) column name; the rest are category columns
    df = df.rename(columns={df.columns[0]:'_path'})
    sample_cates = df.columns[1:]

    # "_elem" = one integer id per unique combination of all category labels,
    # suffixed with the number of rows carrying that combination
    df['_elem'] = pd.factorize(df[sample_cates].astype(str).apply('-'.join, axis=1))[0]
    countdict = df['_elem'].value_counts().to_dict()
    df['_elem'] = df['_elem'].apply(lambda x: f'{x}_{countdict[x]}')

    # _elem-level table: one row per elem, with the list of raw ALLC paths
    elem_grp_df = df.groupby('_elem')['_path'].apply(lambda x: x.unique()).to_frame()
    elem_grp_df.index.name = '_sample'
    elem_grp_df['_cate'] = '_elem'
    elem_grp_df = elem_grp_df.reset_index()[['_cate','_sample','_path']]

    # from here on work with one row per elem (drop the raw path column)
    df = df[df.columns[1:]].drop_duplicates()

    # category-level merges consume the merged _elem ALLC outputs
    df['_path'] = output_path+'/_elem/'+df['_elem']+'.allc.tsv.gz'

    # category-level table: one row per (category column, label) pair
    cate_grp_df = []
    for cate in sample_cates:
        catedf = df[[cate,'_path']].groupby(cate)['_path'].apply(lambda x: x.unique()).to_frame()
        catedf['_cate'] = cate
        catedf.index.name = '_sample'
        catedf = catedf.reset_index()
        cate_grp_df.append(catedf)
    cate_grp_df = pd.concat(cate_grp_df).reset_index(drop=True)[['_cate','_sample','_path']]


    def prepare_snakefiles(grp_df, output_path, tag, n_per_snake=None, template=MERGE_TEMPLATE):
        # Write per-sample path lists and bundle samples into snakefiles
        # of at most n_per_snake samples each; returns the snakefile ids.
        outdir = Path(output_path)
        snkdir = outdir/'snakefiles'
        snkdir.mkdir(exist_ok=True)

        # one sub-directory per category
        for cate in grp_df['_cate'].unique():
            catedir = outdir/cate
            catedir.mkdir(exist_ok=True)

        # one input path list per sample
        for _,(cate,sample,paths) in grp_df.iterrows():
            catedir = outdir/cate
            with open(catedir/f'{sample}.pathlist','w') as f:
                f.write('\n'.join(paths))

        if n_per_snake is None:
            n_per_snake = len(grp_df)

        snk_ids = []
        # round-robin assignment of samples to snakefile bundles
        for i, snkdf in grp_df.groupby(grp_df.index%n_per_snake):
            snk_id = f'{tag}_{i}'

            # samples backed by a single ALLC file are copied, not merged
            tocp_df = snkdf[snkdf['_path'].apply(len)==1]
            tomg_df = snkdf[snkdf['_path'].apply(len)>1]

            # parameter header (must stay at column 0) followed by the template
            with open(snkdir/f'{snk_id}.snakefile', 'w') as f:
                f.write(
                    f'''merge_allc_cpu = {n_cpu}
mcg_context = 'CGN'
chrom_size_path = '{chrom_size_path}'
merge_sample_prefixes = [{','.join("'"+tomg_df['_cate']+'/'+tomg_df['_sample']+"'")}]
copy_sample_prefixes = [{','.join("'"+tocp_df['_cate']+'/'+tocp_df['_sample']+"'")}]
group = "{snk_id}"
'''
                )
                f.write(template)
            snk_ids.append(snk_id)

        return snk_ids

    elem_snk_ids = prepare_snakefiles(elem_grp_df, output_path, 'elem',elem_snakegroup_num, template=MERGE_TEMPLATE)
    cate_snk_ids = prepare_snakefiles(cate_grp_df, output_path, 'cate',cate_snakegroup_num, template=MERGE_EXTRACT_TEMPLATE)

    def prepare_commands(snake_ids):
        # Build one snakemake invocation per generated snakefile.
        cmds = [f'snakemake -d {outdir.resolve()} --snakefile {outdir.resolve()}/snakefiles/{snkid}.snakefile '
                f'-j {n_cpu} --default-resources mem_mb=100 --resources mem_mb=1000 --rerun-incomplete' \
                for snkid in snake_ids]
        return cmds


    # batch 1 (_elem merges) must finish before batch 2 (category merges),
    # because batch 2's inputs are batch 1's outputs
    with open(outdir/'run_snakemake_cmds_1.txt', 'w') as f:
        f.write('\n'.join(prepare_commands(elem_snk_ids)))
    with open(outdir/'run_snakemake_cmds_2.txt', 'w') as f:
        f.write('\n'.join(prepare_commands(cate_snk_ids)))
5 | # merge_sample_prefixes = '[]' 6 | # copy_sample_prefixes = '[]' 7 | # group = 'GROUP_NAME' 8 | sample_prefixes = merge_sample_prefixes + copy_sample_prefixes 9 | 10 | # the main rule is the final target 11 | rule main: 12 | input: 13 | expand("{sample}.allc.tsv.gz", sample=sample_prefixes), 14 | expand("{sample}.allc.tsv.gz.tbi", sample=sample_prefixes), 15 | # output: 16 | # f"{group}.finished" 17 | # shell: 18 | # "date > {output}" 19 | 20 | 21 | 22 | # Merge ALLC 23 | rule merge_allc: 24 | input: 25 | "{sample}.pathlist", 26 | output: 27 | allc="{sample}.allc.tsv.gz", 28 | tbi="{sample}.allc.tsv.gz.tbi" 29 | threads: 30 | max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1))) 31 | resources: 32 | mem_mb=merge_allc_cpu * 5000 33 | run: 34 | if wildcards.sample in merge_sample_prefixes: 35 | shell("allcools merge-allc " 36 | "--allc_paths {input} " 37 | "--output_path {output.allc} " 38 | "--chrom_size_path {chrom_size_path} " 39 | "--cpu {threads}") 40 | else: 41 | shell("cp $(cat {input}) {output.allc} ;" 42 | "cp $(cat {input}).tbi {output.tbi} ;") 43 | 44 | ''' 45 | 46 | MERGE_EXTRACT_TEMPLATE = ''' 47 | # Example (required) parameters 48 | # merge_allc_cpu = 10 49 | # mcg_context = 'CGN' 50 | # chrom_size_path = 'PATH_TO_CHROM_SIZE_FILE' 51 | # merge_sample_prefixes = '[]' 52 | # copy_sample_prefixes = '[]' 53 | # group = 'GROUP_NAME' 54 | sample_prefixes = merge_sample_prefixes + copy_sample_prefixes 55 | 56 | # the main rule is the final target 57 | rule main: 58 | input: 59 | expand("{sample}.{mcg_context}-Merge.allc.tsv.gz", sample=sample_prefixes, mcg_context=[mcg_context]), 60 | expand("{sample}.{mcg_context}-Merge.allc.tsv.gz.tbi", sample=sample_prefixes, mcg_context=[mcg_context]), 61 | # output: 62 | # f"{group}.finished" 63 | # shell: 64 | # "date > {output}" 65 | 66 | 67 | # Merge ALLC 68 | rule merge_allc: 69 | input: 70 | "{sample}.pathlist", 71 | output: 72 | allc="{sample}.allc.tsv.gz", 73 | tbi="{sample}.allc.tsv.gz.tbi" 74 | 
threads: 75 | max(1, min(int(1.1 * merge_allc_cpu), int(workflow.cores / 1.1))) 76 | resources: 77 | mem_mb=merge_allc_cpu * 5000 78 | run: 79 | if wildcards.sample in merge_sample_prefixes: 80 | shell("allcools merge-allc " 81 | "--allc_paths {input} " 82 | "--output_path {output.allc} " 83 | "--chrom_size_path {chrom_size_path} " 84 | "--cpu {threads}") 85 | else: 86 | shell("cp $(cat {input}) {output.allc} ;" 87 | "cp $(cat {input}).tbi {output.tbi} ;") 88 | 89 | # Extract mCG ALLC for DMR calling 90 | rule extract_allc_mcg: 91 | input: 92 | "{sample}.allc.tsv.gz" 93 | output: 94 | allc_cg="{sample}.{mcg_context}-Merge.allc.tsv.gz", 95 | allc_cg_tbi="{sample}.{mcg_context}-Merge.allc.tsv.gz.tbi" 96 | threads: 97 | 1 98 | resources: 99 | mem_mb=100 100 | shell: 101 | "allcools extract-allc " 102 | "--allc_path {input} " 103 | "--output_prefix {wildcards.sample} " 104 | "--mc_contexts {mcg_context} " 105 | "--chrom_size_path {chrom_size_path} " 106 | "--strandness merge " 107 | "--output_format allc " 108 | "--cpu {threads}" 109 | ''' 110 | -------------------------------------------------------------------------------- /cemba_data/bulk/mct_bulk.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import pandas as pd 3 | import glob 4 | import subprocess 5 | from concurrent.futures import ProcessPoolExecutor, as_completed 6 | import os 7 | 8 | 9 | def merge_single_bam(bam_path, cell_id_to_cluster, output_prefix, header_dict): 10 | header = pysam.AlignmentHeader.from_dict(header_dict) 11 | clusters = set(cell_id_to_cluster.values()) 12 | cluster_read_counts = {c: 0 for c in clusters} 13 | 14 | # write reads by cluster 15 | with pysam.AlignmentFile(bam_path, "rb") as bam_file: 16 | # open BAM handles for each cluster 17 | cluster_handles = {} 18 | for cluster in clusters: 19 | cluster_handles[cluster] = pysam.AlignmentFile( 20 | f'{output_prefix}_{cluster}.bam', "wb", header=header) 21 | 22 | for read in bam_file: 
def merge_mct_cluster_bam(cell_id_to_cluster_path,
                          bam_list_path,
                          output_prefix,
                          cpu=10):
    """
    Merge per-cell reads from many BAM files into one BAM file per cluster.

    Parameters
    ----------
    cell_id_to_cluster_path
        Two-column table without header: cell id (index) and cluster label.
    bam_list_path
        File listing one input BAM path per line; reads carry their cell id
        in the RG tag (see ``merge_single_bam``).
    output_prefix
        Prefix of the final outputs, written as ``{output_prefix}_{cluster}.bam``
        plus a ``samtools index`` .bai for each.
    cpu
        Number of worker processes for both the split and merge phases.
    """
    # NOTE: the `squeeze=True` read_csv keyword was removed in pandas 2.0;
    # squeeze the column axis explicitly instead.
    cell_id_to_cluster = pd.read_csv(
        cell_id_to_cluster_path,
        index_col=0,
        header=None).squeeze(axis='columns').to_dict()
    bam_paths = pd.read_csv(bam_list_path, header=None)[0].tolist()

    # take the header from the first BAM and drop cell-specific records so
    # all cluster BAMs share one clean header
    with pysam.AlignmentFile(bam_paths[0]) as bam:
        header_dict = bam.header.as_dict()
        keys_to_delete = ['PG', 'RG', 'CO']
        for k in keys_to_delete:
            if k in header_dict:
                del header_dict[k]

    clusters = set(cell_id_to_cluster.values())
    total_cluster_read_counts = {c: 0 for c in clusters}

    # phase 1: split every input BAM into per-cluster chunk files in parallel;
    # chunk files are named {output_prefix}{i:06d}_{cluster}.bam
    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        for i, path in enumerate(bam_paths):
            f = exe.submit(merge_single_bam,
                           bam_path=path,
                           cell_id_to_cluster=cell_id_to_cluster,
                           output_prefix=f'{output_prefix}{i:06d}',
                           header_dict=header_dict)
            futures[f] = path

        for f in as_completed(futures):
            cluster_read_counts = f.result()
            for k, v in cluster_read_counts.items():
                total_cluster_read_counts[k] += v

    # phase 2: merge each cluster's chunks into the final BAM
    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        for cluster in clusters:
            # match the six-digit chunk id explicitly: a bare
            # '{output_prefix}*_{cluster}.bam' glob would also match the
            # final output file itself (wildcard = empty) and any cluster
            # whose name ends with '_{cluster}'
            chunk_pattern = (f'{output_prefix}'
                             f'[0-9][0-9][0-9][0-9][0-9][0-9]_{cluster}.bam')
            chunk_paths = sorted(glob.glob(chunk_pattern))
            if len(chunk_paths) == 0:
                continue
            # pass the resolved chunk paths explicitly instead of letting the
            # shell re-glob, so the merged input set is exactly what we delete
            merge_cmd = f'samtools merge --no-PG -c -o {output_prefix}_{cluster}.bam ' \
                        f'{" ".join(chunk_paths)} && ' \
                        f'samtools index {output_prefix}_{cluster}.bam'
            f = exe.submit(subprocess.run,
                           merge_cmd,
                           shell=True,
                           check=True)
            futures[f] = chunk_paths

        for f in as_completed(futures):
            chunk_paths = futures[f]
            f.result()
            # remove chunk files only after the merged BAM is written
            for path in chunk_paths:
                os.unlink(path)
    return
def _parse_v2_fastq_path(path):
    """
    Parse a V2 FASTQ file path into its naming fields.

    UID pattern of V2: {sample_id_prefix}-{plate}-{multiplex_group}-{barcode_name}
    FASTQ name pattern of V2:
    {sample_id_prefix}-{plate}-{multiplex_group}-{barcode_name}_{internal_info}_{lane}_{read_type}_{internal_info}.fastq.gz

    Parameters
    ----------
    path
        Path of a single FASTQ file following the V2 naming pattern.

    Returns
    -------
    pd.Series
        Fields: plate, multiplex_group, primer_name, lane, read_type,
        fastq_path, uid.

    Raises
    ------
    ValueError
        If the file name does not match the expected V2 pattern.
    """
    path = pathlib.Path(path)
    try:
        *_, plate, multiplex_group, multi_field = path.name.split('-')
        primer_name, _, lane, read_type, _ = multi_field.split('_')
        try:
            # primer well position: rows A-P, columns 1-24 (384-well plate)
            assert primer_name[0] in 'ABCDEFGHIJKLMNOP'
            assert int(primer_name[1:]) in list(range(1, 25))
            assert int(multiplex_group) in list(range(1, 7))
            assert lane in {'L001', 'L002', 'L003', 'L004'}
            assert read_type in {'R1', 'R2'}
        except AssertionError:
            # normalize validation failures to the same error as parse failures
            raise ValueError
    except ValueError:
        raise ValueError(f'Found unknown name pattern in path {path}')
    name_dict = dict(plate=plate,
                     multiplex_group=multiplex_group,
                     primer_name=primer_name,
                     lane=lane,
                     read_type=read_type,
                     fastq_path=path,
                     uid=f'{plate}-{multiplex_group}-{primer_name}')
    name_series = pd.Series(name_dict)
    return name_series
86 | barcode_version 87 | Only accept two options: 1) V1 for 8 random index; 2) V2 for 384 random index. 88 | output_path 89 | output path of the fastq dataframe 90 | Returns 91 | ------- 92 | fastq_dataframe for pipeline input. 93 | """ 94 | barcode_version = barcode_version.upper() 95 | if barcode_version == 'V1': 96 | parser = _parse_v1_fastq_path 97 | elif barcode_version == 'V2': 98 | parser = _parse_v2_fastq_path 99 | else: 100 | raise ValueError(f'Primer Version can only be V1 or V2, got {barcode_version}.') 101 | 102 | if isinstance(file_path, str) and ('*' in file_path): 103 | file_path = [str(pathlib.Path(p).absolute()) for p in glob.glob(file_path)] 104 | elif isinstance(file_path, list): 105 | pass 106 | else: 107 | with open(file_path) as f: 108 | file_path = [line.strip() for line in f] 109 | log.info(f'{len(file_path)} FASTQ file paths in input') 110 | 111 | fastq_data = [] 112 | for path in file_path: 113 | name_series = parser(path) 114 | fastq_data.append(name_series) 115 | fastq_df = pd.DataFrame(fastq_data) 116 | log.info(f'{fastq_df.shape[0]} valid fastq names.') 117 | if fastq_df.shape[0] == 0: 118 | log.info('No fastq name remained, check if the name pattern is correct.') 119 | return None 120 | 121 | # make sure UID is unique 122 | for _, df in fastq_df.groupby(['lane', 'read_type']): 123 | if df['uid'].unique().size != df['uid'].size: 124 | raise ValueError(f'UID column is not unique.') 125 | if output_path is not None: 126 | fastq_df.to_csv(output_path, index=False) 127 | return fastq_df 128 | -------------------------------------------------------------------------------- /cemba_data/dmr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/dmr/__init__.py -------------------------------------------------------------------------------- /cemba_data/dmr/dss/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .TwoGroup import run_dss_two_group 2 | from .MultiGroup import run_dss_multi_group -------------------------------------------------------------------------------- /cemba_data/files/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/__init__.py -------------------------------------------------------------------------------- /cemba_data/files/default_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/default_config/__init__.py -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_4m.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 16 | ; 17 | [mode] 18 | mode = 4m 19 | 20 | 21 | [multiplexIndex] 22 | ; This section is for demultiplex step 23 | ; V1: 8 random index version 24 | ; V2: 384 random index version 25 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 26 | 27 | 28 | [fastqTrim] 29 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 30 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 31 | ; Universal illumina adapter 32 | 33 | overlap = 6 34 | ; least overlap of base and illumina adapter 35 | 36 | r1_left_cut = 10 37 | ; constant length to trim at 5 prime end, apply before quality trim. 
38 | ; Aim to cut random primer part, determined by random primer length. 39 | ; Random primer can impact results https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 40 | 41 | r1_right_cut = 10 42 | ; constant length to trim at 3 prime end, apply before quality trim. 43 | 44 | r2_left_cut = 10 45 | ; constant length to trim at 5 prime end, apply before quality trim. 46 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 47 | 48 | r2_right_cut = 10 49 | ; constant length to trim at 3 prime end, apply before quality trim. 50 | 51 | quality_threshold = 20 52 | ; reads quality score threshold for trimming. 53 | 54 | length_threshold = 30 55 | ; reads length threshold after all trim steps. 56 | 57 | total_read_pairs_min = 1 58 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 59 | 60 | total_read_pairs_max = 6000000 61 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 62 | 63 | 64 | [mapping reference] 65 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 66 | ; reference directory of bismark 67 | 68 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 69 | ; reference prefix for the HISAT-3N DNA mapping 70 | 71 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 72 | ; reference prefix for the HISAT-3N RNA mapping 73 | 74 | hisat3n_repeat_index_type = no-repeat 75 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 76 | ; if "no-repeat", will run hisat-3n in the normal mode. 77 | 78 | 79 | [readSplit] 80 | trim_on_both_end = 5 81 | ; whether trim the unmapped reads before split. 
82 | 83 | split_left_size = 40 84 | ; length of the left part of the split 85 | 86 | split_right_size = 40 87 | ; length of the right part of the split 88 | 89 | split_middle_min_size = 30 90 | ; minimum length of the middle part after the split, middle part shorter than this will not be used. 91 | 92 | split_min_read_length = 30 93 | ; minimum length of the read to perform split, read shorter than this will not be used. 94 | 95 | 96 | [star] 97 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR 98 | ; reference directory of STAR 99 | 100 | 101 | [bamFilter] 102 | mapq_threshold = 10 103 | ; reads MAPQ threshold 104 | 105 | 106 | [DNAReadsFilter] 107 | mc_rate_max_threshold = 0.5 108 | ; if read CH ratio >= mc_rate_max_threshold, skip this read 109 | 110 | dna_cov_min_threshold = 3 111 | ; if read CH sites <= cov_min_threshold, skip this read 112 | 113 | 114 | [RNAReadsFilter] 115 | mc_rate_min_threshold = 0.9 116 | ; if read CH ratio <= mc_rate_min_threshold, skip this read 117 | 118 | rna_cov_min_threshold = 3 119 | ; if read CH sites <= cov_min_threshold, skip this read 120 | 121 | nome_flag_str = --nome 122 | ; if '--nome', will exclude GpC sites from the read-level methylation fraction calculation 123 | 124 | 125 | [callMethylation] 126 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 127 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 128 | 129 | num_upstr_bases = 1 130 | ; number of base to include before mC 131 | ; change this to 1 for NOMe treatment to get GpCNN 132 | 133 | num_downstr_bases = 2 134 | ; number of base to include after mC 135 | 136 | compress_level = 5 137 | ; ALLC file compress level 138 | 139 | mc_stat_feature = HCHN HCYN HCGN HCCC GCYN GCHN 140 | ; mC patterns to check when calculate ALLC summary 141 | 142 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH 143 | ; alias for the above mC patterns in the summary table 144 | 145 | 146 | [featureCount] 147 | gtf_path = 
CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF 148 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference. 149 | 150 | feature_type = gene 151 | ; type of feature to count, pass to featureCount -t parameter 152 | 153 | id_type = gene_id 154 | ; type of feature id to use in the output file, pass to featureCount -g parameter 155 | 156 | 157 | [contact] 158 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 159 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 160 | ; chrom size file has two tab-separated columns and not header 161 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 162 | 163 | min_gap = 2500 164 | ; minimum gap distance for a read pair being considered as cis-long -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_m3c.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 16 | ; 17 | 18 | [mode] 19 | mode = m3c 20 | 21 | 22 | [multiplexIndex] 23 | ; This section is for demultiplex step 24 | ; V1: 8 random index version 25 | ; V2: 384 random index version 26 | ; put V1 or V2 here 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 28 | 29 | 30 | [fastqTrim] 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 33 | ; Universal illumina adapter 34 | 35 | overlap = 6 36 | ; least overlap of base and illumina adapter 37 | 38 | r1_left_cut = 10 39 | ; constant length to trim at 5 prime end, apply before quality trim. 
40 | ; Aim to cut random primer part, determined by random primer length. 41 | ; Random primer can impact results, see below: 42 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 43 | 44 | r1_right_cut = 10 45 | ; constant length to trim at 3 prime end, apply before quality trim. 46 | 47 | r2_left_cut = 10 48 | ; constant length to trim at 5 prime end, apply before quality trim. 49 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 50 | 51 | r2_right_cut = 10 52 | ; constant length to trim at 3 prime end, apply before quality trim. 53 | 54 | quality_threshold = 20 55 | ; reads quality score threshold for trimming. 56 | 57 | length_threshold = 30 58 | ; reads length threshold after all trim steps. 59 | 60 | total_read_pairs_min = 1 61 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 62 | 63 | total_read_pairs_max = 6000000 64 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 65 | 66 | 67 | [mapping reference] 68 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 69 | ; reference directory of bismark 70 | 71 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 72 | ; reference prefix for the HISAT-3N DNA mapping 73 | 74 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 75 | ; reference prefix for the HISAT-3N RNA mapping 76 | 77 | hisat3n_repeat_index_type = no-repeat 78 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 79 | ; if "no-repeat", will run hisat-3n in the normal mode. 80 | 81 | 82 | [readSplit] 83 | trim_on_both_end = 5 84 | ; whether to trim the unmapped reads before split.
85 | 86 | split_left_size = 40 87 | ; length of the left part of the split 88 | 89 | split_right_size = 40 90 | ; length of the right part of the split 91 | 92 | split_middle_min_size = 30 93 | ; minimum length of the middle part after the split, middle part shorter than this will not be used. 94 | 95 | split_min_read_length = 30 96 | ; minimum length of the read to perform split, read shorter than this will not be used. 97 | 98 | 99 | [bamFilter] 100 | mapq_threshold = 10 101 | ; reads MAPQ threshold 102 | 103 | 104 | [callMethylation] 105 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 106 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 107 | 108 | num_upstr_bases = 0 109 | ; number of base to include before mC, use 0 for normal snmC, use 1 for NOMe treatment 110 | 111 | num_downstr_bases = 2 112 | ; number of base to include after mC 113 | 114 | compress_level = 5 115 | ; ALLC file compress level 116 | 117 | mc_stat_feature = CHN CGN CCC 118 | ; this is based on the num_upstr_bases and num_downstr_bases 119 | ; mC patterns to check when calculate ALLC summary, separated by space 120 | 121 | mc_stat_alias = mCH mCG mCCC 122 | ; alias for the above mC patterns in the summary table, 123 | ; separated by space and follow the same order as mc_stat_feature 124 | 125 | 126 | [contact] 127 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 128 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 129 | ; chrom size file has two tab-separated columns and not header 130 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 
131 | 132 | min_gap = 2500 133 | ; minimum gap distance for a read pair being considered as cis-long -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_mc.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 16 | ; 17 | 18 | [mode] 19 | mode = mc 20 | 21 | 22 | [multiplexIndex] 23 | ; This section is for demultiplex step 24 | ; V1: 8 random index version 25 | ; V2: 384 random index version 26 | ; put V1 or V2 here 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 28 | 29 | 30 | [fastqTrim] 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 33 | ; Universal illumina adapter 34 | 35 | overlap = 6 36 | ; least overlap of base and illumina adapter 37 | 38 | r1_left_cut = 10 39 | ; constant length to trim at 5 prime end, apply before quality trim. 40 | ; Aim to cut random primer part, determined by random primer length. 41 | ; Random primer can impact results, see bellow: 42 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 43 | 44 | r1_right_cut = 10 45 | ; constant length to trim at 3 prime end, apply before quality trim. 46 | 47 | r2_left_cut = 10 48 | ; constant length to trim at 5 prime end, apply before quality trim. 49 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 50 | 51 | r2_right_cut = 10 52 | ; constant length to trim at 3 prime end, apply before quality trim. 53 | 54 | quality_threshold = 20 55 | ; reads quality score threshold for trimming. 
56 | 57 | length_threshold = 30 58 | ; reads length threshold after all trim steps. 59 | 60 | total_read_pairs_min = 1 61 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 62 | 63 | total_read_pairs_max = 6000000 64 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 65 | 66 | 67 | [mapping reference] 68 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 69 | ; reference directory of bismark 70 | 71 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 72 | ; reference prefix for the HISAT-3N DNA mapping 73 | 74 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 75 | ; reference prefix for the HISAT-3N RNA mapping 76 | 77 | hisat3n_repeat_index_type = no-repeat 78 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 79 | ; if "no-repeat", will run hisat-3n in the normal mode. 80 | 81 | unmapped_fastq = False 82 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose. 
83 | 84 | [bamFilter] 85 | mapq_threshold = 10 86 | ; reads MAPQ threshold 87 | 88 | 89 | [callMethylation] 90 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 91 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 92 | 93 | num_upstr_bases = 0 94 | ; number of base to include before mC, use 0 for normal snmC, use 1 for NOMe treatment 95 | 96 | num_downstr_bases = 2 97 | ; number of base to include after mC 98 | 99 | compress_level = 5 100 | ; ALLC file compress level 101 | 102 | mc_stat_feature = CHN CGN CCC 103 | ; this is based on the num_upstr_bases and num_downstr_bases 104 | ; mC patterns to check when calculate ALLC summary, separated by space 105 | 106 | mc_stat_alias = mCH mCG mCCC 107 | ; alias for the above mC patterns in the summary table, 108 | ; separated by space and follow the same order as mc_stat_feature 109 | 110 | [allcPostprocessing] 111 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 112 | ; This file is needed when extract mCG sites from ALLC file. 113 | ; The UCSC chrom sizes file contain two tab separated columns 114 | ; the 1st column is the names of chromosomes, the names should be the same as your reference_fasta 115 | ; the 2nd column is the length of chromosomes. 116 | -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_mct-nome.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 
16 | ; 17 | [mode] 18 | # for mCAT, we are still using mCT mode for simplicity, 19 | # the two differences specifically changed in this file for NOMe treatment are: 20 | # 1. [callMethylation] num_upstr_bases = 1 21 | # 2. [callMethylation] mc_stat_feature and mc_stat_alias changed 22 | mode = mct 23 | 24 | 25 | [multiplexIndex] 26 | ; This section is for demultiplex step 27 | ; V1: 8 random index version 28 | ; V2: 384 random index version 29 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 30 | 31 | 32 | [fastqTrim] 33 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 34 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 35 | ; Universal illumina adapter 36 | 37 | overlap = 6 38 | ; least overlap of base and illumina adapter 39 | 40 | r1_left_cut = 10 41 | ; constant length to trim at 5 prime end, apply before quality trim. 42 | ; Aim to cut random primer part, determined by random primer length. 43 | ; Random primer can impact results, see below 44 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 45 | 46 | r1_right_cut = 10 47 | ; constant length to trim at 3 prime end, apply before quality trim. 48 | 49 | r2_left_cut = 10 50 | ; constant length to trim at 5 prime end, apply before quality trim. 51 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 52 | 53 | r2_right_cut = 10 54 | ; constant length to trim at 3 prime end, apply before quality trim. 55 | 56 | quality_threshold = 20 57 | ; reads quality score threshold for trimming. 58 | 59 | length_threshold = 30 60 | ; reads length threshold after all trim steps. 61 | 62 | total_read_pairs_min = 1 63 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 64 | 65 | total_read_pairs_max = 6000000 66 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps.
67 | 68 | 69 | [mapping reference] 70 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 71 | ; reference directory of bismark 72 | 73 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 74 | ; reference prefix for the HISAT-3N DNA mapping 75 | 76 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 77 | ; reference prefix for the HISAT-3N RNA mapping 78 | 79 | hisat3n_repeat_index_type = no-repeat 80 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 81 | ; if "no-repeat", will run hisat-3n in the normal mode. 82 | 83 | unmapped_fastq = False 84 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose. 85 | 86 | 87 | [star] 88 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR 89 | ; reference directory of STAR 90 | 91 | 92 | [bamFilter] 93 | mapq_threshold = 10 94 | ; reads MAPQ threshold 95 | 96 | 97 | [DNAReadsFilter] 98 | mc_rate_max_threshold = 0.5 99 | ; if read CH ratio >= mc_rate_max_threshold, skip this read 100 | 101 | dna_cov_min_threshold = 3 102 | ; if read CH sites <= cov_min_threshold, skip this read 103 | 104 | [RNAReadsFilter] 105 | mc_rate_min_threshold = 0.9 106 | ; if read CH ratio <= mc_rate_min_threshold, skip this read 107 | 108 | rna_cov_min_threshold = 3 109 | ; if read CH sites <= cov_min_threshold, skip this read 110 | 111 | nome_flag_str = 112 | ; if '--nome', will exclude GpC sites from the read-level methylation fraction calculation 113 | 114 | 115 | [callMethylation] 116 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 117 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 118 | 119 | num_upstr_bases = 1 120 | ; number of base to include before mC 121 | ; change this to 1 for NOMe treatment to get GpCNN 122 | 123 | num_downstr_bases = 2 124 | ; number of base to include after mC 125 | 126 | compress_level = 5 127 | ; ALLC file compress level 128 | 129 | mc_stat_feature = HCHN 
HCYN HCGN HCCC GCYN GCHN 130 | ; mC patterns to check when calculate ALLC summary 131 | 132 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH 133 | ; alias for the above mC patterns in the summary table 134 | 135 | [featureCount] 136 | gtf_path = CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF 137 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference. 138 | 139 | feature_type = gene 140 | ; type of feature to count, pass to featureCount -t parameter 141 | 142 | id_type = gene_id 143 | ; type of feature id to use in the output file, pass to featureCount -g parameter 144 | 145 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 146 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 147 | ; chrom size file has two tab-separated columns and not header 148 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 149 | -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_mct.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 
16 | ; 17 | 18 | [mode] 19 | mode = mct 20 | 21 | 22 | [multiplexIndex] 23 | ; This section is for demultiplex step 24 | ; V1: 8 random index version 25 | ; V2: 384 random index version 26 | ; put V1 or V2 here 27 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 28 | 29 | 30 | [fastqTrim] 31 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 32 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 33 | ; Universal illumina adapter 34 | 35 | overlap = 6 36 | ; least overlap of base and illumina adapter 37 | 38 | r1_left_cut = 10 39 | ; constant length to trim at 5 prime end, apply before quality trim. 40 | ; Aim to cut random primer part, determined by random primer length. 41 | ; Random primer can impact results https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 42 | 43 | r1_right_cut = 10 44 | ; constant length to trim at 3 prime end, apply before quality trim. 45 | 46 | r2_left_cut = 10 47 | ; constant length to trim at 5 prime end, apply before quality trim. 48 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 49 | 50 | r2_right_cut = 10 51 | ; constant length to trim at 3 prime end, apply before quality trim. 52 | 53 | quality_threshold = 20 54 | ; reads quality score threshold for trimming. 55 | 56 | length_threshold = 30 57 | ; reads length threshold after all trim steps. 58 | 59 | total_read_pairs_min = 1 60 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 61 | 62 | total_read_pairs_max = 6000000 63 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 
64 | 65 | 66 | [mapping reference] 67 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 68 | ; reference directory of bismark 69 | 70 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 71 | ; reference prefix for the HISAT-3N DNA mapping 72 | 73 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 74 | ; reference prefix for the HISAT-3N RNA mapping 75 | 76 | hisat3n_repeat_index_type = no-repeat 77 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 78 | ; if "no-repeat", will run hisat-3n in the normal mode. 79 | 80 | 81 | unmapped_fastq = False 82 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose. 83 | 84 | 85 | [star] 86 | star_reference = CHANGE_THIS_TO_YOUR_STAR_REFERENCE_DIR 87 | ; reference directory of STAR 88 | 89 | 90 | [bamFilter] 91 | mapq_threshold = 10 92 | ; reads MAPQ threshold 93 | 94 | 95 | [DNAReadsFilter] 96 | mc_rate_max_threshold = 0.5 97 | ; if read CH ratio >= mc_rate_max_threshold, skip this read 98 | 99 | dna_cov_min_threshold = 3 100 | ; if read CH sites <= cov_min_threshold, skip this read 101 | 102 | 103 | [RNAReadsFilter] 104 | mc_rate_min_threshold = 0.9 105 | ; if read CH ratio <= mc_rate_min_threshold, skip this read 106 | 107 | rna_cov_min_threshold = 3 108 | ; if read CH sites <= cov_min_threshold, skip this read 109 | 110 | nome_flag_str = 111 | 112 | [callMethylation] 113 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 114 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 115 | 116 | num_upstr_bases = 0 117 | ; number of base to include before mC 118 | 119 | num_downstr_bases = 2 120 | ; number of base to include after mC 121 | 122 | compress_level = 5 123 | ; ALLC file compress level 124 | 125 | mc_stat_feature = CHN CGN CCC 126 | ; mC patterns to check when calculate ALLC summary 127 | 128 | mc_stat_alias = mCH mCG mCCC 129 | ; alias for the above mC patterns in the 
summary table 130 | 131 | [featureCount] 132 | gtf_path = CHANGE_THIS_TO_YOUR_GENE_ANNOTATION_GTF 133 | ; path to gene annotation .gtf file. This must be the same as the one used in build STAR reference. 134 | 135 | feature_type = gene 136 | ; type of feature to count, pass to featureCount -t parameter 137 | 138 | id_type = gene_id 139 | ; type of feature id to use in the output file, pass to featureCount -g parameter 140 | 141 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 142 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 143 | ; chrom size file has two tab-separated columns and not header 144 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 145 | -------------------------------------------------------------------------------- /cemba_data/files/default_config/mapping_config_nome.ini: -------------------------------------------------------------------------------- 1 | ; Mapping configurations 2 | ; 3 | ; INI format 4 | ; [Section1] 5 | ; KEY1 = VALUE1 6 | ; KEY2 = VALUE2 7 | ; 8 | ; [Section2] 9 | ; KEY1 = VALUE1 10 | ; KEY2 = VALUE2 11 | ; 12 | ; lines start with ";" is comment. 13 | ; 14 | ; NOTE: Don't change any section or key names. 15 | ; Custom keys won't work, only change value when adjust parameters. 16 | ; 17 | [mode] 18 | # for NOMe treated snmC, we still using mc mode for simplicity, 19 | # the two differences specifically changed in this file for NOMe treatment are: 20 | # 1. [callMethylation] num_upstr_bases = 1 21 | # 2. 
[callMethylation] mc_stat_feature and mc_stat_alias changed 22 | mode = mc 23 | 24 | 25 | [multiplexIndex] 26 | ; This section is for demultiplex step 27 | ; V1: 8 random index version 28 | ; V2: 384 random index version 29 | ; put V1 or V2 here 30 | barcode_version = USE_CORRECT_BARCODE_VERSION_HERE 31 | 32 | 33 | [fastqTrim] 34 | r1_adapter = AGATCGGAAGAGCACACGTCTGAAC 35 | r2_adapter = AGATCGGAAGAGCGTCGTGTAGGGA 36 | ; Universal illumina adapter 37 | 38 | overlap = 6 39 | ; least overlap of base and illumina adapter 40 | 41 | r1_left_cut = 10 42 | ; constant length to trim at 5 prime end, apply before quality trim. 43 | ; Aim to cut random primer part, determined by random primer length. 44 | ; Random primer can impact results, see bellow: 45 | ; https://sequencing.qcfail.com/articles/mispriming-in-pbat-libraries-causes-methylation-bias-and-poor-mapping-efficiencies/ 46 | 47 | r1_right_cut = 10 48 | ; constant length to trim at 3 prime end, apply before quality trim. 49 | 50 | r2_left_cut = 10 51 | ; constant length to trim at 5 prime end, apply before quality trim. 52 | ; Aim to cut Y-tailing by adaptase, exact length is uncertain. 53 | 54 | r2_right_cut = 10 55 | ; constant length to trim at 3 prime end, apply before quality trim. 56 | 57 | quality_threshold = 20 58 | ; reads quality score threshold for trimming. 59 | 60 | length_threshold = 30 61 | ; reads length threshold after all trim steps. 62 | 63 | total_read_pairs_min = 1 64 | ; total minimum reads number threshold for a cell to be analyzed in subsequent steps. 65 | 66 | total_read_pairs_max = 6000000 67 | ; total maximum reads number threshold for a cell to be analyzed in subsequent steps. 
68 | 69 | [mapping reference] 70 | bismark_reference= CHANGE_THIS_TO_YOUR_BISMARK_REFERENCE_DIR 71 | ; reference directory of bismark 72 | 73 | hisat3n_dna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_DNA_REFERENCE 74 | ; reference prefix for the HISAT-3N DNA mapping 75 | 76 | hisat3n_rna_reference= CHANGE_THIS_TO_YOUR_HISAT3N_RNA_REFERENCE 77 | ; reference prefix for the HISAT-3N RNA mapping 78 | 79 | hisat3n_repeat_index_type = no-repeat 80 | ; repeat index type for HISAT-3N, if "repeat", repeat index mapping will be used. 81 | ; if "no-repeat", will run hisat-3n in the normal mode. 82 | 83 | unmapped_fastq = False 84 | ; whether unmapped FASTQ file should be kept. Use this for trouble shooting purpose. 85 | 86 | [bamFilter] 87 | mapq_threshold = 10 88 | ; reads MAPQ threshold 89 | 90 | 91 | [callMethylation] 92 | reference_fasta = CHANGE_THIS_TO_YOUR_REFERENCE_FASTA 93 | ; reference fasta file, use the same one that bismark_mapping reference is prepared from 94 | 95 | num_upstr_bases = 1 96 | ; number of base to include before mC 97 | 98 | num_downstr_bases = 2 99 | ; number of base to include after mC 100 | 101 | compress_level = 5 102 | ; ALLC file compress level 103 | 104 | mc_stat_feature = HCHN HCYN HCGN HCCC GCYN GCHN 105 | ; mC patterns to check when calculate ALLC summary 106 | 107 | mc_stat_alias = HmCH HmCY HmCG HmCCC GmCY GmCH 108 | ; alias for the above mC patterns in the summary table 109 | 110 | chrom_size_path = CHANGE_THIS_TO_YOUR_CHROM_SIZE_PATH 111 | ; only chromosomes appeared from the chrom_size_path file will be included in contact calling 112 | ; chrom size file has two tab-separated columns and not header 113 | ; 1) chrom name, the same as ref fasta; 2) chrom size. 
114 | -------------------------------------------------------------------------------- /cemba_data/files/mapping_summary_template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/mapping_summary_template/__init__.py -------------------------------------------------------------------------------- /cemba_data/files/plate_info_template_v1.txt: -------------------------------------------------------------------------------- 1 | # .__ 2 | # ___________ _____ ______ | | ____ 3 | # / ___/\__ \ / \\____ \| | _/ __ \ 4 | # \___ \ / __ \| Y Y \ |_> > |_\ ___/ 5 | # /____ >(____ /__|_| / __/|____/\___ > 6 | # \/ \/ \/|__| \/ 7 | # .__ __ ._. 8 | # _____| |__ ____ _____/ |_| | 9 | # / ___/ | \_/ __ \_/ __ \ __\ | 10 | # \___ \| Y \ ___/\ ___/| | \| 11 | # /____ >___| /\___ >\___ >__| __ 12 | # \/ \/ \/ \/ \/ 13 | # 14 | # ____ ________ 15 | # \ \ / /_ | 16 | # \ Y / | | 17 | # \ / | | 18 | # \___/ |___| 19 | # 20 | # 21 | # PlateInfo template of single cell sequencing demultiplex 22 | # 23 | # This file template contain 3 sections. 24 | # 25 | # [CriticalInfo] 26 | # [LibraryInfo] 27 | # [PlateInfo] 28 | # 29 | # The final sample id will be values of each part concatenated by "-" in the following order 30 | # [Values in LibraryInfo] + [Additional values in PlateInfo] + [Sample UID determined by library strategy] 31 | # 32 | # Empty lines and line start with "#" will be ignored. You can remove these if you understand the template. 33 | # 34 | 35 | 36 | # ===================================================================================================== 37 | 38 | [CriticalInfo] 39 | 40 | # ===================================================================================================== 41 | 42 | # Explain: 43 | # Every key=value pairs are required. key name can not be change. 
44 | # Some values have limited options, they are: 45 | # n_random_index choice: 8 (V1), if your n_random_index=384, use V2 template! 46 | # input_plate_size choice: 384 47 | # 48 | # Example: 49 | # n_random_index=8 50 | # input_plate_size=384 51 | # pool_id=Pool_NN 52 | # tube_label=Pool_NN_MM_AA_BB # often times 2 libraries are pooled together on Nova-Seq, but there is no rule on this. 53 | # email=your-email@salk.edu 54 | # 55 | 56 | # if your n_random_index=384, use V2 template! 57 | n_random_index=8 58 | input_plate_size=384 59 | pool_id= 60 | tube_label= 61 | email= 62 | 63 | 64 | # ===================================================================================================== 65 | 66 | [LibraryInfo] 67 | 68 | # ===================================================================================================== 69 | # 70 | # Explain: 71 | # library metadata that applies to all plates 72 | # this whole part is optional, may contain any "key=value" pairs necessary to describe the library. 73 | # All the values will be concatenated by "-" into the sample id and present in the file name. Use UNIX path safe characters. 74 | # Any character that does not belong to [a-zA-Z0-9] will be replaced by "_" 75 | # Here is the recommended information to include; you can define your own based on your needs, 76 | # none of this information is actually used in demultiplex or mapping: 77 | # these keys are ALL optional, but it is better to keep them consistent throughout the project.
78 | # 79 | # Example: 80 | # lib_comp_date=180101 81 | # project=CEMBA 82 | # organism=mm 83 | # dev_stage_age=P56 84 | # tissue_cell_type=1A 85 | # exp_cond=1 86 | # bio_rep=1 87 | # tech_rep=1 88 | # lib_type=snmC-seq2 89 | # sequencer=NovaSeq 90 | # se_pe=pe 91 | # read_length=150 92 | # 93 | 94 | 95 | 96 | 97 | 98 | # ===================================================================================================== 99 | 100 | [PlateInfo] 101 | 102 | # ===================================================================================================== 103 | 104 | # Explain: 105 | # Plate metadata that is specific to certain plates, a tab separated table 106 | # First row must be a header starting with: plate_id primer_quarter 107 | # First 2 columns are required and must be in the order of: plate_id primer_quarter 108 | # You can add more plate specific info into additional columns; that info will be appended to LibraryInfo as part of sample_id. 109 | # All the values will be concatenated by "-" into the sample id and present in the file name. 110 | # So better not to include "-" in values and use UNIX path safe characters. 111 | # 112 | # If your experiment design contains sub-plate differences (e.g. some rows come from 1 sample, some rows come from another), 113 | # you should maintain your own metadata about this and add it into the mapping summary table yourself after mapping 114 | # Because here the plate info is just for barcode demultiplexing, so that we can get single cell data AND the plate position of each cell 115 | # with the plate position, it should be very convenient for you to add any custom information you designed in your experiment.
116 | # 117 | # primer_quarter valid values are: 118 | # Set1_Q1, Set1_Q2, Set1_Q3, Set1_Q4 119 | # SetB_Q1, SetB_Q2, SetB_Q3, SetB_Q4 120 | # 121 | # Example: 122 | # plate_id primer_quarter 123 | # CEMBA190530_9C_1 SetB_Q1 124 | # CEMBA190530_9C_2 SetB_Q1 125 | # CEMBA190530_9C_3 SetB_Q2 126 | # CEMBA190530_9C_4 SetB_Q2 127 | # CEMBA190620_9C_1 SetB_Q3 128 | # CEMBA190620_9C_2 SetB_Q3 129 | # CEMBA190620_9C_3 SetB_Q4 130 | # CEMBA190620_9C_4 SetB_Q4 131 | # 132 | # Remember the columns MUST be separate by tab not space 133 | # 134 | 135 | 136 | # ===================================================================================================== 137 | # if your n_random_index=384, use V2 template! 138 | # ===================================================================================================== 139 | 140 | plate_id primer_quarter 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /cemba_data/files/plate_info_template_v2.txt: -------------------------------------------------------------------------------- 1 | # .__ 2 | # ___________ _____ ______ | | ____ 3 | # / ___/\__ \ / \\____ \| | _/ __ \ 4 | # \___ \ / __ \| Y Y \ |_> > |_\ ___/ 5 | # /____ >(____ /__|_| / __/|____/\___ > 6 | # \/ \/ \/|__| \/ 7 | # .__ __ ._. 8 | # _____| |__ ____ _____/ |_| | 9 | # / ___/ | \_/ __ \_/ __ \ __\ | 10 | # \___ \| Y \ ___/\ ___/| | \| 11 | # /____ >___| /\___ >\___ >__| __ 12 | # \/ \/ \/ \/ \/ 13 | # 14 | # ____ ____________ 15 | # \ \ / /\_____ \ 16 | # \ Y / / ____/ 17 | # \ / / \ 18 | # \___/ \_______ \ 19 | # \/ 20 | # 21 | # PlateInfo template of single cell sequencing demultiplex 22 | # 23 | # This file template contain 3 sections. 
24 | # 25 | # [CriticalInfo] 26 | # [LibraryInfo] 27 | # [PlateInfo] 28 | # 29 | # The final sample id will be values of each part concatenated by "-" in the following order 30 | # [Values in LibraryInfo] + [Additional values in PlateInfo] + [Sample UID determined by library strategy] 31 | # 32 | # Empty lines and line start with "#" will be ignored. You can remove these if you understand the template. 33 | # 34 | 35 | 36 | # ===================================================================================================== 37 | 38 | [CriticalInfo] 39 | 40 | # ===================================================================================================== 41 | 42 | # Explain: 43 | # Every key=value pairs are required. key name can not be change. 44 | # Some values have limited options, they are: 45 | # n_random_index choice: 384 (V2), if your n_random_index=8, use V1 template! 46 | # input_plate_size choice: 384 47 | # 48 | # 49 | # Example: 50 | # n_random_index=8 51 | # input_plate_size=384 52 | # pool_id=Pool_73 53 | # tube_label=Pool_72_73_9A_10C # often times 2 library are pooled together on Nova-Seq 54 | # email=your-email@salk.edu 55 | # 56 | 57 | # if your n_random_index=8, use V1 template! 58 | n_random_index=384 59 | input_plate_size=384 60 | pool_id= 61 | tube_label= 62 | email= 63 | 64 | 65 | # ===================================================================================================== 66 | 67 | [LibraryInfo] 68 | 69 | # ===================================================================================================== 70 | # 71 | # Explain: 72 | # library metadata that applies to all plates 73 | # this whole part is optional, may contain any "key=value" pairs necessary to describe the library. 74 | # All the values will be concatenate by "-" into the sample id and present in file name. Use UNIX path safe characters. 
75 | # Any character that does not belong to [a-zA-Z0-9] will be replaced by "_" 76 | # Here is the recommended information to include; you can define your own based on your needs, 77 | # none of this information is actually used in demultiplex or mapping: 78 | # these keys are ALL optional, but it is better to keep them consistent throughout the project. 79 | # 80 | # Example: 81 | # lib_comp_date=180101 82 | # project=CEMBA 83 | # organism=mm 84 | # dev_stage_age=P56 85 | # tissue_cell_type=1A 86 | # exp_cond=1 87 | # bio_rep=1 88 | # tech_rep=1 89 | # lib_type=snmC-seq2 90 | # sequencer=NovaSeq 91 | # se_pe=pe 92 | # read_length=150 93 | # 94 | # 95 | 96 | 97 | 98 | 99 | 100 | # ===================================================================================================== 101 | 102 | [PlateInfo] 103 | 104 | # ===================================================================================================== 105 | 106 | # Explain: 107 | # Plate metadata that is specific to certain plates, a tab separated table 108 | # First row must be a header starting with: plate_id multiplex_group primer_name 109 | # First 3 columns are required and must be in the order of: plate_id multiplex_group primer_name 110 | # You can add more plate specific info into additional columns; that info will be appended to LibraryInfo as part of sample_id. 111 | # All the values will be concatenated by "-" into the sample id and present in the file name. 112 | # So better not to include "-" in values and use UNIX path safe characters. 113 | # 114 | # If your experiment design contains sub-plate differences (e.g.
some rows come from 1 sample, some rows come from another), 115 | # you should maintain your own metadata about this and added into the mapping summary table later after mapping by yourself 116 | # Because here the plate info is just for barcode demultiplexing, so that we can get single cell data AND the plate position of each cell 117 | # with the plate position, it should be very convenient for you to add any custom information you designed in your experiment. 118 | # 119 | # primer_name valid values are: 120 | # [A-P][1-24] 121 | # 122 | # Example: 123 | # plate_id multiplex_group primer_name 124 | # Plate_1 1 B1 125 | # Plate_1 2 B3 126 | # Plate_1 3 B5 127 | # Plate_1 4 B7 128 | # Plate_1 5 B9 129 | # Plate_1 6 B11 130 | # 131 | # Remember the columns MUST be separate by tab, not space or comma 132 | # 133 | 134 | 135 | # ===================================================================================================== 136 | # if your n_random_index=8, use V1 template! 137 | # ===================================================================================================== 138 | 139 | plate_id multiplex_group primer_name 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v1.fa: -------------------------------------------------------------------------------- 1 | >AD001 2 | ^ATCACG 3 | >AD002 4 | ^CGATGT 5 | >AD004 6 | ^TGACCA 7 | >AD006 8 | ^GCCAAT 9 | >AD007 10 | ^CAGATC 11 | >AD008 12 | ^ACTTGA 13 | >AD010 14 | ^TAGCTT 15 | >AD012 16 | ^CTTGTA 17 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/files/random_index_v2/__init__.py -------------------------------------------------------------------------------- 
/cemba_data/files/random_index_v2/random_index_v2.multiplex_group_1.fa: -------------------------------------------------------------------------------- 1 | >A1 2 | ^ACGATCAG 3 | >A13 4 | ^ATCATGCG 5 | >C1 6 | ^GTAGCGTA 7 | >C13 8 | ^GTCCTAAG 9 | >E1 10 | ^GATCAAGG 11 | >E13 12 | ^TACCGGAT 13 | >G1 14 | ^CAGTCACA 15 | >G13 16 | ^ACCTCAGT 17 | >I1 18 | ^TACTGCTC 19 | >I13 20 | ^GTGGTATG 21 | >K1 22 | ^AGCTACCA 23 | >K13 24 | ^CAGACGTT 25 | >M1 26 | ^AGGTCAAC 27 | >M13 28 | ^CAATCAGG 29 | >O1 30 | ^AACAGGTG 31 | >O13 32 | ^CTACAAGG 33 | >A2 34 | ^TGATAGGC 35 | >A14 36 | ^ACAACGTG 37 | >C2 38 | ^CAGGTAAG 39 | >C14 40 | ^AATTCCGG 41 | >E2 42 | ^ACAAGCTC 43 | >E14 44 | ^GTGATCCA 45 | >G2 46 | ^AACCGTGT 47 | >G14 48 | ^GTCCTTGA 49 | >I2 50 | ^ATTCCGCT 51 | >I14 52 | ^ACTGCGAA 53 | >K2 54 | ^CACGCAAT 55 | >K14 56 | ^AAGCGACT 57 | >M2 58 | ^AGAAGGAC 59 | >M14 60 | ^CGAATACG 61 | >O2 62 | ^AGCAGACA 63 | >O14 64 | ^GCCTTAAC 65 | >B1 66 | ^GAACGAAG 67 | >B13 68 | ^GACTACGA 69 | >D1 70 | ^ATACGCAG 71 | >D13 72 | ^CCTGTCAA 73 | >F1 74 | ^GTTGCTGT 75 | >F13 76 | ^CGAATTGC 77 | >H1 78 | ^CCAAGGTT 79 | >H13 80 | ^TCTACGCA 81 | >J1 82 | ^TGCACTTG 83 | >J13 84 | ^AGAGCAGA 85 | >L1 86 | ^GATGCTAC 87 | >L13 88 | ^CGACCTAA 89 | >N1 90 | ^TCAGCCTT 91 | >N13 92 | ^CCGTTATG 93 | >P1 94 | ^TGACCGTT 95 | >P13 96 | ^AGCTAAGC 97 | >B2 98 | ^AGGCAATG 99 | >B14 100 | ^ACGCTTCT 101 | >D2 102 | ^GCGTTAGA 103 | >D14 104 | ^TCAATCCG 105 | >F2 106 | ^CTAGGTTG 107 | >F14 108 | ^GCATAGTC 109 | >H2 110 | ^CTCGGTAA 111 | >H14 112 | ^CAACTTGG 113 | >J2 114 | ^CCTAAGTC 115 | >J14 116 | ^TTCCTCCT 117 | >L2 118 | ^AAGCGTTC 119 | >L14 120 | ^CTTAGGAC 121 | >N2 122 | ^CAACTGAC 123 | >N14 124 | ^CTCACCAA 125 | >P2 126 | ^CTCTATCG 127 | >P14 128 | ^CGCAATGT 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_2.fa: -------------------------------------------------------------------------------- 1 | >A3 2 | 
^TCGAGAGT 3 | >A15 4 | ^TGTTCCGT 5 | >C3 6 | ^AGAGTCCA 7 | >C15 8 | ^TATGGCAC 9 | >E3 10 | ^TCTTCGAC 11 | >E15 12 | ^TTGCAACG 13 | >G3 14 | ^TCGATGAC 15 | >G15 16 | ^CGTCTTCA 17 | >I3 18 | ^GACGAACT 19 | >I15 20 | ^CCAACTTC 21 | >K3 22 | ^AGATTGCG 23 | >K15 24 | ^CTGAACGT 25 | >M3 26 | ^TACACACG 27 | >M15 28 | ^TCGTGCAT 29 | >O3 30 | ^AGTCGAAG 31 | >O15 32 | ^CGATGTTC 33 | >A4 34 | ^CATCCAAG 35 | >A16 36 | ^TGCTGTGA 37 | >C4 38 | ^GTATCGAG 39 | >C16 40 | ^TCTAGGAG 41 | >E4 42 | ^GAACCTTC 43 | >E16 44 | ^ACTGGTGT 45 | >G4 46 | ^CGCGTATT 47 | >G16 48 | ^CAGGTTCA 49 | >I4 50 | ^AAGCTCAC 51 | >I16 52 | ^TCTGTCGT 53 | >K4 54 | ^AGCTTCAG 55 | >K16 56 | ^CCTACCTA 57 | >M4 58 | ^GCGTATCA 59 | >M16 60 | ^TGCTTGCT 61 | >O4 62 | ^GTTAAGCG 63 | >O16 64 | ^GTTGGCAT 65 | >B3 66 | ^ACCTAGAC 67 | >B15 68 | ^TTACGTGC 69 | >D3 70 | ^AAGACCGT 71 | >D15 72 | ^CTATGCCT 73 | >F3 74 | ^AGAACCAG 75 | >F15 76 | ^CAAGAAGC 77 | >H3 78 | ^ACGTATGG 79 | >H15 80 | ^TGGCTCTT 81 | >J3 82 | ^TCACTCGA 83 | >J15 84 | ^CTTCGGTT 85 | >L3 86 | ^AGGAACAC 87 | >L15 88 | ^CTCTCAGA 89 | >N3 90 | ^AAGCATCG 91 | >N15 92 | ^CTAGCAGT 93 | >P3 94 | ^CATCTGCT 95 | >P15 96 | ^GTTCCATG 97 | >B4 98 | ^TCACCTAG 99 | >B16 100 | ^GAGTAGAG 101 | >D4 102 | ^TTGCGAGA 103 | >D16 104 | ^GACTTGTG 105 | >F4 106 | ^GTGTCCTT 107 | >F16 108 | ^CTCCTGAA 109 | >H4 110 | ^TACAGAGC 111 | >H16 112 | ^TCAGTAGG 113 | >J4 114 | ^TTCGTACG 115 | >J16 116 | ^GCTGTAAG 117 | >L4 118 | ^CGATTCTG 119 | >L16 120 | ^ATAGTCGG 121 | >N4 122 | ^TGCTCTAC 123 | >N16 124 | ^CAGAACTG 125 | >P4 126 | ^ACTCTCCA 127 | >P16 128 | ^CCTAGAGA 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_3.fa: -------------------------------------------------------------------------------- 1 | >A5 2 | ^CTAGCTCA 3 | >A17 4 | ^ATTAGCCG 5 | >C5 6 | ^GCTACTCT 7 | >C17 8 | ^TCGGATTC 9 | >E5 10 | ^ATCGTGGT 11 | >E17 12 | ^CACTTCAC 13 | >G5 14 | ^GAAGTGCT 15 | >G17 16 | 
^TGCGTAAC 17 | >I5 18 | ^CTTCGCAA 19 | >I17 20 | ^GACGTCAT 21 | >K5 22 | ^CACACATC 23 | >K17 24 | ^TTGGACTG 25 | >M5 26 | ^CAAGTCGT 27 | >M17 28 | ^TAACGTCG 29 | >O5 30 | ^TGGAAGCA 31 | >O17 32 | ^ACCGGTTA 33 | >A6 34 | ^GTGAGACT 35 | >A18 36 | ^CCAAGTAG 37 | >C6 38 | ^TTCACGGA 39 | >C18 40 | ^ATCCGTTG 41 | >E6 42 | ^AGCGAGAT 43 | >E18 44 | ^CTAACCTG 45 | >G6 46 | ^AGTTCGCA 47 | >G18 48 | ^CCAACACT 49 | >I6 50 | ^TGATCACG 51 | >I18 52 | ^CTCAAGCT 53 | >K6 54 | ^CCTCGTTA 55 | >K18 56 | ^ATCTCCTG 57 | >M6 58 | ^CAACACAG 59 | >M18 60 | ^CTCGAACA 61 | >O6 62 | ^CATGGATC 63 | >O18 64 | ^CAACCTCT 65 | >B5 66 | ^TACGACGT 67 | >B17 68 | ^ACTGCTTG 69 | >D5 70 | ^CTCCAATC 71 | >D17 72 | ^TTCGGCTA 73 | >F5 74 | ^GATGTCGA 75 | >F17 76 | ^CACCAGTT 77 | >H5 78 | ^AAGGACCA 79 | >H17 80 | ^CCTTCCAT 81 | >J5 82 | ^CACTGTAG 83 | >J17 84 | ^ACAACAGC 85 | >L5 86 | ^ACCATCCT 87 | >L17 88 | ^AGGCTGAA 89 | >N5 90 | ^GCCAATAC 91 | >N17 92 | ^GCCAGAAT 93 | >P5 94 | ^CGCTGATA 95 | >P17 96 | ^GCATCCTA 97 | >B6 98 | ^CATACGGA 99 | >B18 100 | ^ATGCCTAG 101 | >D6 102 | ^ACACCGAT 103 | >D18 104 | ^CCGATGTA 105 | >F6 106 | ^TACCTGCA 107 | >F18 108 | ^AACGCACA 109 | >H6 110 | ^GCATAACG 111 | >H18 112 | ^ACAGCAAG 113 | >J6 114 | ^TCCTGGTA 115 | >J18 116 | ^GACATCTC 117 | >L6 118 | ^GCAACCAT 119 | >L18 120 | ^GAGACCAA 121 | >N6 122 | ^CATCACGT 123 | >N18 124 | ^AGAAGCCT 125 | >P6 126 | ^CAGCATAC 127 | >P18 128 | ^TACTAGCG 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_4.fa: -------------------------------------------------------------------------------- 1 | >A7 2 | ^ATCGTCTC 3 | >A19 4 | ^CGATCGAT 5 | >C7 6 | ^CTCTGGAT 7 | >C19 8 | ^AACAGCGA 9 | >E7 10 | ^CGGTAATC 11 | >E19 12 | ^TAGCCATG 13 | >G7 14 | ^CTTCCTTC 15 | >G19 16 | ^AACACGCT 17 | >I7 18 | ^ATGGCGAT 19 | >I19 20 | ^ACGTCCAA 21 | >K7 22 | ^GAGCAATC 23 | >K19 24 | ^GTCTGCAA 25 | >M7 26 | ^AGCTAGTG 27 | >M19 28 | ^AAGGCGTA 29 | >O7 30 | 
^CTCGTTCT 31 | >O19 32 | ^GAACGGTT 33 | >A8 34 | ^CTGATGAG 35 | >A20 36 | ^AACTGAGG 37 | >C8 38 | ^GAGCTCTA 39 | >C20 40 | ^GATAGCCA 41 | >E8 42 | ^CCGTAACT 43 | >E20 44 | ^AGCCAACT 45 | >G8 46 | ^TAGTCAGC 47 | >G20 48 | ^GAGAGTAC 49 | >I8 50 | ^CAATGCGA 51 | >I20 52 | ^AACCACTC 53 | >K8 54 | ^TGAGACGA 55 | >K20 56 | ^TCACGATG 57 | >M8 58 | ^TCCACGTT 59 | >M20 60 | ^ACATGGAG 61 | >O8 62 | ^ACAGAGGT 63 | >O20 64 | ^TGGATGGT 65 | >B7 66 | ^TTGAGCTC 67 | >B19 68 | ^GCCTATGT 69 | >D7 70 | ^TCTGGACA 71 | >D19 72 | ^ACCGACAA 73 | >F7 74 | ^AGGAGGTT 75 | >F19 76 | ^GTATTCCG 77 | >H7 78 | ^TATGCGGT 79 | >H19 80 | ^ATACTGGC 81 | >J7 82 | ^GTACGATC 83 | >J19 84 | ^AGCCGTAA 85 | >L7 86 | ^GAACGTGA 87 | >L19 88 | ^ATCGGAGA 89 | >N7 90 | ^GACACAGT 91 | >N19 92 | ^CGAGAGAA 93 | >P7 94 | ^TCGTCTGA 95 | >P19 96 | ^CCATGAAC 97 | >B8 98 | ^GTCATCGT 99 | >B20 100 | ^CAACTCCA 101 | >D8 102 | ^CGTATCTC 103 | >D20 104 | ^TAGGAGCT 105 | >F8 106 | ^CCTTAGGT 107 | >F20 108 | ^TAGTCTCG 109 | >H8 110 | ^GATCAGAC 111 | >H20 112 | ^GAATGGCA 113 | >J8 114 | ^CATTGACG 115 | >J20 116 | ^CAACCGTA 117 | >L8 118 | ^AATCCAGC 119 | >L20 120 | ^AACAAGGC 121 | >N8 122 | ^GCCACTTA 123 | >N20 124 | ^CACGATTC 125 | >P8 126 | ^TACTCCAG 127 | >P20 128 | ^CGTCCATT 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_5.fa: -------------------------------------------------------------------------------- 1 | >A9 2 | ^TCGACAAG 3 | >A21 4 | ^GATCTTGC 5 | >C9 6 | ^AGATCGTC 7 | >C21 8 | ^CCAACGAA 9 | >E9 10 | ^AGTTGTGC 11 | >E21 12 | ^ACAGGCAT 13 | >G9 14 | ^CGAACAAC 15 | >G21 16 | ^ACTCGATC 17 | >I9 18 | ^ACATGCCA 19 | >I21 20 | ^GATCCACT 21 | >K9 22 | ^ATAGAGCG 23 | >K21 24 | ^CCACATTG 25 | >M9 26 | ^CTCCTAGT 27 | >M21 28 | ^TCTTACGG 29 | >O9 30 | ^ACGAGAAC 31 | >O21 32 | ^CTGTACCA 33 | >A10 34 | ^ACGGTACA 35 | >A22 36 | ^AGGTAGGA 37 | >C10 38 | ^GTCAGTCA 39 | >C22 40 | ^TATGACCG 41 | >E10 42 | ^TCAGACAC 43 | >E22 
44 | ^CCAGTTGA 45 | >G10 46 | ^AACACCAC 47 | >G22 48 | ^AGATACGG 49 | >I10 50 | ^ATGCGTCA 51 | >I22 52 | ^CTTACAGC 53 | >K10 54 | ^CACAGGAA 55 | >K22 56 | ^CCACAACA 57 | >M10 58 | ^ATCGCAAC 59 | >M22 60 | ^ACAAGACG 61 | >O10 62 | ^TAAGTGGC 63 | >O22 64 | ^CTATCCAC 65 | >B9 66 | ^AGTACACG 67 | >B21 68 | ^GTACCACA 69 | >D9 70 | ^AACACTGG 71 | >D21 72 | ^CGTAGATG 73 | >F9 74 | ^AATCGCTG 75 | >F21 76 | ^TTCGAAGC 77 | >H9 78 | ^AAGGAAGG 79 | >H21 80 | ^AACCTACG 81 | >J9 82 | ^TGGTGAAG 83 | >J21 84 | ^CTCTTGTC 85 | >L9 86 | ^TAGAACGC 87 | >L21 88 | ^GATACCTG 89 | >N9 90 | ^AAGAGGCA 91 | >N21 92 | ^AACTCGGA 93 | >P9 94 | ^CACATGGT 95 | >P21 96 | ^ATCCACGA 97 | >B10 98 | ^TTACCGAC 99 | >B22 100 | ^AAGTCCTC 101 | >D10 102 | ^AAGGAGAC 103 | >D22 104 | ^CAACGAGT 105 | >F10 106 | ^CACAGACT 107 | >F22 108 | ^ACTCTGAG 109 | >H10 110 | ^CGCAACTA 111 | >H22 112 | ^CGGATCAA 113 | >J10 114 | ^ACCTCTTC 115 | >J22 116 | ^TGCGATAG 117 | >L10 118 | ^AGTGCATC 119 | >L22 120 | ^CCAGTATC 121 | >N10 122 | ^GCTTCACA 123 | >N22 124 | ^AAGCTGGT 125 | >P10 126 | ^GAGGCATT 127 | >P22 128 | ^TCGCTATC 129 | -------------------------------------------------------------------------------- /cemba_data/files/random_index_v2/random_index_v2.multiplex_group_6.fa: -------------------------------------------------------------------------------- 1 | >A11 2 | ^CCTTGGAA 3 | >A23 4 | ^AGGATAGC 5 | >C11 6 | ^GCTCAGTT 7 | >C23 8 | ^CAGTGCTT 9 | >E11 10 | ^AATGACGC 11 | >E23 12 | ^AGGTGTTG 13 | >G11 14 | ^AACAACCG 15 | >G23 16 | ^TGAGCTGT 17 | >I11 18 | ^GTCAACAG 19 | >I23 20 | ^AGCCTATC 21 | >K11 22 | ^GACCGATA 23 | >K23 24 | ^GATGGAGT 25 | >M11 26 | ^ACTCCTAC 27 | >M23 28 | ^CGTGTGAT 29 | >O11 30 | ^AAGCCTGA 31 | >O23 32 | ^GCGCATAT 33 | >A12 34 | ^CTCGACTT 35 | >A24 36 | ^TTCGCCAT 37 | >C12 38 | ^CACGTCTA 39 | >C24 40 | ^CGATTGGA 41 | >E12 42 | ^CGAAGTCA 43 | >E24 44 | ^AAGTGCAG 45 | >G12 46 | ^GTAAGCAC 47 | >G24 48 | ^GTTCTTCG 49 | >I12 50 | ^TACATCGG 51 | >I24 52 | ^AGTCTTGG 53 | >K12 54 | ^ACTCAACG 55 | 
>K24 56 | ^AGGTCTGT 57 | >M12 58 | ^ACGTCGTT 59 | >M24 60 | ^CGCCTTAT 61 | >O12 62 | ^AGTCAGGT 63 | >O24 64 | ^GATCTCAG 65 | >B11 66 | ^TGTCAGTG 67 | >B23 68 | ^TAGTGGTG 69 | >D11 70 | ^TTGGTGCA 71 | >D23 72 | ^CTGTATGC 73 | >F11 74 | ^AGTGACCT 75 | >F23 76 | ^AGACCTTG 77 | >H11 78 | ^AGCGTGTA 79 | >H23 80 | ^CATACTCG 81 | >J11 82 | ^TAGCTGAG 83 | >J23 84 | ^CAGATCCT 85 | >L11 86 | ^AACCAGAG 87 | >L23 88 | ^TCCTGACT 89 | >N11 90 | ^GAAGACTG 91 | >N23 92 | ^ACAGTTCG 93 | >P11 94 | ^CGAGTTAG 95 | >P23 96 | ^GAGAAGGT 97 | >B12 98 | ^ACCTTCGA 99 | >B24 100 | ^GTCGATTG 101 | >D12 102 | ^TGTCGACT 103 | >D24 104 | ^TGTGTCAG 105 | >F12 106 | ^TCGAACCT 107 | >F24 108 | ^GTTATGGC 109 | >H12 110 | ^TCCGATCA 111 | >H24 112 | ^ACTGCACT 113 | >J12 114 | ^CATTCGTC 115 | >J24 116 | ^TGGTTCGA 117 | >L12 118 | ^GCATTGGT 119 | >L24 120 | ^CCTCGAAT 121 | >N12 122 | ^ACCGAATG 123 | >N24 124 | ^GCAATGAG 125 | >P12 126 | ^ACACCTCA 127 | >P24 128 | ^AATGGTCG 129 | -------------------------------------------------------------------------------- /cemba_data/files/sample_sheet_header.txt: -------------------------------------------------------------------------------- 1 | [Header],,,,,,,,,, 2 | IEMFileVersion,4,,,,,,,,, 3 | Date,,,,,,,,,, 4 | Workflow,GenerateFASTQ,,,,,,,,, 5 | Application,HiSeq_FASTQ_Only,,,,,,,,, 6 | Assay,TruSeq_HT,,,,,,,,, 7 | Description,,,,,,,,,, 8 | Chemistry,,,,,,,,,, 9 | ,,,,,,,,,, 10 | [Reads],,,,,,,,,, 11 | 151,,,,,,,,,, 12 | 151,,,,,,,,,, 13 | ,,,,,,,,,, 14 | [Settings],,,,,,,,,, 15 | Adapter,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,,,,,,,,, 16 | AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT,,,,,,,,, 17 | ,,,,,,,,,, 18 | [Data],,,,,,,,,, 19 | -------------------------------------------------------------------------------- /cemba_data/files/sbatch_template_schicluster.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Notes from TACC: 4 | # 5 | # -- Launch this script by executing 6 | # -- Copy/edit this 
script as desired. Launch by executing 7 | # "sbatch knl.openmp.slurm" on a Stampede2 login node. 8 | # 9 | # -- OpenMP codes run on a single node (upper case N = 1). 10 | # OpenMP ignores the value of lower case n, 11 | # but slurm needs a plausible value to schedule the job. 12 | # 13 | # -- Default value of OMP_NUM_THREADS is 1; be sure to change it! 14 | # 15 | # -- Increase thread count gradually while looking for optimal setting. 16 | # If there is sufficient memory available, the optimal setting 17 | # is often 68 (1 thread per core) or 136 (2 threads per core). 18 | # 19 | #---------------------------------------------------- 20 | 21 | #SBATCH -J {job_name} # Job name 22 | #SBATCH -o {log_dir}/{job_name}.o%j # Name of stdout output file 23 | #SBATCH -e {log_dir}/{job_name}.e%j # Name of stderr error file 24 | #SBATCH -p {queue} # Queue (partition) name 25 | #SBATCH -N 1 # Total # of nodes (must be 1 for OpenMP) 26 | #SBATCH -n 1 # Total # of mpi tasks (should be 1 for OpenMP) 27 | #SBATCH -t {time_str} # Run time (hh:mm:ss) 28 | {email_str} 29 | {email_type_str} 30 | 31 | #---------------------------------------------------- 32 | # Clone the whole miniconda into /tmp so the snakemake command do not access $WORK 33 | mkdir /tmp/test_{env_dir_random} 34 | 35 | # use micromamba 36 | export PATH=/work/05622/lhq/stampede2/bin:$PATH 37 | micromamba shell init -s bash -p /tmp/test_{env_dir_random} 38 | source ~/.bashrc 39 | 40 | # activate base environment 41 | micromamba activate 42 | 43 | # create schicluster environment 44 | micromamba create -y -n schicluster python=3.8 numpy scipy scikit-learn h5py \ 45 | joblib cooler pandas statsmodels rpy2 anndata xarray snakemake pybedtools htslib=1.9 pysam=0.18 46 | micromamba activate schicluster 47 | 48 | # export correct PYTHONPATH 49 | export PYTHONPATH=/tmp/test_{env_dir_random}/envs/schicluster/lib/python3.8/site-packages 50 | 51 | # install schicluster 52 | pip install schicluster 53 | which hicluster 54 | 55 | # 
Installation finished 56 | #---------------------------------------------------- 57 | 58 | 59 | # --------------------------------------------------- 60 | # actual command 61 | 62 | # print some info 63 | date 64 | hostname 65 | pwd 66 | # If you want to profile the job (CPU, MEM usage, etc.) 67 | # load remora with 68 | # "module load remora" 69 | # and change the command to 70 | # "remora {command}" 71 | 72 | 73 | # Set thread count (default value is 1)... 74 | export OMP_NUM_THREADS=48 75 | 76 | for i in `seq 1 5` 77 | do 78 | {command} --batch summary=${{i}}/5 79 | done 80 | 81 | # {command} 82 | 83 | # delete everything in /tmp 84 | 85 | rm -rf /tmp/test* 86 | # --------------------------------------------------- 87 | -------------------------------------------------------------------------------- /cemba_data/files/sbatch_template_yap.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Notes from TACC: 4 | # 5 | # -- Launch this script by executing 6 | # -- Copy/edit this script as desired. Launch by executing 7 | # "sbatch knl.openmp.slurm" on a Stampede2 login node. 8 | # 9 | # -- OpenMP codes run on a single node (upper case N = 1). 10 | # OpenMP ignores the value of lower case n, 11 | # but slurm needs a plausible value to schedule the job. 12 | # 13 | # -- Default value of OMP_NUM_THREADS is 1; be sure to change it! 14 | # 15 | # -- Increase thread count gradually while looking for optimal setting. 16 | # If there is sufficient memory available, the optimal setting 17 | # is often 68 (1 thread per core) or 136 (2 threads per core). 
18 | # 19 | #---------------------------------------------------- 20 | 21 | #SBATCH -J {job_name} # Job name 22 | #SBATCH -o {log_dir}/{job_name}.o%j # Name of stdout output file 23 | #SBATCH -e {log_dir}/{job_name}.e%j # Name of stderr error file 24 | #SBATCH -p {queue} # Queue (partition) name 25 | #SBATCH -N 1 # Total # of nodes (must be 1 for OpenMP) 26 | #SBATCH -n 1 # Total # of mpi tasks (should be 1 for OpenMP) 27 | #SBATCH -t {time_str} # Run time (hh:mm:ss) 28 | {email_str} 29 | {email_type_str} 30 | 31 | 32 | #---------------------------------------------------- 33 | # Clone the whole miniconda into /tmp so the snakemake command do not access $WORK 34 | mkdir /tmp/test_{env_dir_random} 35 | tar -xf /work2/05622/lhq/test_conda.tar -C /tmp/test_{env_dir_random} 36 | export CONDA_PREFIX=/tmp/test_{env_dir_random}/test/miniconda3 37 | export CONDA_PYTHON_EXE=/tmp/test_{env_dir_random}/test/miniconda3/bin/python 38 | export CONDA_EXE=/tmp/test_{env_dir_random}/test/miniconda3/bin/conda 39 | export PATH=/dev/shm/bin:/tmp/test_{env_dir_random}/test/miniconda3/envs/mapping/bin:/tmp/test_{env_dir_random}/test/miniconda3/bin:/opt/apps/cmake/3.16.1/bin:/opt/apps/intel18/python2/2.7.15/bin:/opt/apps/autotools/1.1/bin:/opt/apps/git/2.24.1/bin:/opt/apps/libfabric/1.7.0/bin:/opt/apps/intel18/impi/18.0.2/bin:/opt/intel/compilers_and_libraries_2018.2.199/linux/mpi/intel64/bin:/opt/intel/compilers_and_libraries_2018.2.199/linux/bin/intel64:/opt/apps/gcc/6.3.0/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/opt/dell/srvadmin/bin:. 
40 | find /tmp/test_{env_dir_random}/test/miniconda3/ -type f -print0 | sed 's/ /\\ /g; s/(/\\(/g; s/)/\\)/g' | xargs -0 -P 30 -I % sh -c '/bin/sed -i "s/\/tmp\/test\/miniconda3\/envs\/mapping\/bin\/python/\/tmp\/test_{env_dir_random}\/test\/miniconda3\/envs\/mapping\/bin\/python/" %' 41 | 42 | pip install cemba_data --upgrade 43 | pip install schicluster --upgrade 44 | 45 | # Check the path 46 | which python 47 | which snakemake 48 | which yap 49 | which allcools 50 | which bismark 51 | 52 | # Installation finished 53 | #---------------------------------------------------- 54 | 55 | 56 | # --------------------------------------------------- 57 | # actual command 58 | 59 | # print some info 60 | date 61 | hostname 62 | pwd 63 | # If you want to profile the job (CPU, MEM usage, etc.) 64 | # load remora with 65 | # "module load remora" 66 | # and change the command to 67 | # "remora {command}" 68 | 69 | 70 | # Set thread count (default value is 1)... 71 | export OMP_NUM_THREADS=48 72 | 73 | {command} 74 | 75 | # delete everything in /tmp 76 | 77 | rm -rf /tmp/test* 78 | # --------------------------------------------------- 79 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/__init__.py: -------------------------------------------------------------------------------- 1 | from .hisat3n_general import \ 2 | separate_unique_and_multi_align_reads, \ 3 | convert_hisat_bam_strandness, \ 4 | make_snakefile_hisat3n 5 | from .utilities import validate_cwd_fastq_paths, read_mapping_config 6 | from .hisat3n_mct import select_mct_reads, aggregate_feature_counts 7 | from .summary import snmc_summary, snmct_summary, snm3c_summary 8 | from .hisat3n_m3c import \ 9 | split_hisat3n_unmapped_reads, \ 10 | call_chromatin_contacts, \ 11 | remove_overlap_read_parts 12 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/cli.py: 
-------------------------------------------------------------------------------- 1 | import click 2 | 3 | from .hisat3n_m3c import remove_overlap_read_parts 4 | 5 | 6 | @click.command('remove_overlap_read_parts') 7 | @click.argument('in_bam_path') 8 | @click.argument('out_bam_path') 9 | def _remove_overlap_read_parts(in_bam_path, out_bam_path): 10 | remove_overlap_read_parts(in_bam_path, out_bam_path) 11 | return 12 | 13 | 14 | @click.group() 15 | def _main(): 16 | return 17 | 18 | 19 | def main(): 20 | _main.add_command(_remove_overlap_read_parts) 21 | _main() 22 | return 23 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/config/__init__.py -------------------------------------------------------------------------------- /cemba_data/hisat3n/config/gcp.md: -------------------------------------------------------------------------------- 1 | # Setup GCP image for mapping 2 | 3 | ## Create base image 4 | 5 | ```bash 6 | # init install system tools 7 | sudo yum install -y zsh tree wget screen git nfs-utils make gcc 8 | 9 | # install mambaforge 10 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh 11 | sh Mambaforge-Linux-x86_64.sh -b -p $HOME/mambaforge 12 | rm -f Mambaforge-Linux-x86_64.sh 13 | ./mambaforge/bin/mamba init zsh 14 | ./mambaforge/bin/mamba init bash 15 | exec /bin/zsh 16 | mamba install -y gxx 17 | 18 | # Create mapping env hisat3n_env.yml 19 | wget https://raw.githubusercontent.com/lhqing/cemba_data/master/hisat3n_env.yml 20 | mamba env update -f hisat3n_env.yml # this should install things in the base env 21 | 22 | # Install packages 23 | mkdir -p ~/pkg 24 | 25 | # install hisat-3n 26 | cd ~/pkg 27 | git clone 
https://github.com/DaehwanKimLab/hisat2.git hisat-3n 28 | cd hisat-3n 29 | git checkout hisat-3n-dev-directional-mapping-reverse 30 | make 31 | # put hisat-3n in the PATH 32 | echo 'export PATH=$HOME/pkg/hisat-3n:$PATH' >> ~/.bashrc 33 | source ~/.bashrc 34 | echo 'export PATH=$HOME/pkg/hisat-3n:$PATH' >> ~/.zshrc 35 | source ~/.zshrc 36 | 37 | # make sure allcools and yap are up to date 38 | cd ~/pkg 39 | git clone https://github.com/lhqing/cemba_data.git 40 | cd cemba_data 41 | pip install -e . 42 | 43 | cd ~/pkg 44 | git clone https://github.com/lhqing/ALLCools.git 45 | cd ALLCools 46 | pip install -e . 47 | 48 | ## Create genome reference 49 | 50 | # add genome reference file 51 | # prepare and copy specific genome reference file to $HOME 52 | 53 | # prepare a $HOME/mapping.yaml file that records the path of required genome reference files 54 | 55 | # clean unnecessary cache files 56 | mamba clean -y -a 57 | ``` 58 | 59 | ## Actual mapping 60 | 61 | ```bash 62 | mkdir -p ~/mapping 63 | cd ~/mapping 64 | gsutil cp gs://PATH/TO/FASTQ_DIR/fastq ./ 65 | cp ~/pkg/cemba_data/hisat3n/snakefile/SNAKEFILE_YOU_WANT_TO_USE ./Snakefile 66 | 67 | # run snakemake 68 | snakemake --configfile ~/mapping.yaml -j 69 | ``` 70 | 71 | ## Build hisat-3n index 72 | ```bash 73 | # non-repeat index 74 | hisat-3n-build --base-change C,T genome.fa genome 75 | # repeat index 76 | hisat-3n-build --base-change C,T --repeat-index genome.fa genome 77 | # Build the repeat HISAT-3N integrated index with splice site information 78 | hisat-3n-build --base-change C,T --repeat-index --ss genome.ss --exon genome.exon genome.fa genome 79 | ``` 80 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/config/hisat-3n-build.sh: -------------------------------------------------------------------------------- 1 | # normal index 2 | hisat-3n-build --base-change C,T -p THREAD \ 3 | ~/ref/hg38/fasta/with_chrl/hg38_with_chrl.fa \ 4 |
~/ref/hg38/fasta/with_chrl/hg38_with_chrl 5 | 6 | # repeat index 7 | hisat-3n-build --base-change C,T -p THREAD --repeat-index \ 8 | ~/ref/hg38/fasta/with_chrl/hg38_with_chrl.fa \ 9 | ~/ref/hg38/fasta/with_chrl/hg38_with_chrl.repeat 10 | 11 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/config/hisat3n_mapping_env.yaml: -------------------------------------------------------------------------------- 1 | name: mapping 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.8 8 | - pip 9 | - bedtools 10 | - cutadapt 11 | - htslib>=1.9 12 | - natsort 13 | - picard 14 | - pybedtools 15 | - pyBigWig 16 | - pysam 17 | - samtools 18 | - seaborn 19 | - snakemake 20 | - subread=2.0 21 | - yaml 22 | - pip: 23 | - allcools 24 | - cemba_data 25 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/config/vm_init.sh: -------------------------------------------------------------------------------- 1 | sudo yum install -y zsh tree wget screen git nfs-utils make gcc 2 | 3 | wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh 4 | sh Mambaforge-Linux-x86_64.sh -b -p $HOME/mambaforge 5 | rm -f Mambaforge-Linux-x86_64.sh 6 | ./mambaforge/bin/mamba init zsh 7 | ./mambaforge/bin/mamba init bash 8 | 9 | mamba install -y gxx 10 | exec /bin/zsh 11 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/hisat3n_general.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import pathlib 3 | import cemba_data 4 | import subprocess 5 | from ..utilities import get_configuration 6 | 7 | 8 | def bam_read_to_fastq_read(read, read_type=None): 9 | if read_type is None: 10 | if read.is_read1: 11 | read_type = '1' 12 | else: 13 | read_type = '2' 14 | 15 | fastq_record = f"@{read.qname}_{read_type}\n" \ 16 | 
f"{read.query_sequence}\n" \ 17 | f"+\n" \ 18 | f"{read.qual}\n" 19 | return fastq_record 20 | 21 | 22 | def separate_unique_and_multi_align_reads(in_bam_path, 23 | out_unique_path, 24 | out_multi_path, 25 | out_unmappable_path=None, 26 | unmappable_format='auto', 27 | mapq_cutoff=10, 28 | qlen_cutoff=30, 29 | primary_only=True, 30 | read_type=None): 31 | """ 32 | Separate unique aligned, multi-aligned, and unaligned reads from hisat-3n bam file. 33 | 34 | Parameters 35 | ---------- 36 | in_bam_path 37 | Path to hisat-3n bam file. 38 | out_unique_path 39 | Path to output unique aligned bam file. 40 | out_multi_path 41 | Path to output multi-aligned bam file. 42 | out_unmappable_path 43 | Path to output unmappable file. 44 | unmappable_format 45 | Format of unmappable file, only "bam" and "fastq" supported. 46 | mapq_cutoff 47 | MAPQ cutoff for uniquely aligned reads, 48 | note that for hisat-3n, unique aligned reads always have MAPQ=60 49 | qlen_cutoff 50 | read length cutoff for any reads 51 | primary_only 52 | If True, only primary alignments (FLAG 256) are considered for multi-aligned reads. 53 | read_type 54 | read type, only None, "1" and "2" supported. If the BAM file is paired-end, use None. 
55 | Returns 56 | ------- 57 | None 58 | """ 59 | if out_unmappable_path is not None: 60 | if unmappable_format == 'auto': 61 | if out_unmappable_path.endswith('.bam'): 62 | unmappable_format = 'bam' 63 | elif out_unmappable_path.endswith('.fastq'): 64 | unmappable_format = 'fastq' 65 | else: 66 | raise ValueError(f'Unmappable format {unmappable_format} not supported.') 67 | else: 68 | if unmappable_format not in ['bam', 'fastq']: 69 | raise ValueError(f'Unmappable format {unmappable_format} not supported.') 70 | 71 | with pysam.AlignmentFile(in_bam_path, index_filename=None) as bam: 72 | header = bam.header 73 | with pysam.AlignmentFile(out_unique_path, header=header, mode='wb') as unique_bam, \ 74 | pysam.AlignmentFile(out_multi_path, header=header, mode='wb') as multi_bam: 75 | if out_unmappable_path is not None: 76 | if unmappable_format == 'bam': 77 | unmappable_file = pysam.AlignmentFile(out_unmappable_path, header=header, mode='wb') 78 | else: 79 | unmappable_file = open(out_unmappable_path, 'w') 80 | else: 81 | unmappable_file = None 82 | 83 | for read in bam: 84 | # skip reads that are too short 85 | if read.qlen < qlen_cutoff: 86 | continue 87 | 88 | if read.mapq > mapq_cutoff: 89 | unique_bam.write(read) 90 | elif read.mapq > 0: 91 | if primary_only and read.is_secondary: 92 | # skip secondary alignments if primary_only is True, 93 | # read.is_secondary is True when FLAG contains 256. 
94 | continue 95 | multi_bam.write(read) 96 | else: 97 | # unmappable reads 98 | if unmappable_file is not None: 99 | if unmappable_format == 'bam': 100 | unmappable_file.write(read) 101 | else: 102 | unmappable_file.write(bam_read_to_fastq_read(read, read_type=read_type)) 103 | 104 | if unmappable_file is not None: 105 | unmappable_file.close() 106 | return 107 | 108 | 109 | def convert_hisat_bam_strandness(in_bam_path, out_bam_path): 110 | with pysam.AlignmentFile(in_bam_path) as in_bam, \ 111 | pysam.AlignmentFile(out_bam_path, header=in_bam.header, mode='wb') as out_bam: 112 | for read in in_bam: 113 | if read.get_tag('YZ') == '+': 114 | read.is_forward = True 115 | if read.is_paired: 116 | read.mate_is_forward = True 117 | else: 118 | read.is_forward = False 119 | if read.is_paired: 120 | read.mate_is_forward = False 121 | out_bam.write(read) 122 | return 123 | 124 | 125 | def make_snakefile_hisat3n(output_dir): 126 | output_dir = pathlib.Path(output_dir) 127 | 128 | mapping_config_name = list(output_dir.glob('mapping_config.*'))[0].name 129 | 130 | config = get_configuration(output_dir / mapping_config_name) 131 | try: 132 | mode = config['mode'] 133 | except KeyError: 134 | raise KeyError('mode not found in the config file.') 135 | 136 | skip_dirs = ['stats', 'snakemake', 'scool'] 137 | mapping_job_dirs = [p for p in output_dir.glob('*') 138 | if p.is_dir() and (p.name not in skip_dirs)] 139 | 140 | snakemake_dir = output_dir / 'snakemake' 141 | snakemake_dir.mkdir(exist_ok=True) 142 | stats_dir = output_dir / 'stats' 143 | stats_dir.mkdir(exist_ok=True) 144 | 145 | package_dir = cemba_data.__path__[0] 146 | snakefile_path = f'{package_dir}/hisat3n/snakefile/{mode.lower()}.smk' 147 | if not pathlib.Path(snakefile_path).exists(): 148 | print('Possible snakefile templates:') 149 | for p in pathlib.Path(f'{package_dir}/hisat3n/snakefile/').glob('*.smk'): 150 | print(p) 151 | raise ValueError(f'Mode {mode} not supported, ' 152 | f'because Snakefile 
{snakefile_path} not found.') 153 | 154 | for p in mapping_job_dirs: 155 | subprocess.run(['cp', f'{output_dir}/{mapping_config_name}', 156 | f'{p}/{mapping_config_name}'], check=True) 157 | subprocess.run(['cp', snakefile_path, f'{p}/Snakefile'], check=True) 158 | 159 | # leave a flag to indicate using hisat-3n pipeline 160 | subprocess.run(['touch', f'{output_dir}/snakemake/hisat3n'], check=True) 161 | return 162 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/snakefile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lhqing/cemba_data/788e83cd66f3b556bdfacf3485bed9500d381f23/cemba_data/hisat3n/snakefile/__init__.py -------------------------------------------------------------------------------- /cemba_data/hisat3n/stats_col_names.py: -------------------------------------------------------------------------------- 1 | # "DELETE" means this column will be removed from the output file 2 | # "" means the name will not be changed 3 | # "CertainMetricNames" means the name will be changed to "CertainMetricNames" 4 | 5 | 6 | COL_NAMES = { 7 | ('cell_parser_cutadapt_trim_stats', 'status'): 'DELETE', 8 | ('cell_parser_cutadapt_trim_stats', 'in_reads'): 'InputReadPairs', 9 | ('cell_parser_cutadapt_trim_stats', 'in_bp'): 'InputReadPairsBP', 10 | ('cell_parser_cutadapt_trim_stats', 'too_short'): 'DELETE', 11 | ('cell_parser_cutadapt_trim_stats', 'too_long'): 'DELETE', 12 | ('cell_parser_cutadapt_trim_stats', 'too_many_n'): 'DELETE', 13 | ('cell_parser_cutadapt_trim_stats', 'out_reads'): 'TrimmedReadPairs', 14 | ('cell_parser_cutadapt_trim_stats', 'w/adapters'): 'R1WithAdapters', 15 | ('cell_parser_cutadapt_trim_stats', 'qualtrim_bp'): 'R1QualTrimBP', 16 | ('cell_parser_cutadapt_trim_stats', 'out_bp'): 'R1TrimmedReadsBP', 17 | ('cell_parser_cutadapt_trim_stats', 'w/adapters2'): 'R2WithAdapters', 18 | ('cell_parser_cutadapt_trim_stats', 
'qualtrim2_bp'): 'R2QualTrimBP', 19 | ('cell_parser_cutadapt_trim_stats', 'out2_bp'): 'R2TrimmedReadsBP', 20 | ('cell_parser_hisat_summary', 'ReadPairsMappedInPE'): 'DELETE', 21 | ('cell_parser_hisat_summary', 'PEUnmappableReadPairs'): 'DELETE', 22 | ('cell_parser_hisat_summary', 'PEUniqueMappedReadPairs'): 'DELETE', 23 | ('cell_parser_hisat_summary', 'PEMultiMappedReadPairs'): 'DELETE', 24 | ('cell_parser_hisat_summary', 'PEDiscordantlyUniqueMappedReadPairs'): 'DELETE', 25 | ('cell_parser_hisat_summary', 'ReadsMappedInSE'): 'DELETE', 26 | ('cell_parser_hisat_summary', 'SEUnmappableReads'): 'DELETE', 27 | ('cell_parser_hisat_summary', 'SEUniqueMappedReads'): 'DELETE', 28 | ('cell_parser_hisat_summary', 'SEMultiMappedReads'): 'DELETE', 29 | ('cell_parser_hisat_summary', 'UniqueMappedReads'): 'UniqueMappedReads', 30 | ('cell_parser_hisat_summary', 'MultiMappedReads'): 'MultiMappedReads', 31 | ('cell_parser_hisat_summary', 'UniqueMappingRate'): 'UniqueMappingRate', 32 | ('cell_parser_hisat_summary', 'MultiMappingRate'): 'MultiMappingRate', 33 | ('cell_parser_hisat_summary', 'OverallMappingRate'): 'OverallMappingRate', 34 | ('cell_parser_picard_dedup_stat', 'LIBRARY'): 'DELETE', 35 | ('cell_parser_picard_dedup_stat', 'UNPAIRED_READS_EXAMINED'): 'DELETE', 36 | ('cell_parser_picard_dedup_stat', 'READ_PAIRS_EXAMINED'): 'DELETE', 37 | ('cell_parser_picard_dedup_stat', 'SECONDARY_OR_SUPPLEMENTARY_RDS'): 'DELETE', 38 | ('cell_parser_picard_dedup_stat', 'UNMAPPED_READS'): 'DELETE', 39 | ('cell_parser_picard_dedup_stat', 'UNPAIRED_READ_DUPLICATES'): 'DELETE', 40 | ('cell_parser_picard_dedup_stat', 'READ_PAIR_DUPLICATES'): 'DELETE', 41 | ('cell_parser_picard_dedup_stat', 'READ_PAIR_OPTICAL_DUPLICATES'): 'DELETE', 42 | ('cell_parser_picard_dedup_stat', 'PERCENT_DUPLICATION'): 'DELETE', 43 | ('cell_parser_picard_dedup_stat', 'ESTIMATED_LIBRARY_SIZE'): 'DELETE', 44 | ('cell_parser_picard_dedup_stat', 'FinalReads'): '', 45 | ('cell_parser_picard_dedup_stat', 'DuplicatedReads'): '', 
46 | ('cell_parser_picard_dedup_stat', 'PCRDuplicationRate'): '', 47 | ('cell_parser_feature_count_summary', 'Assigned'): 'AssignedRNAReads', 48 | ('cell_parser_feature_count_summary', 'Unassigned_Unmapped'): 'DELETE', 49 | ('cell_parser_feature_count_summary', 'Unassigned_Read_Type'): 'DELETE', 50 | ('cell_parser_feature_count_summary', 'Unassigned_Singleton'): 'DELETE', 51 | ('cell_parser_feature_count_summary', 'Unassigned_MappingQuality'): 'DELETE', 52 | ('cell_parser_feature_count_summary', 'Unassigned_Chimera'): 'DELETE', 53 | ('cell_parser_feature_count_summary', 'Unassigned_FragmentLength'): 'DELETE', 54 | ('cell_parser_feature_count_summary', 'Unassigned_Duplicate'): 'DELETE', 55 | ('cell_parser_feature_count_summary', 'Unassigned_MultiMapping'): 'DELETE', 56 | ('cell_parser_feature_count_summary', 'Unassigned_Secondary'): 'DELETE', 57 | ('cell_parser_feature_count_summary', 'Unassigned_NonSplit'): 'DELETE', 58 | ('cell_parser_feature_count_summary', 'Unassigned_NoFeatures'): 'DELETE', 59 | ('cell_parser_feature_count_summary', 'Unassigned_Overlapping_Length'): 'DELETE', 60 | ('cell_parser_feature_count_summary', 'Unassigned_Ambiguity'): 'DELETE', 61 | ('cell_parser_feature_count_summary', 'Unassigned_Total'): 'UnassignedRNAReads', 62 | ('cell_parser_feature_count_summary', 'AssignedRNAReadsRate'): 'AssignedRNAReadsRate', 63 | ('cell_parser_call_chromatin_contacts', 'cis'): 'CisContacts', 64 | ('cell_parser_call_chromatin_contacts', 'ciscut'): 'CisCutContacts', 65 | ('cell_parser_call_chromatin_contacts', 'cis_multi'): 'CisMultiContacts', 66 | ('cell_parser_call_chromatin_contacts', 'ciscut_multi'): 'CisCutMultiContacts', 67 | ('cell_parser_call_chromatin_contacts', 'trans'): 'TransContacts', 68 | ('cell_parser_call_chromatin_contacts', 'transcut',): 'TransCutContacts', 69 | ('cell_parser_call_chromatin_contacts', 'trans_multi'): 'TransMultiContacts', 70 | ('cell_parser_call_chromatin_contacts', 'transcut_multi'): 'TransCutMultiContacts', 71 | 
('cell_parser_call_chromatin_contacts', 'chimeric'): 'ChimericContacts', 72 | ('cell_parser_call_chromatin_contacts', 'no'): 'NoContacts', 73 | ('cell_parser_call_chromatin_contacts', 'mapped_frag'): 'MappedFragments', 74 | ('cell_parser_call_chromatin_contacts', 'dedup_frag'): 'DeduppedContacts', 75 | ('cell_parser_call_chromatin_contacts', 'dup_rate'): 'ContactsDeduplicationRate', 76 | ('cell_parser_call_chromatin_contacts', 'TotalCisContacts'): '', 77 | ('cell_parser_call_chromatin_contacts', 'TotalTransContacts'): '', 78 | ('cell_parser_call_chromatin_contacts', 'TotalMultiContacts'): '', 79 | ('cell_parser_call_chromatin_contacts', 'CisContactsRatio'): '', 80 | ('cell_parser_call_chromatin_contacts', 'TransContactsRatio'): '', 81 | ('cell_parser_call_chromatin_contacts', 'MultiContactsRatio'): '', 82 | } 83 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/summary.py: -------------------------------------------------------------------------------- 1 | from .stats_parser import * 2 | 3 | 4 | def snmc_summary(): 5 | """ 6 | Generate snmC pipeline MappingSummary.csv.gz and save into cwd 7 | 8 | Returns 9 | ------- 10 | pd.DataFrame 11 | """ 12 | all_stats = [] 13 | 14 | # fastq trimming stats 15 | df = parse_single_stats_set(f'fastq/*.trimmed.stats.txt', 16 | cell_parser_cutadapt_trim_stats) 17 | all_stats.append(df) 18 | 19 | # hisat-3n mapping 20 | df = parse_single_stats_set(f'bam/*.hisat3n_dna_summary.txt', 21 | cell_parser_hisat_summary) 22 | all_stats.append(df) 23 | 24 | # uniquely mapped reads dedup 25 | df = parse_single_stats_set(f'bam/*.unique_align.deduped.matrix.txt', 26 | cell_parser_picard_dedup_stat, prefix='UniqueAlign') 27 | all_stats.append(df) 28 | 29 | # multi mapped reads dedup 30 | df = parse_single_stats_set(f'bam/*.multi_align.deduped.matrix.txt', 31 | cell_parser_picard_dedup_stat, prefix='MultiAlign') 32 | all_stats.append(df) 33 | 34 | # allc count 35 | df = 
parse_single_stats_set(f'allc/*.allc.tsv.gz.count.csv', 36 | cell_parser_allc_count) 37 | all_stats.append(df) 38 | 39 | # concatenate all stats 40 | all_stats = pd.concat(all_stats, axis=1) 41 | all_stats.index.name = 'cell' 42 | all_stats.to_csv(f'MappingSummary.csv.gz') 43 | return all_stats 44 | 45 | 46 | def snmct_summary(): 47 | """ 48 | Generate snmCT pipeline MappingSummary.csv.gz and save into cwd 49 | 50 | Returns 51 | ------- 52 | pd.DataFrame 53 | """ 54 | all_stats = [] 55 | 56 | # fastq trimming stats 57 | df = parse_single_stats_set(f'fastq/*.trimmed.stats.txt', 58 | cell_parser_cutadapt_trim_stats) 59 | all_stats.append(df) 60 | 61 | # hisat-3n DNA mapping 62 | df = parse_single_stats_set(f'bam/*.hisat3n_dna_summary.txt', 63 | cell_parser_hisat_summary, prefix='DNA') 64 | all_stats.append(df) 65 | 66 | # hisat-3n RNA mapping 67 | df = parse_single_stats_set(f'rna_bam/*.hisat3n_rna_summary.txt', 68 | cell_parser_hisat_summary, prefix='RNA') 69 | all_stats.append(df) 70 | 71 | # uniquely mapped reads dedup 72 | df = parse_single_stats_set(f'bam/*.unique_align.deduped.matrix.txt', 73 | cell_parser_picard_dedup_stat, prefix='DNAUniqueAlign') 74 | all_stats.append(df) 75 | 76 | # multi mapped reads dedup 77 | df = parse_single_stats_set(f'bam/*.multi_align.deduped.matrix.txt', 78 | cell_parser_picard_dedup_stat, prefix='DNAMultiAlign') 79 | all_stats.append(df) 80 | 81 | # uniquely mapped dna reads selection 82 | df = parse_single_stats_set('bam/*.hisat3n_dna.unique_align.deduped.dna_reads.reads_mch_frac.csv', 83 | cell_parser_reads_mc_frac_profile, prefix='UniqueAlign') 84 | all_stats.append(df) 85 | 86 | # multi mapped dna reads selection 87 | df = parse_single_stats_set('bam/*.hisat3n_dna.multi_align.deduped.dna_reads.reads_mch_frac.csv', 88 | cell_parser_reads_mc_frac_profile, prefix='MultiAlign') 89 | all_stats.append(df) 90 | 91 | # uniquely mapped rna reads selection 92 | df = 
parse_single_stats_set('rna_bam/*.hisat3n_rna.unique_align.rna_reads.reads_mch_frac.csv', 93 | cell_parser_reads_mc_frac_profile) 94 | all_stats.append(df) 95 | 96 | # allc count 97 | df = parse_single_stats_set(f'allc/*.allc.tsv.gz.count.csv', 98 | cell_parser_allc_count) 99 | all_stats.append(df) 100 | 101 | # feature count 102 | df = parse_single_stats_set(f'rna_bam/*.feature_count.tsv.summary', 103 | cell_parser_feature_count_summary) 104 | all_stats.append(df) 105 | 106 | # concatenate all stats 107 | all_stats = pd.concat(all_stats, axis=1) 108 | all_stats.index.name = 'cell' 109 | all_stats.to_csv(f'MappingSummary.csv.gz') 110 | return all_stats 111 | 112 | 113 | def snm3c_summary(): 114 | """ 115 | Generate snm3C pipeline MappingSummary.csv.gz and save into cwd 116 | 117 | Returns 118 | ------- 119 | pd.DataFrame 120 | """ 121 | all_stats = [] 122 | 123 | # fastq trimming stats 124 | df = parse_single_stats_set(f'fastq/*.trimmed.stats.txt', 125 | cell_parser_cutadapt_trim_stats) 126 | all_stats.append(df) 127 | 128 | # hisat-3n mapping PE 129 | df = parse_single_stats_set(f'bam/*.hisat3n_dna_summary.txt', 130 | cell_parser_hisat_summary) 131 | all_stats.append(df) 132 | 133 | # hisat-3n mapping split-reads SE 134 | df = parse_single_stats_set(f'bam/*.hisat3n_dna_split_reads_summary.txt', 135 | cell_parser_hisat_summary, prefix='SplitReads') 136 | all_stats.append(df) 137 | 138 | # uniquely mapped reads dedup 139 | df = parse_single_stats_set(f'bam/*.all_reads.deduped.matrix.txt', 140 | cell_parser_picard_dedup_stat, prefix='UniqueAlign') 141 | all_stats.append(df) 142 | 143 | # call chromatin contacts 144 | df = parse_single_stats_set(f'hic/*.all_reads.contact_stats.csv', 145 | cell_parser_call_chromatin_contacts) 146 | all_stats.append(df) 147 | 148 | # allc count 149 | df = parse_single_stats_set(f'allc/*.allc.tsv.gz.count.csv', 150 | cell_parser_allc_count) 151 | all_stats.append(df) 152 | 153 | # concatenate all stats 154 | all_stats = 
pd.concat(all_stats, axis=1) 155 | all_stats.index.name = 'cell' 156 | all_stats.to_csv(f'MappingSummary.csv.gz') 157 | return all_stats 158 | -------------------------------------------------------------------------------- /cemba_data/hisat3n/utilities.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import re 3 | import yaml 4 | import pandas as pd 5 | 6 | from ..utilities import get_configuration 7 | 8 | 9 | def _read_yaml_config(config_path): 10 | with open(config_path, 'r') as f: 11 | config = yaml.safe_load(f) 12 | return config 13 | 14 | 15 | def _read_ini_config(config_path): 16 | return get_configuration(config_path) 17 | 18 | 19 | def read_mapping_config(cwd: str = '.'): 20 | tried = [] 21 | yaml_path = None 22 | for name in ['config', 'mapping_config']: 23 | for config_dir in [cwd, f'{cwd}/..']: 24 | for suffix in ['yaml', 'yml']: 25 | path = f'{config_dir}/{name}.{suffix}' 26 | tried.append(path) 27 | if pathlib.Path(path).exists(): 28 | yaml_path = path 29 | default_path = f'~/mapping_config.yaml' 30 | if pathlib.Path(default_path).exists(): 31 | yaml_path = default_path 32 | 33 | ini_path = None 34 | for name in ['config', 'mapping_config']: 35 | for config_dir in [cwd, f'{cwd}/..']: 36 | path = f'{config_dir}/{name}.ini' 37 | tried.append(path) 38 | if pathlib.Path(path).exists(): 39 | ini_path = path 40 | 41 | if yaml_path is not None: 42 | config = _read_yaml_config(yaml_path) 43 | elif ini_path is not None: 44 | config = _read_ini_config(ini_path) 45 | else: 46 | config = {} 47 | return config 48 | 49 | 50 | def validate_cwd_fastq_paths(cwd: str = '.'): 51 | """ 52 | Validate fastq paths in the fastq subdirectory of cwd. 53 | Parameters 54 | ---------- 55 | cwd : 56 | Path of the current working directory. 
57 | 58 | Returns 59 | ------- 60 | fastq_table : pandas.DataFrame 61 | """ 62 | # get all fastq file paths 63 | fastq_paths = [p 64 | for p in pathlib.Path(f'{cwd}/fastq/').glob('*.[fq.gz][fastq.gz]') 65 | if 'trim' not in p.name] 66 | 67 | # parse cell id and match fastq pairs 68 | fastq_pattern = re.compile(r'(?P