├── .gitignore ├── .travis.yml ├── .travis └── install.sh ├── LICENSE ├── MANIFEST.in ├── README.md ├── conda_rck └── meta.yaml ├── docs ├── Adjacencies.md ├── AdjacencyGroups.md ├── Installation.md ├── README.md ├── Segments.md ├── Usage.md └── img │ └── RCK_Overview_vertical.png ├── rck ├── __init__.py ├── core │ ├── __init__.py │ ├── graph.py │ ├── ilp_gurobi.py │ ├── io.py │ ├── process.py │ └── structures.py ├── rck_run.py └── utils │ ├── __init__.py │ ├── adj │ ├── __init__.py │ ├── adjacency_group_inference.py │ ├── adjacency_group_process.py │ ├── adjacency_group_stats.py │ ├── analysis.py │ ├── convert.py │ ├── long_reads.py │ ├── main_chrs.txt │ ├── process.py │ ├── rck_adg_infer.py │ ├── rck_adg_process.py │ ├── rck_adg_stats.py │ ├── rck_adj_long_reads.py │ ├── rck_adj_process.py │ ├── rck_adj_rck2x.py │ ├── rck_adj_stats.py │ ├── rck_adj_x2rck.py │ └── stats.py │ ├── karyotype │ ├── __init__.py │ ├── analysis.py │ ├── rck_kar_graph.py │ └── rck_kar_stats.py │ ├── rck_input_refine.py │ └── scn │ ├── __init__.py │ ├── convert.py │ ├── process.py │ ├── rck_scnb.py │ ├── rck_scnt_process.py │ ├── rck_scnt_rck2x.py │ ├── rck_scnt_stats.py │ ├── rck_scnt_x2rck.py │ └── stats.py ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── test_graph.py └── test_structures.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | analysis/ 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | .idea/ 109 | test_data/ 110 | 111 | **/dev_tmp* -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 3.6 4 | - 3.7 5 | 6 | os: 7 | - linux 8 | dist: xenial 9 | sudo: required 10 | 11 | matrix: 12 | include: 13 | - os: osx 14 | osx_image: xcode10.1 15 | language: generic 16 | env: 17 | - PYTHON=3.6 18 | - os: osx 19 | osx_image: xcode10.1 20 | language: generic 21 | env: 22 | - PYTHON=3.7 23 | 24 | 25 | before_install: 26 | - sudo chmod +x .travis/install.sh && sudo chown $USER .travis/install.sh && /bin/bash .travis/install.sh 27 | 28 | install: 29 | - hash -r 30 | - export PATH="$HOME/miniconda/bin:$PATH" 31 | # - export 
PATH="$HOME/miniconda/envs/test-environment/bin:$PATH" 32 | - echo $PATH 33 | - source activate test-environment 34 | - ls -l $HOME/miniconda/envs/test-environment/bin 35 | - travis_wait 30 pip install -e . 36 | - conda install -c gurobi gurobi 37 | 38 | script: 39 | - which python 40 | - python -c "import gurobi" 41 | - rck --help 42 | - rck-adj-x2rck --help 43 | - rck-adj-x2rck sniffles --help 44 | - rck-adj-x2rck lumpy --help 45 | - rck-adj-x2rck longranger --help 46 | - rck-adj-x2rck naibr --help 47 | - rck-adj-x2rck manta --help 48 | - rck-adj-x2rck grocsv --help 49 | - rck-adj-x2rck delly --help 50 | - rck-adj-x2rck pbsv --help 51 | - rck-adj-x2rck remixt --help 52 | - rck-adj-process --help 53 | - rck-adj-process cat --help 54 | - rck-adj-process reciprocal --help 55 | - rck-adj-process filter --help 56 | - rck-scnt-x2rck --help 57 | - rck-scnt-x2rck titan --help 58 | - rck-scnt-x2rck battenberg --help 59 | - rck-scnt-x2rck hatchet --help 60 | - rck-scnt-x2rck remixt --help -------------------------------------------------------------------------------- /.travis/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$TRAVIS_OS_NAME" = 'osx' ]; then 4 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O ~/miniconda.sh 5 | bash ~/miniconda.sh -b -p $HOME/miniconda 6 | export PATH="$HOME/miniconda/bin:$PATH" 7 | hash -r 8 | conda config --set always_yes yes --set changeps1 no 9 | conda update -q conda 10 | conda info -a 11 | conda create -n test-environment python=$PYTHON 12 | source activate test-environment 13 | fi 14 | 15 | if [ "$TRAVIS_OS_NAME" = 'linux' ]; then 16 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 17 | bash miniconda.sh -b -p $HOME/miniconda 18 | export PATH="$HOME/miniconda/bin:$PATH" 19 | hash -r 20 | conda config --set always_yes yes --set changeps1 no 21 | conda update -q conda 22 | # 
Useful for debugging any issues with conda 23 | conda info -a 24 | conda create -n test-environment python=$TRAVIS_PYTHON_VERSION 25 | source activate test-environment 26 | fi -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010-2017 Raphael Research Group, Princeton University, Princeton, NJ, USA. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.md setup.py 2 | recursive-include rck/utils * 3 | recursive-include docs * 4 | global-exclude __pycache__ 5 | global-exclude *.py[co] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RCK
Reconstruction of clone- and haplotype-specific Cancer Karyotypes 2 | 3 | [![MIT licensed](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/hyperium/hyper/master/LICENSE) 4 | [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/) 5 | [![Build Status](https://travis-ci.com/aganezov/RCK.svg?token=yNaqdjAcHsstx5v8GMKj&branch=master)](https://travis-ci.com/aganezov/RCK) 6 | 7 | 8 | **RCK** - is a method for **R**econstruction of clone- and haplotype-specific **C**ancer **K**aryotypes from tumor mixtures, distributed both as a standalone software package and as a Python library under the MIT licence. 9 | 10 | RCK has been initially designed and developed by Sergey Aganezov in the group of prof. Ben Raphael at Princeton University ([group site](http://compbio.cs.brown.edu/)). 11 | Current development of RCK is continued by Sergey Aganezov in the group of prof. Michael Schatz at Johns Hopkins University ([group site](http://schatz-lab.org/)). 12 | 13 | The full description of the algorithm and its application on published cancer datasets are described in: 14 | 15 | [Sergey Aganezov and Benjamin J. Raphael, 2019](https://www.biorxiv.org/content/10.1101/560839v1) 16 | 17 | ### Contents: 18 | 1. [Algorithm overview](#algorithm-overview) 19 | 2. [Installation](#installation) 20 | 3. [Input preprocessing](#input-preprocessing) 21 | 1. [Novel Adjacencies](#novel-adjacencies) 22 | 2. [Segment copy numbers](#segment-copy-numbers) 23 | 4. [High-level RCK data processing recipe](#RCK-data-processing-recipe) 24 | 5. [Running RCK](#running-rck) 25 | 6. [Results](#results) 26 | 7. [Citation](#citation) 27 | 8. [Issues](#issues) 28 | 29 | ### Algorithm Overview 30 | 31 | ![RCK overview](docs/img/RCK_Overview_vertical.png) 32 | 33 | RCK infers clone- and haplotype-speicifc cancer genome karyotypes from tumor mixtures. 
34 | 35 | RCK assumes that: 36 | * the reference human genome is diploid (except for sex chromosomes) 37 | * somatic evolution is propagated by large scale rearrangements (any type, quantity, etc) that respect the infinite sites assumption (i.e., no genomic location, on either copy of the homologous chromosome, participates in the double-stranded breakage, which are required for a rearrangement to happen, more than once throughout the entire somatic evolutionary history of the tumor); 38 | this can be relaxed for extremity-exclusivity constraint, if in the high confident input novel adjacencies some genomic location is shared. 39 | * no novel genomic locations (unless explicitly specified) can play a role of telomeres in the derived chromosomes 40 | * (approximate) clone- and allele-specific fragment/segment copy numbers are inferred by 3rd-party tools and are part of the input (see more in the [segments docs](docs/Segments.md)) 41 | * (noisy) unlabeled (i.e., without haplotype labels) novel adjacencies (aka structural variants) are inferred by 3rd-party tools and are part of the input (see more in the [adjacencies docs](docs/Adjacencies.md)) 42 | 43 | RCK uses a Diploid Interval Adjacency Graph to represent all possible segments and transitions between them (across all clones and the reference). 44 | RCK then solves an optimization problem of inferring clone- and haplotype-specific karyotypes (i.e., finding clone-specific edge multiplicity functions in the constructed DIAG) as an MILP program. 
45 | Several constraints are taken into consideration (some of which are listed below) during the inference: 46 | * infinite sites compliance (across all clones in the tumor) 47 | * adjacencies grouping (is part of the input, optional) 48 | * false positive for novel adjacencies presence in reconstructed karyotypes 49 | * maximum divergence from input (approximate) allele-specific segment/fragment copy number profile 50 | * preservation of allele-separation across clones in tumor 51 | * telomere locations 52 | 53 | We note, that in contrast to some other cancer karyotype inference methods, RCK model has several advantages, that all work in a unifying computation framework and some/all of which differentiate RCK from other methods: 54 | * any level of sample heterogeneity (on the karyotype level): from homogeneous samples with a single derived clone, to tumor samples comprised of `n` derived genomes 55 | * support for any type of novel adjacencies signature (SV types), including copy-number-neutral ones, as well as the complicated ones arising from chromoplexy/chromothripsis events 56 | * model of diploid reference/non-haploid derived genomes 57 | * explicit control over telomere location during the inference 58 | * explicit fine-grain control over false positive in the novel adjacencies in the input and respectively their utilization in the inference 59 | * haplotype-specific (aka phased) inference both for segments and adjacencies across all clones in the tumor sample 60 | * support for (optional) 3rd-generation sequencing additional information 61 | 62 | ### Installation 63 | 64 | RCK shall work on latest macOS, and main Linux distribution. 65 | RCK is implemented in Python and designed to work with Python 3.7+. 66 | We highly recommend creating an independent python virtual environment for RCK usage. 
67 | 68 | RCK itself can be installed in three different ways: 69 | * [conda](https://conda.io/docs/user-guide/overview.html) `conda install -c aganezov rck` 70 | * [pip (PyPI)](https://pip.pypa.io/en/stable/) `pip install rck` 71 | * source `python setup.py install` 72 | 73 | RCK requires an ILP solver installed on the system, as well as python bindings for it. 74 | Currently only Gurobi ILP solver is supported. 75 | 76 | For more details about installation please refer to the [installation documentation](docs/Installation.md). 77 | 78 | ### Input (preprocessing) 79 | 80 | The minimum input for RCK is comprised of two parts: 81 | 1. Unlabeled novel adjacencies (aka structural variations in the tumor sample) 82 | 2. Clone- and allele-specific segment copy numbers 83 | 84 | Additional input can contain: 85 | * Additional telomere locations 86 | * Segment-, clone-, and allele-specific boundaries (both lower and upper) on inferred copy numbers 87 | * Grouping information about clone-specific novel adjacencies (usually informed by 3rd-generation sequencing data), with individual False Positive rates per each group 88 | * False Positive rates for any subgroup of input novel adjacencies. 89 | 90 | RCK expects the input data to be in a (C/T)SV (Comma/Tab Separated Values) format. 91 | We provide a set of utility tools to convert input data obtained from a lot of state-of-the-art methods outputs into the RCK suitable format. 92 | 93 | #### Novel Adjacencies 94 | Obtaining unlabeled (i.e., without allele-information) novel adjacencies (aka Structural Variants) is not a part of the RCK workflow, as there exist a lot of tools for obtaining those. 95 | We provide a `rck-adj-x2rck` utility to convert output from output format of SV detection tools to the RCK suitable format. 
96 | We currently support converting the output of the following 3rd-party SV detection tools: 97 | * *short-reads* 98 | * **Delly** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3436805/) | [code](https://github.com/dellytools/delly)] 99 | * **Manta** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/26647377) | [code](https://github.com/Illumina/manta)] 100 | * **Lumpy** [[paper](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2014-15-6-r84) | [code](https://github.com/arq5x/lumpy-sv)] 101 | * **BreakDancer** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138716/) | [code](https://github.com/genome/breakdancer)] 102 | * *linked/barcode reads* 103 | * **LongRanger** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4786454/) | [code](https://github.com/10XGenomics/longranger)] 104 | * **GROC-SVs** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/28714986) | [code](https://github.com/grocsvs/grocsvs)] 105 | * **NAIBR** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/29112732) | [code](https://github.com/raphael-group/NAIBR)] 106 | * *long reads* 107 | * **Sniffles** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5990442/) | [code](https://github.com/fritzsedlazeck/Sniffles)] 108 | * **PBSV** [paper | [code](https://github.com/PacificBiosciences/pbsv)] 109 | * *generic* 110 | * **SURVIVOR** [[paper](https://www.nature.com/articles/ncomms14061) | [code](https://github.com/fritzsedlazeck/SURVIVOR)] 111 | 112 | For more information about adjacencies, formats, converting, reciprocality, etc, please refer to [adjacencies documentation](docs/Adjacencies.md) 113 | 114 | #### Segment copy numbers 115 | Obtaining clone- and allele-specific segment copy numbers is not a part of the RCK workflow, as there exist a lof of tools for obtaining those. 116 | We provide a `rck-scnt-x2rck` utility to convert output from output format of other tools that infer clone- and allele-specific segment copy numbers to the RCK suitable format. 
117 | We currently support converting the output of the following 3rd-party tools: 118 | * **HATCHet** [[paper](https://www.biorxiv.org/content/early/2018/12/17/496174) | [code](https://github.com/raphael-group/hatchet)] (*recommended* as it has fewest limitation w.r.t. tumor heterogeneity) 119 | * **TitanCNA** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/25060187) | [code](https://github.com/gavinha/TitanCNA)] 120 | * **Battenberg** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/22608083) | [code](https://github.com/cancerit/cgpBattenberg)] 121 | * **ReMixT** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/28750660) | [code](https://bitbucket.org/dranew/remixt)] 122 | * **Ginkgo** [[paper](https://www.nature.com/articles/nmeth.3578) | [code](https://github.com/robertaboukhalil/ginkgo)] (Attention! *haploid* mode only) 123 | 124 | ## RCK data processing recipe 125 | For the most cases the cancer sample of interest is initially represented via a set `cancer.sr.fastq` of reads obtained via a sequencer. 126 | Additionally, a sequenced reads `normal.sr.fastq` from a matching normal sample need to be available. 127 | Most often case of analysis consists of having a standard Illumina paired-end sequenced reads for both the tumor and the matching normal. 128 | Increasingly 3rd-generation sequencing technologies are being utilized in cancer analysis. 129 | Let us assume that there may optionally be a set `cancer.lr.fastq` of reads for the cancer sample in question obtained via 3rd-generation sequencing technology. 130 | 131 | 1. Align sequenced reads (with you aligner of choice) `cancer.sr.fastq` and `normal.sr.fastq` for cancer and a matching normal samples to obtain `cancer.sr.bam` and `normal.sr.bam` 132 | 1. Optionally align sequenced long reads `cancer.lr.fastq` to obtain `cancer.lr.bam` 133 | 2. Run a tool of you choosing on `cancer.sr.fastq` to obtain a novel adjacencies VCF file `cancer.sr.vcf` 134 | 1. 
Optionally infer novel adjacencies on long-read dataset obtaining `cancer.lr.vcf` 135 | 2. Merge short- and long-read novel adjacencies into a unified set `cancer.vcf` (we suggest using SURVIVOR tool [[code](https://github.com/fritzsedlazeck/SURVIVOR) | [paper](https://www.nature.com/articles/ncomms14061)] for this task) 136 | 3. Convert novel adjacencies from VCF file `cancer.vcf` to the `RCK` input format via `rck-adj-x2rck x cancer.vcf -o input.rck.adj.tsv`, where `x` stands for the novel adjacency inference tool. 137 | Please, see [adjacencies docs](docs/Adjacencies.md) for list of supported tools and more detailed instructions on conversion. 138 | 4. Run any of the supported tools (HATCHet, TitanCNA, Battenberg, ReMixT) of choice to infer large-scale clone- and allele-specific fragment copy numbers `CN.data` (generic name of the tool-specific result) 139 | 5. Convert tool-specific copy-number data `CN.data` into `RCK` format via `rck-scnt-x2rck x CN-data -o input.rck.scnt.tsv`, where `x` stands for copy number inference tool. 140 | Please, see [segments docs](docs/Segments.md) for link to specific methods, as well as details on how to run conversion. 141 | 6. Run `RCK` 142 | 143 | ### Running RCK 144 | We provide the `rck` tool to run the main RCK algorithm for clone- and haplotype specific cancer karyotypes reconstruction. 145 | 146 | With the minimum input for RCK the following is the example of running RCK: 147 | 148 | ````bash 149 | rck --scnt input.rck.scnt.tsv --adjacencies input.rck.adj.tsv 150 | ```` 151 | 152 | where: 153 | * `--scnt` corresponds to the clone- and allele-specific segments copy number input 154 | * `--adjacencies` corresponds to the unlabeled novel adjacencies input 155 | 156 | Additionally one can specify the `--workdir` working directory, where the input, preprocessing, and the output will be stored. 157 | For more on the `rck` command usage please refer to [usage documentation](docs/Usage.md). 
158 | 159 | ### Results 160 | Here is the description of the results produced by `rck` main method for cancer karyotype reconstruction. 161 | For results on segment/adjacency conversion/processing, please refer to respective [segment](docs/Segments.md)/[adjacency](docs/Adjacencies.md) documentations. 162 | 163 | RCK's cancer karyotype reconstruction is stored in the `output` subdirectory in the working directory (the `--workdir`). 164 | The following two files depict the inferred clone- and haplotype-specific karyotypes: 165 | * `rck.scnt.tsv` - clone- and haplotype-specific segments copy numbers; 166 | * `rck.acnt.tsv` - clone- and haplotype-specific adjacencies copy numbers; 167 | 168 | For information about the format of the inferred clone- and haplotype-specific copy numbers on segments/adjacencies please refer to [segment](docs/Segments.md)/[adjacency](docs/Adjacencies.md) documentations 169 | 170 | Results in the original [manuscript](https://www.biorxiv.org/content/10.1101/560839v1) can be found in the [dedicated Github repository](https://github.com/aganezov/RCK-pub-data). 171 | 172 | ### Citation 173 | When using RCK's cancer karyotype reconstruction algorithm or any of RCK's utilities, please cite the following paper: 174 | 175 | [Sergey Aganezov and Benjamin J. Raphael, 2019](https://www.biorxiv.org/content/10.1101/560839v1) 176 | 177 | ### Issues 178 | If you experience any issues with RCK installation, usage, or results or want to see RCK enhanced in any way, shape or form, please create an issue on RCK [issue tracker](https://github.com/aganezov/RCK/issues). 179 | Please, make sure to specify the RCK's, Python's, and Gurobi's versions in question, and, if possible, provide (minimized) data, on which the issue(s) occur(s). 
180 | 181 | If you want to discuss any avenues of collaboration, custom RCK applications, etc, please contact Sergey Aganezov at *aganezov(at)jhu.edu* or *sergeyaganezovjr(at)gmail.com* 182 | -------------------------------------------------------------------------------- /conda_rck/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "RCK" %} 2 | {% set version = "1.1.0" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ version }}" 7 | 8 | source: 9 | url: https://files.pythonhosted.org/packages/fd/85/69c8ba6c6e80e9d9acbb40365601ee0632f8ce7c8b61e051249a8db70595/RCK-1.1.0.tar.gz 10 | sha256: 522e7965be2eb3ed7089b37e05bb795846a7f78ec658815ad229779cd4874ca5 11 | 12 | build: 13 | number: 0 14 | script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vvv " 15 | 16 | requirements: 17 | host: 18 | - networkx >=2 19 | - scipy 20 | - pyvcf 21 | - pysam 22 | - sortedcontainers 23 | - pandas 24 | - gurobi 25 | - pip 26 | - gffutils 27 | - python >=3.6 28 | run: 29 | - networkx >=2 30 | - scipy 31 | - pyvcf 32 | - pysam 33 | - sortedcontainers 34 | - pandas 35 | - gurobi 36 | - pip 37 | - gffutils 38 | - python >=3.6 39 | 40 | test: 41 | imports: 42 | - rck 43 | - rck.core 44 | - rck.utils 45 | - rck.utils.adj 46 | - rck.utils.scn 47 | commands: 48 | - rck --help # [not win] 49 | - rck-adj-x2rck --help # [not win] 50 | - rck-adj-x2rck sniffles --help # [not win] 51 | - rck-adj-x2rck lumpy --help # [not win] 52 | - rck-adj-x2rck longranger --help # [not win] 53 | - rck-adj-x2rck naibr --help # [not win] 54 | - rck-adj-x2rck manta --help # [not win] 55 | - rck-adj-x2rck grocsvs --help # [not win] 56 | - rck-adj-x2rck delly --help # [not win] 57 | - rck-adj-x2rck pbsv --help # [not win] 58 | - rck-adj-x2rck remixt --help # [not win] 59 | - rck-adj-process --help # [not win] 60 | - rck-adj-process cat --help # [not win] 61 | - rck-adj-process reciprocal --help # [not win] 62 | - rck-adj-process 
filter --help # [not win] 63 | - rck-scnt-x2rck --help # [not win] 64 | - rck-scnt-x2rck titan --help # [not win] 65 | - rck-scnt-x2rck battenberg --help # [not win] 66 | - rck-scnt-x2rck hatchet --help # [not win] 67 | - rck-scnt-x2rck remixt --help # [not win] 68 | - rck-scnt-process --help # [not win] 69 | - rck-scnt-process refine --help # [not win] 70 | - rck-scnt-process align --help # [not win] 71 | - rck-scnt-process distance --help # [not win] 72 | - rck-scnb --help # [not win] 73 | - rck-input-refine --help # [not win] 74 | 75 | about: 76 | home: https://github.com/aganezov/rck 77 | 78 | extra: 79 | recipe-maintainers: 80 | - aganezov 81 | -------------------------------------------------------------------------------- /docs/Adjacencies.md: -------------------------------------------------------------------------------- 1 | # Adjacencies 2 | 3 | ### Contents: 4 | * [Adjacencies overview](#adjacencies-overview) 5 | * [RCK adjacency format](#rck-adjacency-format) 6 | * [inferred clone- and haplotype-specific adjacency copy numbers](#inferred-clone--and-haplotype-specific-adjacency-copy-numbers) 7 | * [Converting to RCK format from SV detection tools](#converting-to-rck-format-from-sv-detection-tools) 8 | * [Processing RCK adjacencies](#processing-rck-adjacencies) 9 | 10 | ### Adjacencies overview 11 | One of the key concepts in the RCK model is the notion of *adjacency*. 12 | Adjacency is a transition between segment's extremities. 13 | There are two kinds of adjacencies: 14 | * **reference** (present in the reference genome, or inherited by the derived cancer genome(s)) 15 | * **novel** (present in derive genomes only). 16 | 17 | Every adjacency `{(q,x,+|-),(p,y,+|-)}` describes a transition from (right|left) side of loci at coordinate `x` on chromosome `q` to (right|left) side of the loci at coordinate `y` on the chromosome `p`. 
18 | Reference adjacencies naturally have a form of `{(chr,x,+),(chr,x+1,-)}` (i.e., on the same chromosome, neighbouring positions, and respective extremities via strands orientation). 19 | 20 | We call two adjacencies reciprocal if some pair of their extremities resemble a reference adjacency. 21 | For example, adjacencies `{(1,123450,+),(1,4567890,+)}` and `{(1,123451,-),(1,876534,+)}` are reciprocal, because extremity `(1,123450,+)` form first adjacency one, and extremity `(1,123451,-)` from second adjacency resemble a reference adjacency `{(1,123450,+),(1,123451,-)}` 22 | 23 | While sometimes novel adjacencies are classified as insertion, deletion, duplication, reversals (aka inversion), translocation, etc. 24 | This is usually done by looking at chromosomes, coordinates, and strands of involved extremities. 25 | For example insertion and deletion has the same *signature* (i.e., same chromosome, `+` strand on the leftmost extremity, and `-` on the rightmost extremity). 26 | Duplication has a signature of `-` strand followed by the `-` strand on the same chromosome. 27 | Reversal (event) usually involves two reciprocal novel adjacencies, with `+`,`+` strand signature on one, and `-`, `-` signature on another adjacency. 28 | 29 | While, indeed, aforementioned annotation correspond to cases, where respective rearrangement events would produce such novel adjacencies, 30 | it can also be the case that novel adjacencies that resemble signatures described above can be produced by more complex rearrangement events, such as *chromoplexy* and *chromothripsis*. 31 | 32 | ### RCK adjacency format 33 | RCK works with adjacencies in the following (T/C)SV (Tab/Comma Separated Values) text format: 34 | 35 | ```` 36 | aid chr1 coord1 strand1 chr2 coord2 strand2 extra 37 | ```` 38 | where every entry thus describes an adjacency `{(chr1, coord1, strand1), (chr2, coord2, strand2)}` with an id of `aid`. 
39 | The `extra` field is designed to store `key=value` pairs of additional information about adjacencies, with entries being separated by `;`. 40 | 41 | There are several special extra fields, that RCK relies upon, when working: 42 | * `aid` -- copy of the `aid`. When reading adjacencies, id for the adjacency will be based on the column, not the extra field. 43 | * `cn` -- copy number values (refer to the following [subsection](#inferred-clone--and-haplotype-specific-adjacency-copy-numbers)) 44 | * `at` -- adjacency type (either `N` for novel (default), or `R` for reference). By default all adjacencies are considered to be novel, unless the adjacency id starts with the lower-case `r`. 45 | 46 | #### inferred clone- and haplotype-specific adjacency copy numbers 47 | The results of the main RCK algorithm (via `rck` executable) contains the `rck.acnt.tsv` file, with entries following the RCK adjacencies format. 48 | 49 | Both novel and reference adjacencies are output in the result, and depending on the ``--o-acnt-mix-novel-and-reference`` novel and reference adjacencies are either going to be mixed together, or separated with novel adjacencies followed by the reference ones. 50 | While the adjacencies themselves are self-explanatory, the main important piece of information about them is the `cn` field in the `extra` column, that encodes the clone- and haplotype-specific copy number values. 51 | 52 | The `cn` value is a python/JSON dict with the following structure: 53 | 54 | ``` 55 | { 56 | 'clone_id': { 57 | 'AA': int, 58 | 'AB': int, 59 | 'BA': int, 60 | 'BB': int 61 | }, 62 | ... 63 | } 64 | ``` 65 | where `clone_id` corresponds to the clone, for which haplotype-specific copy numbers are provided, and the 66 | `AA`, `AB`, `BA`, `BB` entries encode the copy number of the (haplotype) labeled versions of the adjacency (where the first position is labeled with the first haplotype letter, and the second position is labeled with the second haplotype letter). 
67 | 68 | In the following example: 69 | ```` 70 | aid chr1 coord1 strand1 chr2 coord2 strand2 extra 71 | id1 1 123450 + 1 123760 - cn={'c1':{'AA': 1, 'AB': 0, 'BA': 0, 'BB':0}, 'c2': {'AA': 2, 'AB': 0, 'BA': 0, 'BB':0}} 72 | ```` 73 | where for the novel adjacency `{(1,123450,+),(1,123760,-)}` with id `id1` the following labeled adjacency `{(1,123450,+,A),(1,123760,-,A)}` has a copy number 1 in clone `c1` and copy number 2 in clone `c2`. 74 | 75 | ### Converting to RCK format from SV detection tools 76 | RCK installation adds `rck-adj-x2rck` adjacency-conversion executable tool to the `PATH` of your installation environment. 77 | With the help of `rck-adj-x2rck` one can convert (unlabeled) novel adjacency predictions from the following tools: 78 | 79 | * *short-reads* 80 | * **Delly** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3436805/) | [code](https://github.com/dellytools/delly)] 81 | * **Manta** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/26647377) | [code](https://github.com/Illumina/manta)] 82 | * **Lumpy** [[paper](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2014-15-6-r84) | [code](https://github.com/arq5x/lumpy-sv)] 83 | * **BreakDancer** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138716/) | [code](https://github.com/genome/breakdancer)] 84 | * *linked/barcode reads* 85 | * **LongRanger** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4786454/) | [code](https://github.com/10XGenomics/longranger)] 86 | * **GROC-SVs** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/28714986) | [code](https://github.com/grocsvs/grocsvs)] 87 | * **NAIBR** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/29112732) | [code](https://github.com/raphael-group/NAIBR)] 88 | * *long reads* 89 | * **Sniffles** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5990442/) | [code](https://github.com/fritzsedlazeck/Sniffles)] 90 | * **PBSV** [paper | [code](https://github.com/PacificBiosciences/pbsv)] 91 | 92 | The input is converted to the RCK 
adjacency format (described [above](#rck-adjacency-format)). 93 | `rck-adj-x2rck` tries to retain as much as possible of extra information from the SV detection tools input during the conversion, and such information is stored in the `extra` column. 94 | 95 | For help message of `rck-adj-x2rck`run: 96 | ````bash 97 | rck-adj-x2rck --help 98 | ```` 99 | 100 | To get help in converting adjacency prediction from a specific tool `x` run: 101 | ````bash 102 | rck-adj-x2rck x --help 103 | ```` 104 | 105 | The following optional command line arguments are shared for all of the input sources and can be beneficial to use: 106 | * `-o` | `--output` -- output file (default is `stdout`) 107 | * `--id-suffix` -- a suffix, that will be appended to every input adjacency id, when transforming to RCK format. 108 | This can be beneficial when working with several input sources of adjacencies, and one would want to differentiate based on the source 109 | 110 | An example of converting adjacency prediction by `Sniffles` tool to the RCK suitable format: 111 | ````bash 112 | rck-adj-x2rck sniffles SV_predictions.vcf --id-suffix sample-technology-sniffles -o sample-technology-sniffles.rck.adj.tsv 113 | ```` 114 | Which will convert SV prediction in `SV_predictions.vcf` produced by Sniffles in sample `sample` that was sequenced with technology `technology` into the RCK formatted adjacency calls in the file `sample-technology-sniffles.rck.adj.tsv`. 115 | 116 | All converted adjacencies will have `id_sample-technology-sniffles`, where `id` is the VCF id provided by Sniffles. 117 | 118 | Not that the `--id-suffix` value here is provided as an example and is not mandatory to (i) be prent at all (default value is empty string) (ii) be in the form of `sample-technology-method`, though we found it useful on several occasions. 119 | Depending on your needs a different suffix values may be more useful. 
120 | 121 | ### Processing RCK adjacencies 122 | RCK installation adds `rck-adj-process` adjacency processing executable tool to `PATH` of your installation environment. 123 | For `rck-adj-process` the following commands are available: 124 | * `cat` -- combining adjacencies from 1+ inputs into a single one 125 | * `reciprocal` -- updating extremities of adjacencies in the input, so that pairs of extremities of distinct adjacencies that resemble reciprocality, but are tno exactly 1 bp apart, are brought together. 126 | This option ran by default in the main `rck` executable, unless explicitly suppressed. 127 | 128 | Running `rck-adj-process command --help` provides one with the help on usage of each particular command. 129 | 130 | -------------------------------------------------------------------------------- /docs/AdjacencyGroups.md: -------------------------------------------------------------------------------- 1 | # Adjacencies Groups 2 | 3 | ### Contents: 4 | * [Adjacencies Groups overview](#adjacencies-groups-overview) 5 | * [RCK Adjacencies Groups format](#rck-adjacencies-groups-format) 6 | * [Molecule Adjacencies Group](#molecule-adjacencies-group) 7 | 8 | 9 | ### Adjacencies Groups overview 10 | 3rd-generations sequencing experiments can produce groups of novel adjacencies for which we can infer additional, useful, information. 11 | We assume that all (novel) adjacencies are provided in the RCK input, but then we allow additional "grouping" information. 12 | 13 | ### RCK Adjacencies Groups format 14 | Adjacencies groups information is accepted into RCK workflow via the `--adjacency-groups` option that ust point to a file with RCK formatted adjacencies groups. 
15 | RCK works with adjacencies groups in the following (T/C)SV (Tab/Comma Separated Values) text format: 16 | ``` 17 | gid aids extra 18 | ``` 19 | where every entry describes a subset of the adjacencies, that are part of the RCK input, with the group id of `gid`, and adjacencies ids in the group listed in a comma-separated fashion in the `aids` column. 20 | Comma-separated values in the `aids` column must match entries in the `aid` column in the RCK input adjacencies file. 21 | The extra field is designed to store `key=value` pairs of additional information about each adjacency groups, with entries being separated by `;`. 22 | 23 | There are several special extra fields, that RCK relies upon, when working: 24 | 25 | * `agt` -- adjacencies group type (`M` for [molecule groups](#molecule-adjacencies-group)). 26 | * `fp` -- maximum false positive (fraction) value for the adjacencies group 27 | 28 | ### Molecule Adjacencies Group 29 | This type of adjacencies groups usually comes from the 3rd-generation sequencing experiments as either a group of adjacencies supported by a single long read (long read sequencing), 30 | or a predicted by short reads with the same barcode (10x Genomics sequencing data), or coming from a single cell experiment. 31 | One way or the other, all of the adjacencies in the molecule group come from a single clone: either the single (part) of the derived chromosome (long reads + barcoded cases), which is in a single cell, which represents a single clone, 32 | or from a real single cell (single cell sequencing source), which, again, represents a single clone. 33 | 34 | Every group in the input adjacencies groups file with the entry `agt=M;` in the `extra` column is treated as the molecule adjacency group. 
35 | 36 | So for every molecule group `U` comprised of `|U|` input adjacencies with a False Positive values of `f`, RCK forces that in at least one of the reconstructed clones, there will be at least `(1-f)*|U|` labeled representations of novel adjacencies from `U` present. 37 | We note, that such a constraint does not imply that only in one clone labeled realizations of adjacencies from `U` can be present. 38 | There may be several clones, in which different subsets of adjacencies from `U` have their labeled realizations present, but the constraints guaranties that in at least one clone, there will be `(1-f)|U|` of them. 39 | 40 | -------------------------------------------------------------------------------- /docs/Installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | 4 | ### Contents: 5 | * [Virtual environment](#virtual-environment) 6 | * [conda](#conda-recommended) 7 | * [pip](#pip) 8 | * [source](#source) 9 | * [Gurobi](#gurobi) 10 | * [Python bindings for Gurobi](#python-bindings-for-gurobi) 11 | * [Executables](#executables) 12 | 13 | ### virtual environment 14 | We recommend that RCK is installed in the isolated virtual environments. 15 | Virtual environments can be created via `anaconda` and `python` (w.r.t. RCK, which is written in Python). 16 | 17 | To create a virtual environment (named `rck-env`) with anaconda, run the following command: 18 | ````bash 19 | conda create --name rck-env python=3.7 20 | ```` 21 | 22 | To create a virtual environment (named `rck-env`) with python, run the following command: 23 | ````bash 24 | python -m venv rck-env 25 | ```` 26 | 27 | If virtual environments are used (which, again, we recommend), we assume that the environment is activated. 
28 | 29 | ### conda (recommended) 30 | Run the following conda command, to install RCK: 31 | ````bash 32 | conda install -c aganezov rck 33 | ```` 34 | 35 | Installation via conda automatically takes care of Gurobi python bindings (refer to respective [subsection](#python-bindings-for-gurobi)), and everything shall work from this part (assuming that Gurobi is correctly installed and working). 36 | 37 | ### pip 38 | 39 | Run the following command, to install RCK: 40 | ````bash 41 | pip install rck 42 | ```` 43 | 44 | **WARNING**: this installation does take care of python bindings for Gurobi. Please, refer to respective [subsection](#python-bindings-for-gurobi) on how that can be addressed. 45 | 46 | ### source 47 | 48 | First, download the source code. Example is shown below: 49 | ````bash 50 | git clone https://github.com/aganezov/RCK.git 51 | ```` 52 | 53 | then run the following command from the RCK source folder: 54 | ````bash 55 | python setup.py install 56 | ```` 57 | 58 | **WARNING**: this installation does take care of python bindings for Gurobi. Please, refer to respective [subsection](#python-bindings-for-gurobi) on how that can be addressed. 59 | 60 | ### Gurobi 61 | [Gurobi](http://www.gurobi.com/) solver can be obtained from the official web site and installation procedure is also described there. 62 | Gurobi requires a valid license to run. 63 | Licensing [information](http://www.gurobi.com/downloads/licenses/license-center) is provided on the official website, and is available for free for individual academic users. 64 | More complicated setups with multi-user and cluster licenses are also available (and described on the official Gurobi website). 65 | Contact your university IT support for more information about any complication with Gurobi licensing and setup. 66 | 67 | RCK expects that Gurobi is installed on the machine in question. 68 | RCK requires python bindings be installed (in the virtual environment, if you use it (which we recommend)). 
69 | Refer to the next [subsection](#python-bindings-for-gurobi) for details on how this can be addressed. 70 | 71 | ##### Python bindings for Gurobi 72 | RCK requires python bindings be installed (in the virtual environment, if you use it (which we recommend)). 73 | The following [documentation](https://www.gurobi.com/documentation/8.1/quickstart_windows/the_gurobi_python_interfac.html) of the Gurobi website explains how an installation of such bindings can be done. 74 | 75 | Recommended way is via anaconda. 76 | Regardless of whether conda is used for virtual environment, os just in general, the following command will install Python Guorbi bindings: 77 | ````bash 78 | conda install -c gurobi gurobi 79 | ```` 80 | 81 | If not using conda, one needs to go to the Gurobi installation dir and locate the `setup.py` file and run the following command: 82 | ````bash 83 | python setup.py install 84 | ```` 85 | Note that this way is deprecated by Gurobi. 86 | 87 | 88 | ### Executables 89 | Installation of RCK adds several executables to your `PATH` (if using virtual environment, this executables will be accessible only when the environment is activated): 90 | * `rck` - main executable that runs the RCK inference algorithm 91 | * `rck-adj-x2rck` - conversion of SV prediction from several 3rd-party SV prediction tools (refer to respective [docs section](Adjacencies.md#converting-to-rck-format-from-sv-detection-tools) for more details) 92 | * `rck-adj-process` - various processing options for RCK formatted adjacencies 93 | * `rck-scnt-x2rck` - conversion of the clone- and allele-specific segment copy number predictions by 3rd-party tools (refer to respective [docs section](Segments.md#converting-to-rck-format-from-clone--and-allele-specific-inference-tools) for more details) 94 | * `rck-scnt-process` - various processing options for RCK formatted segments, copy number, boundaries, etc. 
-------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | Documentation for RCK is comprised of several documents. 2 | Information addresses the basic concepts, provides some examples, and covers usage of tools in the RCK distribution. 3 | 4 | ### Contents: 5 | * [Adjacencies](Adjacencies.md) 6 | * [Adjacencies Groups](AdjacencyGroups.md) 7 | * [Segments](Segments.md) 8 | * [Installation](Installation.md) 9 | * [Usage](Usage.md) -------------------------------------------------------------------------------- /docs/Segments.md: -------------------------------------------------------------------------------- 1 | # Segments 2 | 3 | ### Contents: 4 | * [Segments overview](#segments-overview) 5 | * [RCK segments format](#rck-segments-format) 6 | * [inferred clone- and haplotype-specific segment copy numbers](#inferred-clone--and-haplotype-specific-segment-copy-numbers) 7 | * [input clone- and allle-specific segment copy numbers](#input-allele--and-clone-specific-segment-copy-numbers) 8 | * [Converting to RCK format from clone- and allele-specific inference tools](#converting-to-rck-format-from-clone--and-allele-specific-inference-tools) 9 | * [Processing RCK segments](#processing-rck-segments) 10 | * [Segments vs fragments](#segments-vs-fragments) 11 | 12 | ### Segments overview 13 | 14 | One of the key concepts in the RCK model if the notion of *segment*. 15 | A segment `(chr,start,end)` represents a continuous part of the reference genome's chromosome `chr` starting at `start` and ending at `end` (both inclusive). 16 | A segment `s=(chr,start,end)` naturally has two extremities `(chr,start,-)` and `(chr,end,+)` corresponding to tail `s^t` and head `s^h` of the segment `s`. 
17 | In a diploid reference genome every segment `(chr,start,end)` (except for segment on sex chromosomes) has two haplotype-specific copies `(chr,start,end,A)` and `(chr,start,end,B)` respectively. 18 | 19 | Reference adjacencies correspond to pairs of adjacent extremities of consecutive segments. 20 | For example, two consecutive segments `a=(chr,10001,20000)` and `b=(chr,20001,30000)` determine a reference adjacency `{(chr,20000,+),(chr,20001,-)}`. 21 | Naturally for every chromosome that has two homologous copies, for every unlabeled reference adjacency `{(chr,20000,+),(chr,20001,-)}` there are two labeled reference adjacency counterparts: 22 | `{(chr,20000,+,A),(chr,20001,-,A)}` and `{(chr,20000,+,B),(chr,20001,-,B)}`. 23 | 24 | ### RCK segments format 25 | RCK works with segments in the following (T/C)SV (Tab/Comma Separated Values) text format (similar to that of bedpe): 26 | ```` 27 | chr start end extra 28 | ```` 29 | where every entry describes a segment `(chr,start,end)`. 30 | The `extra` field is designated to store `key=value` pairs of additional information about segments, with entries being separated by `;`. 31 | 32 | There are several special extra fields, that RCK relies upon, when working: 33 | * `cn` -- clone and allele/haplotype-specific copy number values of the segment (refer to the following [subsection](#inferred-clone--and-haplotype-specific-segment-copy-numbers)) 34 | * `cnb` -- clone and allele/haplotype-specific copy number boundaries of the segment (refer to the respective [subsection](#copy-number-boundaries)) 35 | 36 | #### inferred clone- and haplotype-specific segment copy numbers 37 | The result of the main RCK algorithm (via `rck` executable) contains the `rck.scnt.tsv` file, with entries following the RCK segments format. 38 | While the segments themselves are self-explanatory, the main important peace of information about them is the `cn` field in the `extra` column, that encode clone- and haplotype-specific copy number values. 
39 | 40 | The `cn` value is a python/JSON dict with the following structure: 41 | ``` 42 | { 43 | 'clone_id' : { 44 | 'A': int, 45 | 'B'; int 46 | }, 47 | ... 48 | } 49 | ``` 50 | where `clone_id` corresponds to the clone, for which haplotype-specific copy numbers are provided, with the `A` and `B` entries encoding the copy number of the multiplicity of the corresponding haplotype-specific segments. 51 | 52 | In the following example: 53 | ```` 54 | chr start end extra 55 | 1 10000 20000 cn={'c1':{'A': 1, 'B': 2}, 'c2':{'A': 3 'B': 2}} 56 | ```` 57 | where for the segment `(1,10000,20000)` its `A` haplotype-specific version has `1` copy in clone `c1` and `3` copies in clone `c2`, and its `B` haplotype-specific version has `2` copies in clone `c1` and `2` copies on clone `c2`. 58 | 59 | #### input allele- and clone-specific segment copy numbers 60 | The input for the `RCK` method expects clone- and *allele*-specific (approximate) segment copy numbers. 61 | While most methods follow the notion of `major` and `minor` alleles (based on the segment copy numbers), we employ the same format of `cn` field, as was described in the previous [subsection](#inferred-clone--and-haplotype-specific-segment-copy-numbers) 62 | The only and major difference is, that while the RCK output is haplotype-specific (i.e., `A` and `B` are matching and the same for every segment), the input is allele-specific (i.e., `A` and `B` entries do not necessarily match and can be"flipped"). 63 | 64 | ### Converting to RCK format from clone- and allele-specific inference tools 65 | RCK installation adds `rck-scnt-x2rck` segment copy number conversion executable tool to the `PATH` of your installation environment. 
66 | With the help of `rck-scnt-x2rck` one can convert clone- and allele-specific prediction from the following tools: 67 | * **HATCHet** [[paper](https://www.biorxiv.org/content/early/2018/12/17/496174) | [code](https://github.com/raphael-group/hatchet)] (*recommended* as it has fewest limitation w.r.t. tumor heterogeneity) 68 | * **TitanCNA** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/25060187) | [code](https://github.com/gavinha/TitanCNA)] 69 | * **Battenberg** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/22608083) | [code](https://github.com/cancerit/cgpBattenberg)] 70 | * **ReMixT** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/28750660) | [code](https://bitbucket.org/dranew/remixt)] 71 | * **Ginkgo** [[paper](https://www.nature.com/articles/nmeth.3578) | [code](https://github.com/robertaboukhalil/ginkgo)] (Attention! *haploid* mode only) 72 | 73 | For help message of `rck-scnt-x2rck` run 74 | ````bash 75 | rck-scnt-x2rck --help 76 | ```` 77 | 78 | To get help in converting clone- and allele-specific predictions from a specific tool `x` run: 79 | ````bash 80 | rck-scnt-x2rck x --help 81 | ```` 82 | 83 | ### Processing RCK segments 84 | RCK installation adds `rck-scnt-process` segment copy number processing executable tool to the `PATH` of your installation environment. 85 | For `rck-scnt-process` the following commands are available: 86 | * `align` -- aligning segments (and corresponding segment copy number tensors) form 1+ segment copy number tensord 87 | * `refine` -- filling the missing spans in entries, of merging consecutive entries that have the same clone- and allele/haplotype-specific copy numbers. 88 | This option ran by default in the main `rck` executable, unless explicitly suppressed. 89 | 90 | Running `rck-scnt-process command --help` provides one with the help on usage of each particular command. 91 | 92 | 93 | ### Segments vs fragments 94 | A segment and a fragment are of the same nature: a consecutive part of the reference chromosome. 
95 | Input clone- and allele-specific copy number are usually inferred on rather large spans, which we treat as fragments. 96 | Such fragments are further fragmented into smaller, actual segments, based on the novel adjacencies, in such a way that novel adjacencies involve segments' extremities. 97 | With smaller segments, we still retain information about allele-separation based on fragments, which span smaller segments. 98 | -------------------------------------------------------------------------------- /docs/Usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | ### Contents: 4 | * [Input data](#input-data) 5 | * [Mandatory](#mandatory) 6 | * [Optional](#optional) 7 | * [Running RCK](#running-rck) 8 | * [preprocessing options](#preprocessing-options) 9 | * [running options](#running-options) 10 | * [Examples](#examples) 11 | 12 | ### Input data 13 | 14 | #### Mandatory 15 | RCK expects two mandatory peaces of the input: 16 | * `--scnt` - clone- and allele-specific segment copy number predictions, obtained from 3rd-party tools (see [segments docs](Segments.md#converting-to-rck-format-from-clone--and-allele-specific-inference-tools) for more details). 17 | * `--adjacencies` (unlabeled) novel adjacencies (aka SVs) obtained from 3rd-party tools (see [adjacencies docs](Adjacencies.md#converting-to-rck-format-from-sv-detection-tools) for more details). 18 | 19 | Both inputs must be i the RCK format (refer to [segments](Segments.md#rck-segments-format) and [adjacencies](Adjacencies.md#rck-adjacency-format) docs on the formatting issues.) 20 | 21 | #### Optional 22 | 23 | * `--adjacency-groups` - Adjacencies groups (see [adjacencies groups docs](AdjacencyGroups.md) for more details). 24 | * `--clone-ids` - a comma-separated list of clone ids (as present in the `scnt` input). 
25 | * **advanced** `--telomere-positions` and `--telomere-segments` - Telomeres (either via exact locations, or via segments, for which all spanned extremities will be considered as possible additional telomeres) 26 | * **advanced** `--fragments` - Fragments that span segments. Only works properly if no preprocessing on input `scnt` is performed. 27 | 28 | ### Running RCK 29 | 30 | Running RCK inference algorithm is achieved through the `rck` executable (which is automatically added to your `PATH` with RCK installation). 31 | 32 | #### preprocessing options 33 | When running RCK, a lot of input preprocessing options, that are achieved via RCK utilities. 34 | 35 | All preprocessing can be turned off via the `--no-pre` flag, but this is an advanced option, use with caution. 36 | 37 | All the `--pre-scnt-xxx` flags refer to `--scnt` input clone- and allele-specific segment copy number values (in RCK format) preprocessing, that is similar to the `refine` command on the `rck-scnt-process` tool. 38 | 39 | All the `--pre-scnb-xxx` flags refer to creating copy number boundaries for the inferred copy number values. 40 | By default the strategy for obtaining the copy number boundaries is the uniform min-max one, where regardless of the input clone- and allele-specific copy number, the lower bound is set to the `--pre-scnb-uniform-min` value (default 0) and the upper is set to the `--pre-scnb-uniform-max` value (default 10). 41 | When working with genomes with known highly amplified segments, one can think about altering the default value for the `--pre-scnb-uniform-max`. 42 | 43 | Preprocessing of the input adjacencies concerns the reciprocality adjustments, and is similar to the `rck-adj-process reciprocal` command and the `--pre-adj-xxx` option mirror those of the `rck-adj-process reciprocal`. 44 | Adjacency preprocessing can be turned off by specifying the `--pre-no-adj`. 
45 | 46 | #### running options 47 | 48 | One of the main arguments in running `rck` is the `--workdir` option, specifying the working directory in which the three following directories are created: 49 | * `raw_input` - contains exact copies of the input files 50 | * `input` - contains fully preprocessed data 51 | * `output` - contains inference results from the RCK algorithm 52 | 53 | Running options for `rck` start with the `--run-` prefix. 54 | Running RCK without actually executing the inference algorithm can be achieved by using the `--no-run` flag. 55 | This will prevent the actual gurobi based ilp solving and respective karyotype inference, but will preprocess (unless disabled) of all the input, and putting the preprocessed data into the `workdir/input` directory. 56 | 57 | The `--run-g-` are the flags corresponding to setting Gurobi related options: 58 | * `--run-g-mip-gap` - the gap between the best bound and best objective, after which the Gurobi solver will stop crunching numbers (default: 0.015, or 1.5% difference) 59 | * `--run-g-time-limit` - the maximum time (in seconds) for gurobi to run, before stopping execution and taking the current best objective as the result (default: 28800, aka 8 hours) 60 | * `--run-g-threads` - number of threads gurobi will use (deault: 4) 61 | * `--run-g-allow-interrupted` - allow for gurobi run to be interrupted and still use the best obtained objective for the inference result 62 | 63 | Other flags: 64 | * `--run-nas-fp` - default False Positive upper bound (i.e., at most a `--run-nas-fp` fraction of input novel adjacencies can be *not* used in the inferred karyotypes). Default is 0.1 65 | * `--run-group-m-default-fp` - default False Positive values for *molecule* adjacencies groups (unless explicitly specified in with the `fp` value in the `extra` field). Default is 0.1 66 | * `--run-segment-length-attr` - an choice based attribute that is used to get the segments length. 
Default is `length_100` which means that for every segment of length `l` an `ceil(l/100)` value is used in the inference minimization. 67 | 68 | 69 | ### Examples 70 | 71 | The following command runs RCK inference on the clone- and allele-specific segment copy number tensor (stored in the `input.rck.scnt.tsv`) 72 | -------------------------------------------------------------------------------- /docs/img/RCK_Overview_vertical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/docs/img/RCK_Overview_vertical.png -------------------------------------------------------------------------------- /rck/__init__.py: -------------------------------------------------------------------------------- 1 | version = "1.1.0" 2 | -------------------------------------------------------------------------------- /rck/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/core/__init__.py -------------------------------------------------------------------------------- /rck/core/process.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from rck.core.io import EXTERNAL_NA_ID 4 | 5 | 6 | class ClusteringStrategy(Enum): 7 | IterativeSlider = 0 8 | 9 | 10 | def positions_aligned(segments_positions, other_positions): 11 | segments_positions_ids = {p.stable_id_non_hap for p in segments_positions} 12 | other_positions_ids = {p.stable_id_non_hap for p in other_positions} 13 | result = other_positions_ids <= segments_positions_ids 14 | return result 15 | 16 | 17 | def adj_groups_concur(adj_groups, adjacencies): 18 | nas_ids = {na.extra.get(EXTERNAL_NA_ID, na.idx) for na in adjacencies} 19 | for group in adj_groups: 20 | for aid in group.adjacencies_ids: 21 | if aid not in 
nas_ids: 22 | return False 23 | return True 24 | -------------------------------------------------------------------------------- /rck/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/utils/__init__.py -------------------------------------------------------------------------------- /rck/utils/adj/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/utils/adj/__init__.py -------------------------------------------------------------------------------- /rck/utils/adj/adjacency_group_inference.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | 4 | import pysam 5 | 6 | from rck.core.io import EXTERNAL_NA_ID, AG_LABELING 7 | from rck.core.structures import AdjacencyGroup, AdjacencyGroupType, Strand 8 | 9 | 10 | def infer_sniffles_molecule_groups(adjacencies, extra_rnames_field="rnames", gid_suffix=""): 11 | reads_to_adjacencies_ids = defaultdict(set) 12 | for adj in adjacencies: 13 | read_names = adj.extra.get(extra_rnames_field, "").split(",") 14 | if len(read_names) == 1 and len(read_names[0]) == 0: 15 | continue 16 | for read_name in read_names: 17 | reads_to_adjacencies_ids[read_name].add(adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased)) 18 | result = [] 19 | for cnt, (read_name, aids) in enumerate(reads_to_adjacencies_ids.items()): 20 | if len(aids) < 2: 21 | continue 22 | extra = {"source": read_name} 23 | gid = str(cnt) 24 | if len(gid_suffix) > 0: 25 | gid += "_" + gid_suffix 26 | ag = AdjacencyGroup(gid=gid, aids=list(aids), group_type=AdjacencyGroupType.MOLECULE, extra=extra) 27 | result.append(ag) 28 | return result 29 | 30 | 31 | def infer_short_nas_labeling_groups(adjacencies, 
gid_suffix="", max_size=1000, allow_intermediate_same=False, allow_intermediate_tra=False, 32 | allow_inv_signatures=True): 33 | positions_to_adjacencies = defaultdict(list) 34 | positions_by_chrs = defaultdict(list) 35 | result = [] 36 | for adj in adjacencies: 37 | p1, p2 = adj.position1, adj.position2 38 | positions_to_adjacencies[p1].append(adj) 39 | positions_to_adjacencies[p2].append(adj) 40 | positions_by_chrs[p1.chromosome].append(p1) 41 | positions_by_chrs[p2.chromosome].append(p2) 42 | positions_by_chr_to_index = {} 43 | for chr_name in list(positions_by_chrs.keys()): 44 | positions_by_chrs[chr_name] = sorted(positions_by_chrs[chr_name], key=lambda p: (p.coordinate, p.strand)) 45 | positions_by_chr_to_index[chr_name] = {p: cnt for cnt, p in enumerate(positions_by_chrs[chr_name])} 46 | processed_adj_ids = set() 47 | cnt = 0 48 | for adj in adjacencies: 49 | aid = adj.stable_id_non_phased 50 | if aid in processed_adj_ids: 51 | continue 52 | p1, p2 = adj.position1, adj.position2 53 | p1_chr, p2_chr = p1.chromosome, p2.chromosome 54 | if p1_chr != p2_chr: 55 | continue 56 | if not allow_inv_signatures and adj.position1.strand == adj.position2.strand: 57 | continue 58 | adj_size = adj.distance_non_hap 59 | if adj_size > max_size: 60 | continue 61 | positions = positions_by_chrs[p1_chr] 62 | p1_index, p2_index = positions_by_chr_to_index[p1_chr][p1], positions_by_chr_to_index[p1_chr][p2] 63 | aid = adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased) 64 | aids = [aid, aid] 65 | gid = str(cnt) 66 | if len(gid_suffix) > 0: 67 | gid += "_" + gid_suffix 68 | extra = {AG_LABELING: [0, 1]} 69 | ag = AdjacencyGroup(gid=gid, aids=aids, group_type=AdjacencyGroupType.LABELING, extra=extra) 70 | if abs(p1_index - p2_index) == 1: 71 | result.append(ag) 72 | cnt += 1 73 | else: 74 | if not (allow_intermediate_same or allow_intermediate_tra): 75 | continue 76 | intermediate_indexes = list(range(p1_index + 1, p2_index)) 77 | has_same = False 78 | has_tra = False 79 | allow 
= False 80 | for index in intermediate_indexes: 81 | position = positions[index] 82 | adjs = positions_to_adjacencies[position] 83 | for adj in adjs: 84 | has_same |= adj.position1.chromosome == adj.position2.chromosome 85 | has_tra |= adj.position1.chromosome != adj.position2.chromosome 86 | allow |= has_same and allow_intermediate_same 87 | allow |= has_tra and allow_intermediate_tra 88 | if allow: 89 | result.append(ag) 90 | cnt += 1 91 | processed_adj_ids.add(adj.stable_id_non_phased) 92 | return result 93 | 94 | 95 | def get_mode_str(format="bam", input=False): 96 | result = "r" if input else "w" 97 | if format == "bam": 98 | result += "b" 99 | elif format == "cram": 100 | result += "c" 101 | return result 102 | 103 | 104 | def get_labeling_groups(read_alignments, read_adjacencies, strategy="skip", delta=500, neighbour_selection="first"): 105 | result = [] 106 | read_alignments = sorted(read_alignments, key=lambda e: (e.query_alignment_start, e.query_alignment_end)) 107 | processed_positions = set() 108 | positions_by_chrs = defaultdict(list) 109 | positions_to_alignments = defaultdict(list) 110 | alignments_to_positions = defaultdict(list) 111 | positions_to_adjacencies = defaultdict(list) 112 | for adj in read_adjacencies: 113 | p1 = adj.position1 114 | p2 = adj.position2 115 | positions_by_chrs[p1.chromosome].append(p1) 116 | positions_by_chrs[p2.chromosome].append(p2) 117 | for alignment in read_alignments: 118 | for p in [p1, p2]: 119 | if p.chromosome == alignment.reference_name and \ 120 | (alignment.reference_start - delta <= p.coordinate <= alignment.reference_end + delta): 121 | positions_to_alignments[p].append(alignment) 122 | alignments_to_positions[alignment].append(p) 123 | positions_to_adjacencies[p1].append(adj) 124 | positions_to_adjacencies[p2].append(adj) 125 | for alignment in alignments_to_positions.keys(): 126 | alignments_to_positions[alignment] = sorted(alignments_to_positions[alignment], key=lambda p: (p.coordinate, p.strand)) 127 | 
for chr_name in list(positions_by_chrs.keys()): 128 | positions_by_chrs[chr_name] = sorted(positions_by_chrs[chr_name], key=lambda p: (p.coordinate, p.strand)) 129 | cnt = 0 130 | for adj in read_adjacencies: 131 | aid = adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased) 132 | for p in [adj.position1, adj.position2]: 133 | if p in processed_positions: 134 | continue 135 | processed_positions.add(p) 136 | alignments = positions_to_alignments[p] 137 | for alignment in alignments: 138 | positions_on_alignment = alignments_to_positions[alignment] 139 | p_index = positions_on_alignment.index(p) 140 | direction_neighbours = positions_on_alignment[:p_index] if p.strand == Strand.FORWARD else positions_on_alignment[p_index + 1:] 141 | if len(direction_neighbours) == 0: 142 | continue 143 | ordered_neighbours = direction_neighbours if p.strand == Strand.REVERSE else direction_neighbours[::-1] 144 | neighbour = None 145 | for candidate in ordered_neighbours: 146 | if candidate.strand != p.strand: 147 | neighbour = candidate 148 | if neighbour_selection == "first": 149 | break 150 | if neighbour is None: 151 | continue 152 | # neighbour = direction_neighbours[-1] if p.strand == Strand.FORWARD else direction_neighbours[0] 153 | processed_positions.add(neighbour) 154 | neighbour_ids = [adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased) for adj in positions_to_adjacencies[neighbour]] 155 | adj_index = 0 if p == adj.position1 else 1 156 | neighbour_indexes = [0 if neighbour == adj.position1 else 1 for adj in positions_to_adjacencies[neighbour]] 157 | extra = { 158 | "alignment": alignment.query_name, 159 | AG_LABELING: [adj_index] + neighbour_indexes 160 | } 161 | ag = AdjacencyGroup(gid=cnt, aids=[aid] + neighbour_ids, group_type=AdjacencyGroupType.LABELING, extra=extra) 162 | result.append(ag) 163 | cnt += 1 164 | break 165 | return result 166 | 167 | 168 | def infer_alignment_labeling_groups(adjacencies, alignment_file_name, alignment_format="bam", 169 | 
extra_rnames_field="rnames", gid_suffix="", inconsistent_traversal_strategy="skip"): 170 | result = [] 171 | reads_to_adjacencies_ids = defaultdict(set) 172 | adjacencies_by_aids = {} 173 | for adj in adjacencies: 174 | aid = adj.stable_id_non_phased 175 | read_names = adj.extra.get(extra_rnames_field, "").split(",") 176 | if len(read_names) == 1 and len(read_names[0]) == 0: 177 | continue 178 | for read_name in read_names: 179 | reads_to_adjacencies_ids[read_name].add(aid) 180 | adjacencies_by_aids[aid] = adj 181 | mode = get_mode_str(format=alignment_format, input=True) 182 | current_read_name = None 183 | current_entries = [] 184 | cnt = 0 185 | alignment_1k_counter = 0 186 | with pysam.AlignmentFile(alignment_file_name, mode) as i_stream: 187 | if "SO:queryname" not in i_stream.text: 188 | raise ValueError("Input alignment file needs to be sorted by read (i.e., query) name. It is not.") 189 | for alignment_cnt, entry in enumerate(i_stream): 190 | if alignment_cnt / 1000 >= alignment_1k_counter: 191 | alignment_1k_counter += 1 192 | if entry.qname != current_read_name: 193 | if len(current_entries) > 0 and current_read_name in reads_to_adjacencies_ids: 194 | adjacencies = [adjacencies_by_aids[aid] for aid in reads_to_adjacencies_ids[current_read_name]] 195 | groups = get_labeling_groups(read_alignments=current_entries, read_adjacencies=adjacencies, strategy=inconsistent_traversal_strategy) 196 | for group in groups: 197 | gid = str(cnt) 198 | if len(gid_suffix) > 0: 199 | gid += "_" + gid_suffix 200 | group.gid = gid 201 | cnt += 1 202 | result.extend(groups) 203 | current_read_name = entry.qname 204 | current_entries = [entry] 205 | else: 206 | current_entries.append(entry) 207 | if len(current_entries) > 0 and current_read_name in reads_to_adjacencies_ids: 208 | adjacencies = [adjacencies_by_aids[aid] for aid in reads_to_adjacencies_ids[current_read_name]] 209 | groups = get_labeling_groups(read_alignments=current_entries, read_adjacencies=adjacencies, 
strategy=inconsistent_traversal_strategy) 210 | for group in groups: 211 | gid = str(cnt) 212 | if len(gid_suffix) > 0: 213 | gid += "_" + gid_suffix 214 | group.gid = gid 215 | cnt += 1 216 | result.extend(groups) 217 | return result 218 | 219 | 220 | def filter_alignment(adjacencies, alignment_file_name, output_alignment_file_name, alignment_format="bam", extra_rnames_field="rnames", output_alignment_format="bam"): 221 | all_read_names = set() 222 | for adj in adjacencies: 223 | read_names = adj.extra.get(extra_rnames_field, "").split(",") 224 | if len(read_names) == 1 and len(read_names[0]) == 0: 225 | continue 226 | for read_name in read_names: 227 | all_read_names.add(read_name) 228 | i_mode = get_mode_str(format=alignment_format, input=True) 229 | o_mode = get_mode_str(format=output_alignment_format, input=False) 230 | with pysam.AlignmentFile(alignment_file_name, i_mode) as i_stream: 231 | with pysam.AlignmentFile(output_alignment_file_name, o_mode, template=i_stream) as o_stream: 232 | for entry in i_stream: 233 | if entry.qname in all_read_names: 234 | o_stream.write(entry) 235 | -------------------------------------------------------------------------------- /rck/utils/adj/adjacency_group_process.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from copy import deepcopy 3 | 4 | import networkx as nx 5 | 6 | from rck.core.structures import AdjacencyGroup, AdjacencyGroupType 7 | from rck.core.io import AG_LABELING, EXTERNAL_NA_ID 8 | 9 | 10 | def refine_labeling_groups_old(adj_groups, gid_suffix="", retain_source_gids=False, iag=None): 11 | graph = nx.Graph() 12 | entries_to_adj_groups = defaultdict(list) 13 | groups_by_ids = {} 14 | for group in adj_groups: 15 | groups_by_ids[group.gid] = group 16 | entries = [(aid, index) for aid, index in zip(group.adjacencies_ids, group.extra.get(AG_LABELING, []))] 17 | if len(entries) < 2: 18 | continue 19 | for entry in entries: 20 | 
entries_to_adj_groups[entry].append(group) 21 | for l, r in zip(entries[:-1], entries[1:]): 22 | graph.add_edge(l, r) 23 | result = [] 24 | cnt = 0 25 | for cc in nx.connected_component_subgraphs(graph): 26 | entries = list(cc.nodes()) 27 | gids = set() 28 | for entry in entries: 29 | groups = entries_to_adj_groups[entry] 30 | for group in groups: 31 | gids.add(group.gid) 32 | groups = [groups_by_ids[gid] for gid in gids] 33 | aids = [entry[0] for entry in entries] 34 | indexes = [entry[1] for entry in entries] 35 | alignments = list(set(group.extra.get("alignment", "") for group in groups if len(group.extra.get("alignment", "")) > 0)) 36 | extra = { 37 | AG_LABELING: indexes, 38 | } 39 | if len(alignments) > 0: 40 | extra["alignment"] = alignments 41 | if retain_source_gids: 42 | extra["source_gids"] = sorted(gids) 43 | gid = str(cnt) 44 | if len(gid_suffix) > 0: 45 | gid += "_" + gid_suffix 46 | ag = AdjacencyGroup(gid=gid, aids=aids, group_type=AdjacencyGroupType.LABELING, extra=extra) 47 | result.append(ag) 48 | cnt += 1 49 | return result 50 | 51 | 52 | def refine_labeling_groups_without_iag(adj_groups, gid_suffix="", retain_source_gids=False): 53 | graph = nx.Graph() 54 | entries_to_adj_groups = defaultdict(list) 55 | groups_by_ids = {} 56 | for group in adj_groups: 57 | groups_by_ids[group.gid] = group 58 | entries = [(aid, index) for aid, index in zip(group.adjacencies_ids, group.extra.get(AG_LABELING, []))] 59 | if len(entries) < 2: 60 | continue 61 | for entry in entries: 62 | entries_to_adj_groups[entry].append(group) 63 | for l, r in zip(entries[:-1], entries[1:]): 64 | graph.add_edge(l, r) 65 | result = [] 66 | cnt = 0 67 | for cc in nx.connected_component_subgraphs(graph): 68 | entries = list(cc.nodes()) 69 | gids = set() 70 | for entry in entries: 71 | groups = entries_to_adj_groups[entry] 72 | for group in groups: 73 | gids.add(group.gid) 74 | groups = [groups_by_ids[gid] for gid in gids] 75 | aids = [entry[0] for entry in entries] 76 | indexes = 
[entry[1] for entry in entries] 77 | alignments = list(set(group.extra.get("alignment", "") for group in groups if len(group.extra.get("alignment", "")) > 0)) 78 | extra = { 79 | AG_LABELING: indexes, 80 | } 81 | if len(alignments) > 0: 82 | extra["alignment"] = alignments 83 | if retain_source_gids: 84 | extra["source_gids"] = sorted(gids) 85 | gid = str(cnt) 86 | if len(gid_suffix) > 0: 87 | gid += "_" + gid_suffix 88 | ag = AdjacencyGroup(gid=gid, aids=aids, group_type=AdjacencyGroupType.LABELING, extra=extra) 89 | result.append(ag) 90 | cnt += 1 91 | return result 92 | 93 | 94 | def refined_labeling_groups(adj_groups, iag=None, adjacencies=None, gid_suffix="", retain_source_gids=False): 95 | graph = nx.Graph() 96 | if adjacencies is not None: 97 | adjacencies_by_external_ids = {adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased): adj for adj in adjacencies} 98 | else: 99 | adjacencies_by_external_ids = {} 100 | adjacencies_by_positions = defaultdict(list) 101 | groups_by_ids = {} 102 | entries_to_adj_groups = defaultdict(list) 103 | for group in adj_groups: 104 | groups_by_ids[group.gid] = group 105 | internal_entries = [(aid, index) for aid, index in zip(group.adjacencies_ids, group.extra.get(AG_LABELING, []))] 106 | if len(internal_entries) < 2: 107 | continue 108 | entries = [] 109 | if iag is not None and adjacencies is not None: 110 | for aid, index in internal_entries: 111 | adjacency = adjacencies_by_external_ids[aid] 112 | position = adjacency.position1 if index == 0 else adjacency.position2 113 | adjacencies_by_positions[position].append(adjacency) 114 | entries.append(position) 115 | else: 116 | entries = internal_entries 117 | for entry in entries: 118 | entries_to_adj_groups[entry].append(group) 119 | for l, r in zip(entries[:-1], entries[1:]): 120 | graph.add_edge(l, r) 121 | if iag is not None and adjacencies is not None: 122 | l_ref_edges = list(iag.ref_adjacency_edges(nbunch=l, data=False)) 123 | r_ref_edges = 
list(iag.ref_adjacency_edges(nbunch=r, data=False)) 124 | ref_edges = l_ref_edges + r_ref_edges 125 | for (u, v) in ref_edges: 126 | graph.add_edge(u, v) 127 | result = [] 128 | cnt = 0 129 | for cc in nx.connected_component_subgraphs(graph): 130 | internal_entries = list(cc.nodes()) 131 | gids = set() 132 | for entry in internal_entries: 133 | groups = entries_to_adj_groups[entry] 134 | for group in groups: 135 | gids.add(group.gid) 136 | entries = [] 137 | if iag is not None and adjacencies is not None: 138 | for entry in internal_entries: 139 | if entry in adjacencies_by_positions: 140 | for adjacency in adjacencies_by_positions[entry]: 141 | aid = adjacency.extra.get(EXTERNAL_NA_ID, adjacency.stable_id_non_phased) 142 | index = 0 if adjacency.position1 == entry else 1 143 | entries.append((aid, index)) 144 | else: 145 | entries = internal_entries 146 | groups = [groups_by_ids[gid] for gid in gids] 147 | aids = [entry[0] for entry in entries] 148 | indexes = [entry[1] for entry in entries] 149 | alignments = list(set(group.extra.get("alignment", "") for group in groups if len(group.extra.get("alignment", "")) > 0)) 150 | extra = { 151 | AG_LABELING: indexes, 152 | } 153 | if len(alignments) > 0: 154 | extra["alignment"] = alignments 155 | if retain_source_gids: 156 | extra["source_gids"] = sorted(gids) 157 | gid = str(cnt) 158 | if len(gid_suffix) > 0: 159 | gid += "_" + gid_suffix 160 | ag = AdjacencyGroup(gid=gid, aids=aids, group_type=AdjacencyGroupType.LABELING, extra=extra) 161 | result.append(ag) 162 | cnt += 1 163 | return result 164 | 165 | 166 | def projected_groups(groups, adjacencies, adjacencies_by_external_ids=None, gid_suffix=""): 167 | if adjacencies_by_external_ids is None: 168 | adjacencies_by_external_ids = {adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased): adj for adj in adjacencies} 169 | result = [] 170 | for group in groups: 171 | projected = [aid in adjacencies_by_external_ids for aid in group.adjacencies_ids] 172 | aids = [aid for 
aid, allowed in zip(group.adjacencies_ids, projected) if allowed] 173 | if len(aids) == 0: 174 | continue 175 | if group.adjacencies is not None: 176 | adjacencies = [adj for adj, allowed in zip(group.adjacencies, projected) if allowed] 177 | else: 178 | adjacencies = None 179 | ag = deepcopy(group) 180 | ag.adjacencies_ids = aids 181 | ag.adjacencies = adjacencies 182 | if group.group_type == AdjacencyGroupType.LABELING: 183 | if len(aids) < 2: 184 | continue 185 | labeling = [index for index, allowed in zip(group.extra[AG_LABELING], projected) if allowed] 186 | ag.extra[AG_LABELING] = labeling 187 | if not all(projected) and len(gid_suffix) > 0 and group.group_type != AdjacencyGroupType.GENERAL: 188 | ag.gid += "_" + gid_suffix 189 | result.append(ag) 190 | return result 191 | -------------------------------------------------------------------------------- /rck/utils/adj/adjacency_group_stats.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from rck.core.structures import AdjacencyGroup 4 | 5 | 6 | def groups_size_tally(adjacency_groups): 7 | result = defaultdict(int) 8 | for ag in adjacency_groups: 9 | ag: AdjacencyGroup = ag 10 | result[len(ag.adjacencies_ids)] += 1 11 | return result 12 | -------------------------------------------------------------------------------- /rck/utils/adj/analysis.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List 2 | 3 | import networkx as nx 4 | 5 | from rck.core.structures import Adjacency, Position, Strand, AdjacencyType 6 | 7 | 8 | class ComplexRearrSignature(object): 9 | def __init__(self, adjacencies: Iterable[Adjacency], ref_locations: Iterable[Adjacency] = None): 10 | self.adjacencies: List[Adjacency] = list(adjacencies) 11 | self.ref_adjacencies: List[Adjacency] = list(ref_locations) if ref_locations is not None else self.infer_ref_adjacencies(self.adjacencies) 12 | 
self.k = len(self.ref_adjacencies) 13 | 14 | @classmethod 15 | def infer_ref_adjacencies(cls, adjacencies: Iterable[Adjacency]) -> List[Adjacency]: 16 | result = set() 17 | for adjacency in adjacencies: 18 | for p in [adjacency.position1, adjacency.position2]: 19 | pr = Position.get_reciprocal(position=p) 20 | result.add(Adjacency(position1=p, position2=pr, adjacency_type=AdjacencyType.REFERENCE)) 21 | return list(result) 22 | 23 | 24 | def get_complex_rearrangements_signatures(adjacencies: Iterable[Adjacency]) -> Iterable[ComplexRearrSignature]: 25 | cr_graph = nx.MultiGraph() 26 | for adjacency in adjacencies: 27 | p1 = adjacency.position1.get_non_hap_copy() 28 | p2 = adjacency.position2.get_non_hap_copy() 29 | if p1.strand == Strand.REVERSE: 30 | p1 = Position.get_reciprocal(position=p1) 31 | if p2.strand == Strand.REVERSE: 32 | p2 = Position.get_reciprocal(position=p2) 33 | cr_graph.add_edge(p1, p2, adjacency=adjacency) 34 | result: List[ComplexRearrSignature] = [] 35 | for cc in nx.connected_component_subgraphs(cr_graph): 36 | result.append(ComplexRearrSignature(adjacencies=[edge[2]["adjacency"] for edge in cc.edges(data=True)])) 37 | return result 38 | -------------------------------------------------------------------------------- /rck/utils/adj/long_reads.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import defaultdict 3 | import networkx as nx 4 | import pysam 5 | 6 | from rck.core.io import read_adjacencies_from_source 7 | 8 | ##### 9 | 10 | 11 | def get_mode_str(format="bam", input=False): 12 | result = "r" if input else "w" 13 | if format == "bam": 14 | result += "b" 15 | elif format == "cram": 16 | result += "c" 17 | return result 18 | 19 | ##### 20 | 21 | 22 | def extract_long_reads(): 23 | pass 24 | 25 | 26 | def filter_alignment(): 27 | pass 28 | 29 | 30 | def infer_labeling_constraints(rck_nas_source, alignment_file, i_alignment_format, lr_field, min_sv_cnt, 
logger=None): 31 | logger = logger or logging.getLogger('dummy') 32 | logger.info("RCK NAs file object {file_name}".format(file_name=str(rck_nas_source))) 33 | nas = read_adjacencies_from_source(source=rck_nas_source) 34 | reads_to_nas = defaultdict(list) 35 | for na in nas: 36 | reads_str = na.extra.get(lr_field, "") 37 | reads = reads_str.split(",") 38 | for read in reads: 39 | if len(read) == 0: 40 | continue 41 | reads_to_nas[read].append(na) 42 | logger.debug("{reads_cnt} -- number of reads".format(reads_cnt=len(reads_to_nas))) 43 | reads = {read for read in reads_to_nas if len(reads_to_nas[read]) >= min_sv_cnt} 44 | logger.debug("{reads_cnt} -- number of reads that each span {min_sv_cnt}+ NAs".format(reads_cnt=len(reads), min_sv_cnt=min_sv_cnt)) 45 | location_graph = nx.Graph() 46 | mode = get_mode_str(format=i_alignment_format, input=True) 47 | current_read_name = None 48 | current_entries = [] 49 | with pysam.AlignmentFile(alignment_file, mode) as i_stream: 50 | if "SO:queryname" not in i_stream.text: 51 | logger.critical("Input alignment file {alignment_file} is not sorted by read (i.e., query) name".format(alignment_file=alignment_file)) 52 | raise Exception("Input bam file needs to be sorted by read (i.e., query) name") 53 | for entry in i_stream: 54 | if entry.qname != current_read_name: 55 | if len(current_entries) > 0: 56 | reads_novel_adjacencies = reads_to_nas[current_read_name] 57 | add_lr_labeling_constraints(location_graph=location_graph, alignment_entries=current_entries, nas=reads_novel_adjacencies) 58 | current_read_name = entry.qname 59 | current_entries = [entry] 60 | else: 61 | current_entries.append(entry) 62 | # last reads streak has to be processed as well 63 | if current_read_name in reads_to_nas: 64 | pass 65 | 66 | 67 | def add_lr_labeling_constraints(location_graph, alignment_entries, nas): 68 | entries = sorted(alignment_entries, key=lambda e: (e.query_alignment_start, e.query_alignment_end)) 69 | internal = len(nas) != len(entries) 
- 1 70 | 71 | pass 72 | 73 | 74 | def label_constraints_combining(): 75 | pass 76 | 77 | -------------------------------------------------------------------------------- /rck/utils/adj/main_chrs.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | 11 12 | 12 13 | 13 14 | 14 15 | 15 16 | 16 17 | 17 18 | 18 19 | 19 20 | 20 21 | 21 22 | 22 23 | X 24 | Y -------------------------------------------------------------------------------- /rck/utils/adj/rck_adg_infer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import os 4 | import sys 5 | from collections import defaultdict 6 | 7 | current_file_level = 3 8 | current_dir = os.path.dirname(os.path.realpath(__file__)) 9 | for _ in range(current_file_level): 10 | current_dir = os.path.dirname(current_dir) 11 | sys.path.append(current_dir) 12 | 13 | import rck 14 | from rck.core.io import get_standard_logger_from_args, read_adjacencies_from_source, write_adjacency_groups_to_destination, get_logging_cli_parser 15 | from rck.utils.adj.adjacency_group_inference import infer_sniffles_molecule_groups, infer_short_nas_labeling_groups, infer_alignment_labeling_groups, filter_alignment 16 | from rck.utils.adj.adjacency_group_process import refined_labeling_groups 17 | 18 | 19 | def main(): 20 | parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-GROUPS-infer") 21 | parser.add_argument("--version", action="version", version=rck.version) 22 | cli_logging_parser = get_logging_cli_parser() 23 | 24 | subparsers = parser.add_subparsers(title="commands", dest="command") 25 | subparsers.required = True 26 | ### 27 | sniffles_molecule_group_parser = subparsers.add_parser("sniffles-m", parents=[cli_logging_parser]) 28 | sniffles_molecule_group_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin) 29 | 
sniffles_molecule_group_parser.add_argument("--i-separator", default="\t") 30 | sniffles_molecule_group_parser.add_argument("--i-extra-separator", default=";") 31 | sniffles_molecule_group_parser.add_argument("--extra-rnames-field", default="rnames") 32 | sniffles_molecule_group_parser.add_argument("--fp", type=float, default=0.5) 33 | sniffles_molecule_group_parser.add_argument("--gid-suffix", dest="gid_suffix", default="sniffles-M") 34 | sniffles_molecule_group_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout) 35 | sniffles_molecule_group_parser.add_argument("--o-separator", default="\t") 36 | sniffles_molecule_group_parser.add_argument("--o-aids-separator", default=",") 37 | sniffles_molecule_group_parser.add_argument("--o-extra-separator", default=";") 38 | ### 39 | short_nas_labeling_group_parser = subparsers.add_parser("short-l", parents=[cli_logging_parser]) 40 | short_nas_labeling_group_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin) 41 | short_nas_labeling_group_parser.add_argument("--i-separator", default="\t") 42 | short_nas_labeling_group_parser.add_argument("--i-extra-separator", default=";") 43 | short_nas_labeling_group_parser.add_argument("--max-size", type=int, default=50000000) 44 | short_nas_labeling_group_parser.add_argument("--allow-intermediate-same", action="store_true", dest="allow_intermediate_same") 45 | short_nas_labeling_group_parser.add_argument("--allow-intermediate-tra", action="store_true", dest="allow_intermediate_tra") 46 | short_nas_labeling_group_parser.add_argument("--no-inv-signatures", action="store_false", dest="allow_inv_signature") 47 | short_nas_labeling_group_parser.add_argument("--no-refine", action="store_false", dest="refine") 48 | short_nas_labeling_group_parser.add_argument("--fp", type=float, default=1) 49 | short_nas_labeling_group_parser.add_argument("--gid-suffix", dest="gid_suffix", default="short-nas-L") 50 | 
short_nas_labeling_group_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout) 51 | short_nas_labeling_group_parser.add_argument("--o-separator", default="\t") 52 | short_nas_labeling_group_parser.add_argument("--o-aids-separator", default=",") 53 | short_nas_labeling_group_parser.add_argument("--o-extra-separator", default=";") 54 | ### 55 | sniffles_labeling_group_parser = subparsers.add_parser("sniffles-l", parents=[cli_logging_parser]) 56 | sniffles_labeling_group_parser.add_argument("--rck-adj", type=argparse.FileType("rt"), required=True) 57 | sniffles_labeling_group_parser.add_argument("--i-separator", default="\t") 58 | sniffles_labeling_group_parser.add_argument("--i-extra-separator", default=";") 59 | sniffles_labeling_group_parser.add_argument("--alignment", required=True) 60 | sniffles_labeling_group_parser.add_argument("--alignment-format", choices=["sam", "bam", "cram"], default="bam") 61 | sniffles_labeling_group_parser.add_argument("--extra-rnames-field", default="rnames") 62 | sniffles_labeling_group_parser.add_argument("--no-refine", action="store_false", dest="refine") 63 | sniffles_labeling_group_parser.add_argument("--fp", type=float, default=1) 64 | sniffles_labeling_group_parser.add_argument("--gid-suffix", default="sniffles-L") 65 | sniffles_labeling_group_parser.add_argument("-o", "--output", default=sys.stdout, type=argparse.FileType("wt")) 66 | sniffles_labeling_group_parser.add_argument("--o-separator", default="\t") 67 | sniffles_labeling_group_parser.add_argument("--o-aids-separator", default=",") 68 | sniffles_labeling_group_parser.add_argument("--o-extra-separator", default=";") 69 | ### 70 | filter_alignment_parser = subparsers.add_parser("filter-alignment", parents=[cli_logging_parser]) 71 | filter_alignment_parser.add_argument("--rck-adj", type=argparse.FileType("rt"), required=True) 72 | filter_alignment_parser.add_argument("--i-separator", default="\t") 73 | 
filter_alignment_parser.add_argument("--i-extra-separator", default=";") 74 | filter_alignment_parser.add_argument("--extra-rnames-field", default="rnames") 75 | filter_alignment_parser.add_argument("--alignment", required=True) 76 | filter_alignment_parser.add_argument("--alignment-format", choices=["sam", "bam", "cram"], default="bam") 77 | filter_alignment_parser.add_argument("-o", "--output", required=True) 78 | filter_alignment_parser.add_argument("--output-format", choices=["sam", "bam", "cram"], default="bam") 79 | ### 80 | args = parser.parse_args() 81 | logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-ADJ-GROUPS-infer") 82 | if args.command == "sniffles-m": 83 | logger.info("Inferring molecule adjacency groups from adjacencies with Sniffles RNAMES support extra info.") 84 | logger.info("Reading adjacencies from {file}".format(file=args.rck_adj)) 85 | adjacencies = read_adjacencies_from_source(source=args.rck_adj, separator=args.i_separator, extra_separator=args.i_extra_separator) 86 | logger.info("Inferring molecule adjacency groups from read adjacencies") 87 | adj_groups = infer_sniffles_molecule_groups(adjacencies=adjacencies, extra_rnames_field=args.extra_rnames_field, gid_suffix=args.gid_suffix) 88 | logger.info("Inferred {cnt} molecule adjacency groups".format(cnt=len(adj_groups))) 89 | logger.info("Writing inferred molecule adjacency groups to {file}".format(file=args.output)) 90 | write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups, 91 | separator=args.o_separator, extra_separator=args.o_extra_separator, aids_separator=args.o_aids_separator, 92 | extra_fill="") 93 | elif args.command == "short-l": 94 | logger.info("Inferring labeling adjacency groups from adjacencies from adjacencies.") 95 | logger.info("Reading adjacencies from {file}".format(file=args.rck_adj)) 96 | adjacencies = read_adjacencies_from_source(source=args.rck_adj, separator=args.i_separator, 
extra_separator=args.i_extra_separator) 97 | logger.info("Inferring labeling adjacency groups from read adjacencies") 98 | adj_groups = infer_short_nas_labeling_groups(adjacencies=adjacencies, gid_suffix=args.gid_suffix, max_size=args.max_size, 99 | allow_intermediate_same=args.allow_intermediate_same, 100 | allow_intermediate_tra=args.allow_intermediate_tra, allow_inv_signatures=args.allow_inv_signature) 101 | logger.info("Inferred {cnt} labeling adjacency groups".format(cnt=len(adj_groups))) 102 | if args.refine: 103 | logger.info("Refining inferred labeling adjacency groups") 104 | adj_groups = refined_labeling_groups(adj_groups=adj_groups, gid_suffix=args.gid_suffix) 105 | logger.info("A total of {cnt} refined labeling adjacency groups remain".format(cnt=len(adj_groups))) 106 | logger.info("Writing inferred labeling adjacency group s to {file}".format(file=args.output)) 107 | write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups, 108 | separator=args.o_separator, aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator, 109 | extra_fill="") 110 | elif args.command == "sniffles-l": 111 | logger.info("Inferring labeling adjacency groups from adjacencies, and their reads-of-origin alignments") 112 | logger.info("Reading adjacencies from {file}".format(file=args.rck_adj)) 113 | adjacencies = read_adjacencies_from_source(source=args.rck_adj, extra_separator=args.i_extra_separator, separator=args.i_separator) 114 | logger.info("Inferring labeling adjacency groups from read adjacencies and their reads-of-origin alignments") 115 | adj_groups = infer_alignment_labeling_groups(adjacencies=adjacencies, alignment_file_name=args.alignment, alignment_format=args.alignment_format, 116 | extra_rnames_field=args.extra_rnames_field, gid_suffix=args.gid_suffix) 117 | logger.info("Inferred {cnt} labeling adjacency groups. 
There can be many duplicates, refinement shall take care of it.".format(cnt=len(adj_groups))) 118 | if args.refine: 119 | logger.info("Refining inferred labeling adjacency groups") 120 | adj_groups = refined_labeling_groups(adj_groups=adj_groups, gid_suffix=args.gid_suffix) 121 | logger.info("A total of {cnt} refined labeling adjacency groups remain".format(cnt=len(adj_groups))) 122 | logger.info("Writing inferred labeling adjacency group s to {file}".format(file=args.output)) 123 | write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups, 124 | separator=args.o_separator, aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator, 125 | extra_fill="") 126 | elif args.command == "filter-alignment": 127 | logger.info("Filtering input read alignment to retain only reads mentioned as supporting adjacencies from the input") 128 | logger.info("Reading adjacencies from {file}".format(file=args.rck_adj)) 129 | adjacencies = read_adjacencies_from_source(source=args.rck_adj, extra_separator=args.i_extra_separator, separator=args.i_separator) 130 | logger.info("Filtering input alignment form file {file} and writing result in {o_file}".format(file=args.alignment, o_file=args.output)) 131 | filter_alignment(adjacencies=adjacencies, alignment_file_name=args.alignment, alignment_format=args.alignment_format, extra_rnames_field=args.extra_rnames_field, 132 | output_alignment_file_name=args.output, output_alignment_format=args.output_format) 133 | exit(0) 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /rck/utils/adj/rck_adg_process.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | 4 | import os 5 | import sys 6 | 7 | current_file_level = 3 8 | current_dir = os.path.dirname(os.path.realpath(__file__)) 9 | for _ in range(current_file_level): 10 | 
current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)

import rck
from rck.core.io import get_logging_cli_parser, get_standard_logger_from_args, stream_adjacency_groups_from_source, write_adjacency_groups_to_destination, \
    read_adjacency_groups_from_source, read_adjacencies_from_source
from rck.core.structures import AdjacencyGroupType
from rck.utils.adj.adjacency_group_process import refined_labeling_groups, projected_groups


def main():
    """CLI entry point: `cat`, `refine`, and `project` operations over RCK adjacency-group files."""
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-GROUPS-process")
    parser.add_argument("--version", action="version", version=rck.version)
    cli_logging_parser = get_logging_cli_parser()

    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ###
    cat_parser = subparsers.add_parser("cat", parents=[cli_logging_parser])
    cat_parser.add_argument("rck_adg", type=argparse.FileType("rt"), nargs="+", default=[sys.stdin])
    cat_parser.add_argument("--i-separator", default="\t")
    cat_parser.add_argument("--i-extra-separator", default=";")
    cat_parser.add_argument("--i-aids-separator", default=",")
    cat_parser.add_argument("--enforce-unique-ids", action="store_true", dest="enforce_unique_ids")
    cat_parser.add_argument("--id-collision-strategy", choices=["skip", "error"], default="error")
    cat_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    cat_parser.add_argument("--o-separator", default="\t")
    cat_parser.add_argument("--o-aids-separator", default=",")
    cat_parser.add_argument("--o-extra-separator", default=";")
    ###
    refine_parser = subparsers.add_parser("refine", parents=[cli_logging_parser])
    refine_parser.add_argument("rck_adg", nargs="?", type=argparse.FileType("rt"), default=sys.stdin)
    refine_parser.add_argument("--i-separator", default="\t")
    refine_parser.add_argument("--i-extra-separator", default=";")
    refine_parser.add_argument("--i-aids-separator", default=",")
    refine_parser.add_argument("--gid-suffix", default="refined")
    refine_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    refine_parser.add_argument("--o-separator", default="\t")
    refine_parser.add_argument("--o-aids-separator", default=",")
    refine_parser.add_argument("--o-extra-separator", default=";")
    ###
    project_parser = subparsers.add_parser("project", parents=[cli_logging_parser])
    # BUGFIX: without nargs="?" this positional was mandatory and the sys.stdin
    # default could never apply; `refine` already used nargs="?" -- made consistent.
    project_parser.add_argument("rck_adg", nargs="?", type=argparse.FileType("rt"), default=sys.stdin)
    project_parser.add_argument("--i-separator", default="\t")
    project_parser.add_argument("--i-extra-separator", default=";")
    project_parser.add_argument("--i-aids-separator", default=",")
    project_parser.add_argument("--adjacencies", required=True, type=argparse.FileType("rt"))
    project_parser.add_argument("--adj-separator", default="\t")
    project_parser.add_argument("--adj-extra-separator", default=";")
    project_parser.add_argument("--gid-suffix", default="projected")
    project_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    project_parser.add_argument("--o-separator", default="\t")
    project_parser.add_argument("--o-aids-separator", default=",")
    project_parser.add_argument("--o-extra-separator", default=";")
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-ADJ-GROUPS-process")
    if args.command == "cat":
        adj_groups = itertools.chain(*(stream_adjacency_groups_from_source(source=adj_group_source, separator=args.i_separator,
                                                                           aids_separator=args.i_aids_separator, extra_separator=args.i_extra_separator)
                                       for adj_group_source in args.rck_adg))
        if args.enforce_unique_ids:
            # BUGFIX: this branch used to be `pass`, so --enforce-unique-ids silently
            # did nothing.  Deduplicate by group id honoring --id-collision-strategy,
            # mirroring the `cat` command of rck-adj-process.
            seen_gids = set()
            unique_groups = []
            for group in adj_groups:
                gid = group.gid  # assumes AdjacencyGroup exposes a `gid` identifier -- TODO confirm
                if gid in seen_gids:
                    logger.debug("Adjacency group id {gid} has been encountered more than once".format(gid=gid))
                    if args.id_collision_strategy == "skip":
                        continue
                    elif args.id_collision_strategy == "error":
                        raise ValueError("More than one adjacency group with id {gid}".format(gid=gid))
                seen_gids.add(gid)
                unique_groups.append(group)
            adj_groups = unique_groups
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups, separator=args.o_separator,
                                              aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator)
    elif args.command == "refine":
        logger.info("Refining input adjacency groups")
        logger.info("Reading adjacency groups from {file}".format(file=args.rck_adg))
        adg_groups = read_adjacency_groups_from_source(source=args.rck_adg, separator=args.i_separator,
                                                       extra_separator=args.i_extra_separator, aids_separator=args.i_aids_separator)
        logger.info("A total of {cnt} adjacency groups has been read".format(cnt=len(adg_groups)))
        molecule_groups = [ag for ag in adg_groups if ag.group_type == AdjacencyGroupType.MOLECULE]
        logger.info("A total of {cnt} molecule adjacency groups has been read".format(cnt=len(molecule_groups)))
        labeling_groups = [ag for ag in adg_groups if ag.group_type == AdjacencyGroupType.LABELING]
        logger.info("A total of {cnt} labeling adjacency groups has been read".format(cnt=len(labeling_groups)))
        general_groups = [ag for ag in adg_groups if ag.group_type == AdjacencyGroupType.GENERAL]
        logger.info("A total of {cnt} general adjacency groups has been read".format(cnt=len(general_groups)))
        logger.info("Refining molecule adjacency groups")
        # molecule groups are currently passed through unchanged (no refinement implemented)
        refined_molecule_groups = molecule_groups
        logger.info("A total of {cnt} refined molecule adjacency groups remains".format(cnt=len(refined_molecule_groups)))
        logger.info("Refining labeling adjacency groups")
        r_labeling_groups = refined_labeling_groups(adj_groups=labeling_groups, gid_suffix="" if len(args.gid_suffix) == 0 else args.gid_suffix + "-L",
                                                    retain_source_gids=True)
        logger.info("A total of {cnt} refined labeling adjacency groups remains".format(cnt=len(r_labeling_groups)))
        logger.info("Refining general adjacency groups")
        # general groups are currently passed through unchanged (no refinement implemented)
        refined_general_groups = general_groups
        logger.info("A total of {cnt} refined general adjacency groups remains".format(cnt=len(refined_general_groups)))
        adj_groups = itertools.chain(refined_molecule_groups, r_labeling_groups, refined_general_groups)
        logger.info("Writing refined adjacency groups to {file}".format(file=args.output))
        # BUGFIX: this branch dropped --o-extra-separator on output, unlike `cat`
        # and `project`; extra_separator is now passed through.
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups, separator=args.o_separator,
                                              aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator)
    elif args.command == "project":
        logger.info("Projecting input adjacency groups based on input adjacencies")
        logger.info("Reading adjacency groups from {file}".format(file=args.rck_adg))
        adg_groups = read_adjacency_groups_from_source(source=args.rck_adg, separator=args.i_separator, extra_separator=args.i_extra_separator,
                                                       aids_separator=args.i_aids_separator)
        logger.info("A total of {cnt} adjacency groups has been read".format(cnt=len(adg_groups)))
        adjacencies = read_adjacencies_from_source(source=args.adjacencies, separator=args.adj_separator, extra_separator=args.adj_extra_separator)
        p_groups = projected_groups(groups=adg_groups, adjacencies=adjacencies, gid_suffix=args.gid_suffix)
        logger.info("A total of {cnt} projected groups remained".format(cnt=len(p_groups)))
        logger.info("Writing projected adjacency groups to {file}".format(file=args.output))
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=p_groups, separator=args.o_separator,
                                              aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator)


if __name__ == "__main__":
    main()
import argparse
import sys

import rck
from rck.core.io import get_logging_cli_parser, read_adjacency_groups_from_source
from rck.core.structures import AdjacencyGroupType
from rck.utils.adj.adjacency_group_stats import groups_size_tally


def main():
    """CLI entry point: print a CSV tally of labeling adjacency-group sizes.

    Sizes strictly below the lower key are pooled into a "<min" row and sizes
    at or above the upper key into a ">=max" row; --min/--max of -1 (the
    default) means "use the observed min/max of the tally".
    """
    cli_logging_parser = get_logging_cli_parser()
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADG-STATS")
    parser.add_argument('--version', action='version', version=rck.version)
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    #######
    labeling_group_size_parser = subparsers.add_parser("size-l", parents=[cli_logging_parser], help="Group size for RCK AdjGROUP in input file")
    labeling_group_size_parser.add_argument("rck_adg", type=argparse.FileType("rt"), nargs="?", default=sys.stdin)
    labeling_group_size_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    labeling_group_size_parser.add_argument("--no-allow-zero-values", action="store_false", dest="allow_zero_values")
    labeling_group_size_parser.add_argument("--min", type=int, default=-1)
    labeling_group_size_parser.add_argument("--max", type=int, default=-1)
    #######
    args = parser.parse_args()
    if args.command == "size-l":
        adj_groups = read_adjacency_groups_from_source(source=args.rck_adg)
        labeling_adg = [ag for ag in adj_groups if ag.group_type == AdjacencyGroupType.LABELING]
        tally = groups_size_tally(adjacency_groups=labeling_adg)
        if not tally:
            # BUGFIX: min()/max() over an empty tally raised ValueError when the
            # input contained no labeling groups; emit no rows instead of crashing.
            return
        min_key, max_key = min(tally.keys()), max(tally.keys())
        if args.max != -1:
            max_key = args.max
        if args.min != -1:
            min_key = args.min
        # everything strictly below min_key is pooled into a single "<min" row
        min_value = 0
        for key in tally:
            if key < min_key:
                min_value += tally[key]
        print("<{min_key}".format(min_key=min_key), min_value, sep=",", file=args.output)
        for value in range(min_key, max_key):
            # with --no-allow-zero-values, sizes absent from the tally are skipped
            if value not in tally and not args.allow_zero_values:
                continue
            print(value, tally.get(value, 0), sep=",", file=args.output)
        # everything at or above max_key is pooled into a single ">=max" row
        max_value = 0
        for key in tally:
            if key >= max_key:
                max_value += tally[key]
        print(">={max_key}".format(max_key=max_key), max_value, sep=",", file=args.output)


if __name__ == "__main__":
    main()
import argparse
import logging
import sys
from collections import defaultdict
import pysam
import networkx as nx

from rck.core.io import read_adjacencies_from_source, get_logging_cli_parser, get_standard_logger_from_args
from rck.utils.adj.long_reads import infer_labeling_constraints


def get_mode_str(format="bam", input=False):
    """Return the pysam open-mode string for a given alignment format and direction."""
    result = "r" if input else "w"
    if format == "bam":
        result += "b"
    elif format == "cram":
        result += "c"
    return result


def get_reads_set_from_source(source):
    """Read a set of read names from `source`, skipping blank lines and '#' comments."""
    reads = set()
    for line in source:
        line = line.strip()
        if len(line) == 0 or line.startswith("#"):
            continue
        reads.add(line)
    return reads


def main():
    """CLI entry point for long-read utilities: read extraction, alignment filtering,
    and labeling-constraint inference/combination."""
    parser = argparse.ArgumentParser()
    logging_parser = get_logging_cli_parser()
    ########
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ########
    lr_extraction_parser = subparsers.add_parser("extract-lr", parents=[logging_parser])
    # BUGFIX: nargs="?" added -- without it the positional is required and the
    # sys.stdin default could never apply.
    lr_extraction_parser.add_argument("rck_nas", type=argparse.FileType("rt"), nargs="?", default=sys.stdin)
    lr_extraction_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    lr_extraction_parser.add_argument("--min-sv-cnt", type=int, default=2)
    lr_extraction_parser.add_argument("--lr-field", default="support_read_names")
    #########
    lr_alignment_filter_parser = subparsers.add_parser("filter-alignment", parents=[logging_parser])
    lr_alignment_filter_parser.add_argument("alignment", nargs="?", type=str, default="-")
    lr_alignment_filter_parser.add_argument("--i-alignment-format", type=str, choices=["bam", "sam", "cram"], default="bam")
    lr_alignment_filter_parser.add_argument("-r", "--reads", type=argparse.FileType("rt"), required=True)
    lr_alignment_filter_parser.add_argument("--r-separator", default="\t")
    lr_alignment_filter_parser.add_argument("--s-separator", default="\t")
    lr_alignment_filter_parser.add_argument("-o", "--output", type=str, default="-")
    lr_alignment_filter_parser.add_argument("--o-alignment-format", type=str, choices=["bam", "sam", "cram"], default="bam")
    #########
    labeling_constraint_inference_parser = subparsers.add_parser("label-const-inf", parents=[logging_parser])
    labeling_constraint_inference_parser.add_argument("alignment", type=str, default="-")
    labeling_constraint_inference_parser.add_argument("--i-alignment-format", type=str, choices=["bam", "sam", "cram"], default="bam")
    labeling_constraint_inference_parser.add_argument("--rck-nas", type=argparse.FileType("rt"), required=True)
    labeling_constraint_inference_parser.add_argument("--min-sv-cnt", type=int, default=2)
    labeling_constraint_inference_parser.add_argument("--lr-field", default="support_read_names")
    # BUGFIX: the output files below were opened with FileType("rt") (read mode);
    # any attempt to write would fail -- switched to "wt".
    labeling_constraint_inference_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #########
    labeling_constraint_combine_parser = subparsers.add_parser("label-const-com", parents=[logging_parser])
    labeling_constraint_combine_parser.add_argument("label-constr", type=argparse.FileType("rt"), nargs="+")
    labeling_constraint_combine_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #########
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-LR")
    if args.command == "extract-lr":
        nas = read_adjacencies_from_source(source=args.rck_nas)
        # map read name -> novel adjacencies it supports
        reads_to_nas = defaultdict(list)
        for na in nas:
            reads_str = na.extra.get(args.lr_field, "")
            reads = reads_str.split(",")
            for read in reads:
                if len(read) == 0:
                    continue
                reads_to_nas[read].append(na)
        # keep only reads supporting at least --min-sv-cnt adjacencies
        extracted_read_names = {read for read in reads_to_nas if len(reads_to_nas[read]) >= args.min_sv_cnt}
        for read_name in extracted_read_names:
            print(read_name, file=args.output)
    elif args.command == "filter-alignment":
        reads = get_reads_set_from_source(source=args.reads)
        imode = get_mode_str(format=args.i_alignment_format, input=True)
        omode = get_mode_str(format=args.o_alignment_format, input=False)
        with pysam.AlignmentFile(args.alignment, imode) as i_stream:
            with pysam.AlignmentFile(args.output, omode, template=i_stream) as o_stream:
                for entry in i_stream:
                    if entry.qname in reads:
                        o_stream.write(entry)
    elif args.command == "label-const-inf":
        constraints = infer_labeling_constraints(rck_nas_source=args.rck_nas, alignment_file=args.alignment, i_alignment_format=args.i_alignment_format,
                                                 lr_field=args.lr_field, min_sv_cnt=args.min_sv_cnt, logger=logger)
        # TODO(review): `constraints` is computed but never written to args.output
    elif args.command == "label-const-com":
        # BUGFIX: the subcommand is registered as "label-const-com"; this branch
        # used to compare against "label-constr-com" and was therefore unreachable.
        pass


if __name__ == "__main__":
    main()
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)

import rck
from rck.core.io import read_adjacencies_from_source, write_adjacencies_to_destination, EXTERNAL_NA_ID, stream_adjacencies_from_source, get_logging_cli_parser, \
    get_standard_logger_from_args
from rck.utils.adj.process import get_shared_nas_parser, Merger, iter_over_string_entries_from_source, get_extra_field_regexes, \
    filter_adjacencies_by_extra, \
    KEEP, REMOVE, refined_adjacencies_reciprocal, update_adjacencies
from rck.utils.adj.convert import get_chrs_regions_string_lists_from_source, get_chrs_regions_string_list_from_file, parse_segment_chr_region
from rck.utils.adj.process import filter_adjacencies_by_chromosomal_regions, filter_adjacencies_by_size, iter_haploid_adjacencies


def main():
    """CLI entry point: filter / cat / reciprocal / haploid / update operations over RCK adjacency files."""
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-process")
    parser.add_argument('--version', action='version', version=rck.version)
    ####
    shared_parser = get_shared_nas_parser()
    cli_logging_parser = get_logging_cli_parser()
    shared_parser.add_argument("--output", "-o", dest="rck_adj_file", type=argparse.FileType("wt"), default=sys.stdout)
    shared_parser.add_argument("--no-sort", action="store_false", dest="sort")
    ####
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ####
    filter_parser = subparsers.add_parser("filter", parents=[shared_parser, cli_logging_parser])
    filter_parser.add_argument("rck_adj", type=argparse.FileType("rt"), nargs="+", default=[sys.stdin])
    filter_parser.add_argument("--keep-extra-field-regex", action="append", default=None)
    filter_parser.add_argument("--keep-extra-field-regex-file", type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--keep-extra-field-missing-strategy", choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--keep-annotate", action="store_true", dest="annotate_retained")
    filter_parser.add_argument("--keep-annotate-s-extra-field", default=None, dest="annotate_seg_extra_field")
    filter_parser.add_argument("--keep-annotate-short-circ", action="store_true", dest="annotate_shirt_circ")
    filter_parser.add_argument("--keep-annotate-extra-prefix", dest="annotate_extra_prefix")
    filter_parser.add_argument("--remove-extra-field-regex", action="append", default=None)
    filter_parser.add_argument("--remove-extra-field-regex-file", type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--remove-extra-field-missing-strategy", choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--min-size", type=int, default=0)
    filter_parser.add_argument("--max-size", type=int, default=1000000000)
    filter_parser.add_argument("--no-allow-inter-chr", action="store_false", dest="allow_inter_chr")
    filter_parser.add_argument("--no-allow-intra-chr", action="store_false", dest="allow_intra_chr")
    filter_parser.add_argument("--size-extra-field", default="svlen")
    filter_parser.add_argument("--size-extra-field-no-abs", action="store_false", dest="size_extra_field_abs")
    filter_parser.add_argument("--size-extra-seq-field")
    ####
    cat_parser = subparsers.add_parser("cat", parents=[shared_parser, cli_logging_parser], help="Concatenate Adjacencies in input files (NOTE: different from \"merge\")")
    cat_parser.add_argument("rck_adj", type=argparse.FileType("rt"), nargs="+", default=[sys.stdin])
    cat_parser.add_argument("--enforce-unique-ids", action="store_true", dest="enforce_unique_ids")
    cat_parser.add_argument("--id-collision-strategy", choices=['skip', 'error'], default='error')
    ####
    reciprocal_parser = subparsers.add_parser("reciprocal", parents=[shared_parser, cli_logging_parser], help="ensure that reciprocal novel adjacencies are treated as such")
    reciprocal_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    reciprocal_parser.add_argument("--max-distance", type=int, default=50)
    ####
    haploid_parser = subparsers.add_parser("haploid", parents=[shared_parser, cli_logging_parser], help="collapse any info that is allele/haplotype-specific into a haploid mode")
    haploid_parser.add_argument("rck_adj", type=argparse.FileType("rt"), nargs="+", default=[sys.stdin])
    ####
    update_parser = subparsers.add_parser("update", parents=[shared_parser, cli_logging_parser],
                                          help="Updates adjacencies in the 'adj' with the info from --source based on aid matches. Outputs updated --target entries")
    update_parser.add_argument("rck_adj", type=argparse.FileType("rt"))
    update_parser.add_argument("--source", type=argparse.FileType("rt"), required=True)
    update_parser.add_argument("--exclude-extra-fields", default="")
    update_parser.add_argument("--include-extra-fields", default="")
    update_parser.add_argument("--no-include-missing", action="store_false", dest="include_missing")
    update_parser.add_argument("--no-coords-update", action="store_false", dest="coord_update")
    update_parser.add_argument("--no-coord1-update", action="store_false", dest="coord1_update")
    update_parser.add_argument("--no-coord2-update", action="store_false", dest="coord2_update")
    update_parser.add_argument("--no-strands-update", action="store_false", dest="strands_update")
    update_parser.add_argument("--no-strand1-update", action="store_false", dest="strand1_update")
    update_parser.add_argument("--no-strand2-update", action="store_false", dest="strand2_update")
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-ADK-process")
    processed_adjacencies = []
    # translate the --o-extra-fields CLI string into what the writer expects:
    # None (no extra fields), a list of field names, or the literal "all"
    if args.o_extra_fields is None or len(args.o_extra_fields) == 0 or args.o_extra_fields == ",":
        extra = None
    elif args.o_extra_fields != "all":
        extra = args.o_extra_fields.split(",")
    else:
        extra = args.o_extra_fields
    if args.command == "cat":
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(source=rck_adj_source) for rck_adj_source in args.rck_adj))
        if args.enforce_unique_ids:
            # BUGFIX: the original rebound `adjacencies = []` and then iterated over
            # that freshly emptied list, so the dedup loop never saw a single
            # adjacency and the output was always empty with --enforce-unique-ids.
            processed_ids = set()
            unique_adjacencies = []
            for adj in adjacencies:
                aid = adj.extra.get(EXTERNAL_NA_ID, adj.idx)
                if aid in processed_ids:
                    logger.debug("Adjacency id {aid} has been encountered more than once".format(aid=aid))
                    if args.id_collision_strategy == "skip":
                        continue
                    elif args.id_collision_strategy == "error":
                        raise ValueError("More than one adjacency with id {aid}".format(aid=aid))
                unique_adjacencies.append(adj)
                processed_ids.add(aid)
            adjacencies = unique_adjacencies
        write_adjacencies_to_destination(destination=args.rck_adj_file, adjacencies=adjacencies, extra=extra, sort_adjacencies=args.sort)
        exit(0)
    elif args.command == "filter":
        logger.info("Filtering input adjacencies from following sources {sources}".format(sources=",".join(map(str, args.rck_adj))))
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(source=rck_adj_source) for rck_adj_source in args.rck_adj))
        include_chrs_regions_strings = []
        exclude_chrs_regions_strings = []
        # --chrs-include / --chrs-exclude come from the shared parser; each is a
        # list of lists of comma-separated chromosome/region strings
        if args.chrs_include is not None:
            for chrs_lists in args.chrs_include:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        include_chrs_regions_strings.append(chr_name)
        if args.chrs_include_file is not None:
            for chr_name in get_chrs_regions_string_lists_from_source(source=args.chrs_include_file):
                include_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude is not None:
            for chrs_lists in args.chrs_exclude:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        exclude_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude_file is not None:
            for chr_name in get_chrs_regions_string_list_from_file(file_name=args.chrs_exclude_file):
                exclude_chrs_regions_strings.append(chr_name)
        include_regions = [parse_segment_chr_region(string) for string in include_chrs_regions_strings]
        exclude_regions = [parse_segment_chr_region(string) for string in exclude_chrs_regions_strings]
        adjacencies = filter_adjacencies_by_chromosomal_regions(adjacencies=adjacencies, include=include_regions, exclude=exclude_regions,
                                                                include_both=args.include_both, exclude_both=args.exclude_both,
                                                                include_spanning=args.include_spanning, exclude_spanning=args.exclude_spanning,
                                                                annotate_retained=args.annotate_retained, annotate_retained_extra_field_prefix=args.annotate_extra_prefix,
                                                                annotated_retained_segments_extra_field=args.annotate_seg_extra_field, annotate_short_circ=args.annotate_shirt_circ)
        keep_extra_field_entries = args.keep_extra_field_regex if args.keep_extra_field_regex is not None else []
        if args.keep_extra_field_regex_file is not None:
            keep_extra_field_entries.extend(list(iter_over_string_entries_from_source(source=args.keep_extra_field_regex_file)))
        remove_extra_field_entries = args.remove_extra_field_regex if args.remove_extra_field_regex is not None else []
        if args.remove_extra_field_regex_file is not None:
            remove_extra_field_entries.extend(list(iter_over_string_entries_from_source(source=args.remove_extra_field_regex_file)))
        keep_extra_field = get_extra_field_regexes(string_entries=keep_extra_field_entries)
        remove_extra_field = get_extra_field_regexes(string_entries=remove_extra_field_entries)
        adjacencies = filter_adjacencies_by_extra(adjacencies=adjacencies,
                                                  keep_extra_field=keep_extra_field, keep_extra_field_missing_strategy=args.keep_extra_field_missing_strategy,
                                                  remove_extra_field=remove_extra_field, remove_extra_field_missing_strategy=args.remove_extra_field_missing_strategy)
        adjacencies = filter_adjacencies_by_size(adjacencies=adjacencies, min_size=args.min_size, max_size=args.max_size, size_extra_field=args.size_extra_field,
                                                 size_extra_seq_field=args.size_extra_seq_field, allow_inter_chr=args.allow_inter_chr,
                                                 size_extra_field_abs=args.size_extra_field_abs, allow_intra_chr=args.allow_intra_chr)
        write_adjacencies_to_destination(destination=args.rck_adj_file, adjacencies=adjacencies, sort_adjacencies=False, extra=extra)
        exit(0)
    elif args.command == "reciprocal":
        adjacencies = read_adjacencies_from_source(source=args.rck_adj)
        processed_adjacencies = refined_adjacencies_reciprocal(novel_adjacencies=adjacencies, max_distance=args.max_distance, inplace=True)
    elif args.command == "haploid":
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(source=rck_adj_source) for rck_adj_source in args.rck_adj))
        haploid_adjacencies = iter_haploid_adjacencies(adjacencies=adjacencies, copy=False)
        write_adjacencies_to_destination(destination=args.rck_adj_file, adjacencies=haploid_adjacencies, sort_adjacencies=False, extra=extra)
        exit(0)
    elif args.command == "update":
        adjacencies = read_adjacencies_from_source(source=args.rck_adj)
        source_adjacencies = read_adjacencies_from_source(source=args.source)
        extra_include = {v for v in args.include_extra_fields.split(",") if len(v) > 0}
        extra_exclude = {v for v in args.exclude_extra_fields.split(",") if len(v) > 0}
        # BUGFIX: the flags above parse into dests `coord_update`, `coord1_update`,
        # ..., `strand2_update`, but this call referenced non-existent attributes
        # (args.update_coords, args.update_coord1, ...), so the `update` command
        # always crashed with AttributeError.  Now the defined dests are used.
        processed_adjacencies = update_adjacencies(target_adjacencies=adjacencies, source_adjacencies=source_adjacencies,
                                                   update_coords=args.coord_update, update_coord1=args.coord1_update, update_coord2=args.coord2_update,
                                                   update_strands=args.strands_update, update_strand1=args.strand1_update, update_strand2=args.strand2_update,
                                                   extra_exclude=extra_exclude, extra_include=extra_include, include_missing=args.include_missing)
    # reciprocal/update fall through to a shared write; empty results are not written
    if len(processed_adjacencies) > 0:
        write_adjacencies_to_destination(destination=args.rck_adj_file, adjacencies=processed_adjacencies, extra=extra, sort_adjacencies=args.sort)


if __name__ == "__main__":
    main()
import argparse
import os
import sys


# make the package importable when this file is run as a standalone script
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)

import rck
from rck.core.io import get_logging_cli_parser, get_standard_logger_from_args, read_adjacencies_from_source, write_adjacencies_to_vcf_sniffles_destination, \
    write_adjacencies_to_circa_destination, read_chr_sizes_from_source, write_segments_to_circa_destination, write_adjacencies_to_bedpe_destination
from rck.core.structures import AdjacencyType
from rck.utils.adj.process import get_circa_adj_cnt, filter_adjacencies_by_size


def main():
    # Converter from the RCK adjacency format to external formats:
    # VCF (Sniffles), Circa TSV, Circa per-window density TSV, and BEDPE.
    parser = argparse.ArgumentParser(prog="RCK-UTILS-NAS-rck2x")
    parser.add_argument('--version', action='version', version=rck.version)
    cli_logging_parser = get_logging_cli_parser()
    ###
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ###
    vcf_parser = subparsers.add_parser("vcf-sniffles", parents=[cli_logging_parser], help="Convert RCK Adjacencies to the VCF (Sniffles) format")
    vcf_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    vcf_parser.add_argument("--separator", default="\t")
    vcf_parser.add_argument("--extra-separator", default=";")
    vcf_parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    vcf_parser.add_argument("--o-extra-fields", default="all")
    vcf_parser.add_argument("--o-no-include-ref", action="store_false", dest="include_ref")
    vcf_parser.add_argument("--clone-suffix", default="")
    vcf_parser.add_argument("--dummy-clone", default="dummy_clone")
    vcf_parser.add_argument("--dummy-clone-gt-extra")
    vcf_parser.add_argument("--dummy-gt", default="./.")
    vcf_parser.add_argument("--alt-extra")
    vcf_parser.add_argument("--ref-extra")
    ###
    circa_parser = subparsers.add_parser("circa", parents=[cli_logging_parser], help="Convert RCK Adjacencies to the TSV format supported by Circa")
    circa_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    circa_parser.add_argument("--separator", default="\t")
    circa_parser.add_argument("--extra-separator", default=";")
    circa_parser.add_argument("--size-extra-field")
    circa_parser.add_argument("--size-extra-field-no-abs", action="store_false", dest="size_extra_field_abs")
    circa_parser.add_argument("--size-extra-seq-field")
    circa_parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    circa_density_parser = subparsers.add_parser("circa-dens", parents=[cli_logging_parser],
                                                 help="Convert RCK Adjacencies to the TSV format with adjacencies density cnt per window supported by Circa")
    circa_density_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    circa_density_parser.add_argument("--separator", default="\t")
    circa_density_parser.add_argument("--extra-separator", default=";")
    circa_density_parser.add_argument("--window-size", type=int, default=10000000)
    circa_density_parser.add_argument("--chr-sizes", type=argparse.FileType("rt"))
    circa_density_parser.add_argument("--element", choices=["breakend", "adj"], default="breakend")
    circa_density_parser.add_argument("--element-adj-cnt-full", action="store_true", dest="circa_element_adj_cnt_full")
    circa_density_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    bedpe_parser = subparsers.add_parser("bedpe", parents=[cli_logging_parser],
                                         help="Convert RCK Adjacencies to the BEDPE format with only intra-chromosomal adjacencies considered")
    bedpe_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    bedpe_parser.add_argument("--separator", default="\t")
    bedpe_parser.add_argument("--extra-separator", default=";")
    bedpe_parser.add_argument("--name-extra-field", default=None)
    bedpe_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-NAS-rck2x")
    # all subcommands read the same RCK adjacency input up front
    logger.info("Reading adjacencies from {file}".format(file=args.rck_adj))
    adjacencies = read_adjacencies_from_source(source=args.rck_adj, extra_separator=args.extra_separator, separator=args.separator)
    if args.command == "vcf-sniffles":
        if not args.include_ref:
            logger.debug("Reference adjacencies were excluded from the output.")
            adjacencies = list(filter(lambda a: a.adjacency_type == AdjacencyType.NOVEL, adjacencies))
        # --o-extra-fields: "" or "," -> no extra fields; "all" -> passed verbatim;
        # otherwise a comma-separated list of field names
        if args.o_extra_fields is None or len(args.o_extra_fields) == 0 or args.o_extra_fields == ",":
            extra = None
        elif args.o_extra_fields != "all":
            extra = args.o_extra_fields.split(",")
        else:
            extra = args.o_extra_fields
        logger.debug("Output extra fields are identified as {o_extra}".format(o_extra=",".join(extra) if extra is not None else ""))
        logger.info("Converting RCK formatted adjacencies to the VCF (Sniffles) format")
        logger.info("Writing adjacencies to {file}".format(file=args.output))
        write_adjacencies_to_vcf_sniffles_destination(destination=args.output, adjacencies=adjacencies, extra=extra,
                                                      dummy_clone=args.dummy_clone, clone_suffix=args.clone_suffix,
                                                      alt_extra=args.alt_extra, ref_extra=args.ref_extra,
                                                      dummy_clone_gt_extra=args.dummy_clone_gt_extra, dummy_gt=args.dummy_gt)
    elif args.command == "circa":
        logger.info("Converting input RCK formatted adjacencies into a Circa suitable format (extra column get transformed into a size column)")
        logger.info("Writing adjacencies info suitable for Circa to {file}".format(file=args.output))
        write_adjacencies_to_circa_destination(destination=args.output, adjacencies=adjacencies, size_extra_field=args.size_extra_field,
                                               size_extra_seq_field=args.size_extra_seq_field, size_abs=args.size_extra_field_abs)
    elif args.command == "circa-dens":
        logger.info("Computing cnt of input RCK formatted adjacencies per window into a CIRCA suitable format")
        chr_sizes = args.chr_sizes
        if args.chr_sizes is not None:
            chr_sizes = read_chr_sizes_from_source(source=args.chr_sizes)
        circa_adj_cnts = get_circa_adj_cnt(adjacencies=adjacencies, window_size=args.window_size, chr_sizes=chr_sizes, element=args.element,
                                           adj_full_cnt=args.circa_element_adj_cnt_full)
        segments = []
        for segment, cnt in circa_adj_cnts.items():
            # scale raw counts by the fraction of the window the segment covers
            segment.extra[args.element + "_cnt"] = cnt * segment.length / args.window_size
            segments.append(segment)
        write_segments_to_circa_destination(destination=args.output, segments=segments, extra=[args.element + "_cnt"])
    elif args.command == "bedpe":
        logger.info(f"Converting and writing input RCK formatted adjacencies into BEDPE format to {args.output}")
        # NOTE(review): the subcommand help says only intra-chromosomal adjacencies
        # are considered, yet allow_inter_chr=True is passed here -- confirm intent.
        adjacencies = filter_adjacencies_by_size(adjacencies=adjacencies, allow_inter_chr=True)
        write_adjacencies_to_bedpe_destination(destination=args.output, adjacencies=adjacencies, name_extra_field=args.name_extra_field)
    logger.info("Success")


if __name__ == "__main__":
    main()
from collections import defaultdict


def get_size_bins(bins_strs):
    """Build size-bin boundaries from string values, padded with +/-3e9 sentinels."""
    inner = [int(float(value)) for value in bins_strs]
    return [-3000000000] + inner + [3000000000]


def get_adj_size(adjacency, size_extra_field="svlen", size_extra_field_abs=True, size_extra_seq_field=None):
    """Best-effort size of an adjacency.

    Tries, in order: the numeric `size_extra_field` extra entry (optionally made
    absolute), the length of the `size_extra_seq_field` extra entry, and finally
    the adjacency's non-haplotype-specific distance.
    """
    try:
        size = int(float(adjacency.extra[size_extra_field]))
        return abs(size) if size_extra_field_abs else size
    except (KeyError, ValueError):
        pass
    try:
        return len(adjacency.extra[size_extra_seq_field])
    except (KeyError, ValueError):
        pass
    return adjacency.distance_non_hap


def merged_source_tally(adjacencies, bins=None, extra_sources_field="supporting_sources", size_extra_field="svlen", size_extra_field_abs=True, size_extra_seq_field=None):
    """Tally adjacencies keyed by (sorted source tuple) x (first bin boundary strictly above the size)."""
    if bins is None:
        bins = [-3000000000, 3000000000]
    tally = defaultdict(lambda: defaultdict(int))
    for adjacency in adjacencies:
        size = get_adj_size(adjacency=adjacency, size_extra_field=size_extra_field,
                            size_extra_field_abs=size_extra_field_abs, size_extra_seq_field=size_extra_seq_field)
        # first bin boundary that the size falls strictly below; None if past the last one
        target_bin = next((boundary for boundary in bins if size < boundary), None)
        sources_string = adjacency.extra.get(extra_sources_field, None)
        if sources_string is None:
            source_key = ("None",)
        else:
            source_key = tuple(sorted(sources_string.split(",")))
        tally[source_key][target_bin] += 1
    return tally
from collections import defaultdict

from rck.core.io import FALSE_POSITIVE, AG_LABELING
from rck.core.structures import Haplotype, CNBoundaries, AdjacencyGroupType, Phasing


def scnb_violations(scnt, scnb, segments_syncs, segments=None, clone_ids=None, short_circuit=False):
    """Per clone, collect ids of segments whose A/B copy numbers fall outside their boundaries.

    ``segments_syncs[sid] == 1`` means haplotypes are in sync (A checked against A-bounds,
    B against B-bounds); otherwise the bound pairs are swapped.  With ``short_circuit``
    the function returns as soon as the first violation is recorded.
    """
    if clone_ids is None:
        clone_ids = set(scnt.keys()) & set(scnb.keys())
    result = defaultdict(list)
    for clone_id in clone_ids:
        cn_profile, boundaries_profile = scnt[clone_id], scnb[clone_id]
        if segments is not None:
            sids = [segment.stable_id_non_hap for segment in segments]
        else:
            sids = cn_profile.records.keys()
        for sid in sids:
            cn_a = cn_profile.get_cn(sid=sid, haplotype=Haplotype.A)
            cn_b = cn_profile.get_cn(sid=sid, haplotype=Haplotype.B)
            bounds = {
                hap: (boundaries_profile.get_cnb(sid=sid, hap=hap, boundary_type=CNBoundaries.LOWER),
                      boundaries_profile.get_cnb(sid=sid, hap=hap, boundary_type=CNBoundaries.UPPER))
                for hap in (Haplotype.A, Haplotype.B)
            }
            if segments_syncs[sid] == 1:
                (low_for_a, up_for_a), (low_for_b, up_for_b) = bounds[Haplotype.A], bounds[Haplotype.B]
            else:
                # out-of-sync segment: A's CN is checked against B's bounds and vice versa
                (low_for_a, up_for_a), (low_for_b, up_for_b) = bounds[Haplotype.B], bounds[Haplotype.A]
            if not (low_for_a <= cn_a <= up_for_a) or not (low_for_b <= cn_b <= up_for_b):
                result[clone_id].append(sid)
            if short_circuit and len(result[clone_id]) > 0:
                return result
    return result


def unique_realization_violations(adjacencies, acnt):
    # Not implemented yet.
    pass


def adjacency_groups_molecule_violations(groups, acnt, clone_ids=None, skip_missing_fp=True, short_circuit=False):
    """Return MOLECULE-type groups whose adjacency presence is insufficient in every single clone."""
    result = []
    if clone_ids is None:
        clone_ids = sorted(acnt.keys())
    for group in (ag for ag in groups if ag.group_type == AdjacencyGroupType.MOLECULE):
        group_fp = group.extra.get(FALSE_POSITIVE, None)
        if group_fp is None:
            # groups without an FP annotation are either skipped or reported verbatim
            if not skip_missing_fp:
                result.append(group)
            if short_circuit:
                return result
            continue
        satisfied_by_some_clone = False
        for clone_id in clone_ids:
            present = acnt[clone_id].haploid_adjacencies_present(adjacencies=group.adjacencies)
            inferred_fp = 1 - (len(present) * 1.0 / len(group.adjacencies))
            satisfied_by_some_clone |= inferred_fp <= group_fp
        if not satisfied_by_some_clone:
            result.append(group)
            if short_circuit and len(result) > 0:
                return result
    return result


def adjacency_groups_general_violations(groups, acnt, clone_ids=None, skip_missing_fp=True, short_circuit=False):
    """Return GENERAL-type groups whose overall (across all clones) adjacency presence violates the group FP."""
    result = []
    if clone_ids is None:
        clone_ids = sorted(acnt.keys())
    for group in (ag for ag in groups if ag.group_type == AdjacencyGroupType.GENERAL):
        group_fp = group.extra.get(FALSE_POSITIVE, None)
        if group_fp is None:
            if not skip_missing_fp:
                result.append(group)
            if short_circuit:
                return result
            continue
        present_ids = set()
        for clone_id in clone_ids:
            for adjacency in acnt[clone_id].haploid_adjacencies_present(adjacencies=group.adjacencies):
                present_ids.add(adjacency.stable_id_non_phased)
        inferred_fp = 1 - (len(present_ids) * 1.0 / len(group.adjacencies_ids))
        if inferred_fp > group_fp:
            result.append(group)
            if short_circuit and len(result) > 0:
                return result
    return result


def adjacency_groups_labeling_violations(groups, acnt, clone_ids=None, short_circuit=False):
    """Return LABELING-type groups whose adjacencies end up realized on more than one haplotype."""
    if clone_ids is None:
        clone_ids = sorted(acnt.keys())
    result = []
    for group in (ag for ag in groups if ag.group_type == AdjacencyGroupType.LABELING):
        adjacencies = group.adjacencies
        indexes = group.extra[AG_LABELING]
        present_on_haplotype = {}
        for hap in (Haplotype.A, Haplotype.B):
            adjacency_present_flags = []
            for adjacency, index in zip(adjacencies, indexes):
                # the per-adjacency labeling index picks which cross-phasing counts towards this haplotype
                if hap == Haplotype.A:
                    phasings = [Phasing.AA, [Phasing.AB, Phasing.BA][index]]
                else:
                    phasings = [Phasing.BB, [Phasing.BA, Phasing.AB][index]]
                aid = adjacency.stable_id_non_phased
                total_cn = 0
                for clone_id in clone_ids:
                    for phasing in phasings:
                        total_cn += acnt[clone_id].get_cn(aid=aid, phasing=phasing)
                adjacency_present_flags.append(total_cn != 0)
            present_on_haplotype[hap] = any(adjacency_present_flags)
        if sum(present_on_haplotype.values()) > 1:
            result.append(group)
            if short_circuit:
                return result
    return result


def nas_fp_violations(acnt, fp, adjacencies=None):
    # Not implemented yet.
    pass


# ---- rck/utils/karyotype/rck_kar_graph.py ----
import argparse
import sys

from rck.core.graph import construct_hiag_inflate_from_haploid_data
from rck.core.io import get_logging_cli_parser, read_scnt_from_source, read_acnt_from_source, write_graph_to_destination


def main():
    parser = argparse.ArgumentParser(prog="RCK-UTILS-KAR-graph")
    cli_logging_parser = get_logging_cli_parser()
    parser.add_argument("--acnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--acnt-separator", default="\t")
    parser.add_argument("--acnt-extra-separator", default=";")
    parser.add_argument("--scnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--scnt-extra-separator", default=";")
    # clone to export; when omitted, defaults to the smallest clone id shared by both tensors (see below)
    parser.add_argument("--clone")
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    writer_parser = subparsers.add_parser("write", parents=[cli_logging_parser])
    writer_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    writer_parser.add_argument("--style", choices=["edge-list"], default="edge-list")
    writer_parser.add_argument("--separator", default="\t")
    # when set, edges with copy number 0 are kept in the output graph
    writer_parser.add_argument("--include-absent", action="store_true", dest="include_cn_0")
    args = parser.parse_args()
    # read haploid segments/adjacencies and their copy-number tensors; CN data is stripped
    # from the objects themselves and kept only in the tensors
    segments, scnt = read_scnt_from_source(source=args.scnt, separator=args.scnt_separator, extra_separator=args.scnt_extra_separator, remove_cn_data_from_segs=True)
    adjacencies, acnt = read_acnt_from_source(source=args.acnt, separator=args.acnt_separator, extra_separator=args.acnt_extra_separator, remove_cn_data_from_adj=True)
    if args.command == "write":
        # build the (presumably haplotype-specific) interval adjacency graph from the haploid data
        hiag = construct_hiag_inflate_from_haploid_data(hapl_segments=segments, hapl_adjacencies=adjacencies)
        if args.clone is None:
            # default to the lexicographically smallest clone id present in both tensors
            common_clones = set(acnt.keys()) & set(scnt.keys())
            if len(common_clones) == 0:
                raise ValueError("No common clones in Adjacency and Segment Copy Number tensors")
            args.clone = sorted(common_clones)[0]
        acnp, scnp = acnt[args.clone], scnt[args.clone]
        hiag.assign_copy_numbers_from_scn_profile(scn_profile=scnp)
        hiag.assign_copy_numbers_from_acn_profile(acn_profile=acnp)
        if not args.include_cn_0:
            hiag.remove_edges_with_zero_cn()
        write_graph_to_destination(graph=hiag, destination=args.output, style=args.style)


if __name__ == "__main__":
    main()
# ---- rck/utils/karyotype/rck_kar_stats.py ----
import argparse
from collections import defaultdict

from rck.core.graph import construct_hiag_inflate_from_haploid_data
from rck.core.io import read_scnt_from_source, read_acnt_from_source, read_scnb_from_source, read_adjacency_groups_from_source, read_positions_from_source, get_logging_cli_parser, \
    get_standard_logger_from_args, EXTERNAL_NA_ID
from rck.core.structures import get_ref_telomeres_from_segments, AdjacencyType, AdjacencyGroupType, Segment
from rck.utils.karyotype.analysis import adjacency_groups_molecule_violations, adjacency_groups_labeling_violations, adjacency_groups_general_violations


def main():
    """CLI entry point: sanity-check an RCK karyotype.

    Reports basic stats (chromosomes, segments, adjacencies), verifies adjacency-group
    compliance, and, per clone, checks copy-number excess and inferred telomere sites.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-KAR-stats", parents=[get_logging_cli_parser()])
    parser.add_argument("--verbose", choices=[0, 1, 2, 3, 4, 5], type=int, default=5)
    parser.add_argument("--acnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--acnt-separator", default="\t")
    parser.add_argument("--acnt-extra-separator", default=";")
    parser.add_argument("--scnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--scnt-extra-separator", default=";")
    parser.add_argument("--scnb", type=argparse.FileType("rt"))
    parser.add_argument("--scnb-separator", default="\t")
    parser.add_argument("--scnb-extra-separator", default=";")
    parser.add_argument("--nas-fp", type=float, default=-1.0)
    parser.add_argument("--adjacency-groups", type=argparse.FileType("rt"))
    parser.add_argument("--adg-separator", default="\t")
    parser.add_argument("--adg-aids-separator", default=",")
    parser.add_argument("--adg-extra-separator", default=";")
    parser.add_argument("--telomere-positions", type=argparse.FileType("rt"))
    parser.add_argument("--telomere-positions-separator", default="\t")
    parser.add_argument("--telomere-positions-extra-separator", default=";")
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-KAR-stats")
    logger.info("Reading segment copy number tensor from {file}".format(file=args.scnt))
    segments, scnt = read_scnt_from_source(source=args.scnt, separator=args.scnt_separator, extra_separator=args.scnt_extra_separator, remove_cn_data_from_segs=True)
    logger.info("Reading adjacency copy number tensor from {file}".format(file=args.acnt))
    adjacencies, acnt = read_acnt_from_source(source=args.acnt, separator=args.acnt_separator, extra_separator=args.acnt_extra_separator, remove_cn_data_from_adj=True)
    if args.scnb is not None:
        logger.info("Reading segment copy number boundaries tensor from {file}".format(file=args.scnb))
        _, scnb = read_scnb_from_source(source=args.scnb, separator=args.scnb_separator, extra_separator=args.scnb_extra_separator, remove_cnb_data_from_segs=True)
    else:
        logger.info("No segment copy number boundaries tensor is provided via --scnb flag")
        scnb = None
    if args.adjacency_groups is not None:
        logger.info("Reading adjacency groups information from {file}".format(file=args.adjacency_groups))
        groups = read_adjacency_groups_from_source(source=args.adjacency_groups, separator=args.adg_separator,
                                                   extra_separator=args.adg_extra_separator, aids_separator=args.adg_aids_separator)
    else:
        logger.info("No adjacency groups information is provided via --adjacency-groups flag")
        groups = []
    if args.telomere_positions is not None:
        logger.info("Reading telomere positions from {file}".format(file=args.telomere_positions))
        # BUG FIX: was args.telomeres_positions_separator -- argparse stores the
        # --telomere-positions-separator flag as telomere_positions_separator, so the
        # old attribute access always raised AttributeError
        telomeres = read_positions_from_source(source=args.telomere_positions, separator=args.telomere_positions_separator,
                                               extra_separator=args.telomere_positions_extra_separator)
    else:
        # (removed a no-op .format(...) call: the message has no placeholders)
        logger.info("No telomere positions are provided via --telomere-positions flag. Defaulting to reference telomere positions")
        telomeres = get_ref_telomeres_from_segments(segments=segments)
    segments_by_chrs = defaultdict(list)
    for segment in segments:
        segments_by_chrs[segment.chromosome].append(segment)
    print("A total of {cnt} chromosomes are observed".format(cnt=len(segments_by_chrs)))
    total_segments_cnt = 0
    for chr_name, chr_segments in segments_by_chrs.items():
        total_segments_cnt += len(chr_segments)
        if args.verbose >= 3:
            print("Chromosome {chr_name} has {cnt} segments".format(chr_name=chr_name, cnt=len(chr_segments)))
    print("A total of {cnt} segments are observed".format(cnt=total_segments_cnt))
    novel_adjacencies = [adj for adj in adjacencies if adj.adjacency_type == AdjacencyType.NOVEL]
    reference_adjacencies = [adj for adj in adjacencies if adj.adjacency_type == AdjacencyType.REFERENCE]
    print("A total of {cnt} adjacencies ({n_cnt} novel; {r_cnt} reference)".format(cnt=len(novel_adjacencies) + len(reference_adjacencies),
                                                                                   n_cnt=len(novel_adjacencies), r_cnt=len(reference_adjacencies)))

    adjacencies_by_external_ids = {adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased): adj for adj in adjacencies}
    if groups is not None:
        for ag in groups:
            ag.populate_adjacencies_via_ids(source=adjacencies, source_by_ids=adjacencies_by_external_ids)
        molecule_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.MOLECULE]
        labeling_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.LABELING]
        general_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.GENERAL]
        if len(molecule_groups) > 0:
            logger.info("Checking compliance with {cnt} molecule groups".format(cnt=len(molecule_groups)))
            molecule_groups_violations = adjacency_groups_molecule_violations(groups=molecule_groups, acnt=acnt)
            if len(molecule_groups_violations):
                # BUG FIX: report the number of *violating* groups (previously logged the
                # total molecule-group count, unlike the labeling/general branches below)
                logger.error("A total of {cnt} molecule groups DO NOT agree with input karyotype. See molecule groups ids below".format(cnt=len(molecule_groups_violations)))
                logger.error(", ".join([ag.gid for ag in molecule_groups_violations]))
            else:
                logger.info("All molecule groups agree with input karyotype")
        else:
            logger.info("No molecule groups were provided. Nothing to check.")
        if len(labeling_groups) > 0:
            logger.info("Checking compliance with {cnt} labeling groups".format(cnt=len(labeling_groups)))
            labeling_groups_violations = adjacency_groups_labeling_violations(groups=labeling_groups, acnt=acnt)
            if len(labeling_groups_violations):
                logger.error("A total of {cnt} labeling groups DO NOT agree with input karyotype. See labeling groups ids below".format(cnt=len(labeling_groups_violations)))
                logger.error(", ".join([ag.gid for ag in labeling_groups_violations]))
            else:
                logger.info("All labeling groups agree with input karyotype")
        else:
            logger.info("No labeling groups were provided. Nothing to check.")
        if len(general_groups) > 0:
            logger.info("Checking compliance with {cnt} general groups".format(cnt=len(general_groups)))
            general_groups_violations = adjacency_groups_general_violations(groups=general_groups, acnt=acnt)
            if len(general_groups_violations):
                logger.error("A total of {cnt} general groups DO NOT agree with input karyotype. See general groups ids below".format(cnt=len(general_groups_violations)))
                logger.error(", ".join([ag.gid for ag in general_groups_violations]))
            else:
                logger.info("All general groups agree with input karyotype")
        else:
            logger.info("No information about adjacency groups were provided. Nothing to check.")

    clone_ids = sorted(set(scnt.keys()) & set(acnt.keys()))
    for clone_id in clone_ids:
        logger.info("Checking balancing and telomeres for clone {clone_id}".format(clone_id=clone_id))
        hiag = construct_hiag_inflate_from_haploid_data(hapl_segments=segments, hapl_adjacencies=adjacencies)
        scnp = scnt[clone_id]
        acnp = acnt[clone_id]
        hiag.assign_copy_numbers_from_scn_profile(scn_profile=scnp)
        hiag.assign_copy_numbers_from_acn_profile(acn_profile=acnp)
        hiag.remove_edges_with_zero_cn()
        logger.info("Checking that every vertex has a copy number excess >= 0.")
        for node in hiag.nodes(data=False):
            if hiag.node_imbalance(node=node) < 0:
                logger.warning("Something went WRONG! On segment extremity {node} there is a negative copy number excess...".format(node=str(node)))
        logger.info("Getting inferred telomeres.")
        diploid_telomeres = hiag.get_telomeres()
        inferred_hapl_telomeres_ids = {p.stable_id_non_hap for p in diploid_telomeres}
        input_hapl_telomeres_ids = {p.stable_id_non_hap for p in telomeres}
        # BUG FIX: the old `inferred > input` proper-superset test only fired when *all*
        # input sites were also inferred; what matters is whether any inferred telomere
        # site is absent from the input (exactly the set difference reported below)
        extra_telomere_ids = inferred_hapl_telomeres_ids - input_hapl_telomeres_ids
        if len(extra_telomere_ids) > 0:
            logger.error("Something went WRONG! Following segments extremities, while not specified as possible telomere sites were inferred as such.")
            logger.error(",".join(map(str, sorted(extra_telomere_ids))))
        else:
            logger.info("Everything is OK! in clone {clone_id} all extremities have non-negative copy number excess, and inferred telomere sites concur with the input"
                        "".format(clone_id=clone_id))
        length = 0
        for u, v, data in hiag.segment_edges():
            s: Segment = data["object"]
            length += s.length * data["copy_number"]
        logger.info(f"Total length for clone {clone_id} = {length}")
        # every linear chromosome contributes two telomeric extremities -> imbalance sum / 2
        chromosome_cnt = sum(hiag.node_imbalance(node) for node in hiag.nodes(data=False)) / 2
        logger.info(f"Total number of chromosomes in clone {clone_id} = {chromosome_cnt}")


if __name__ == "__main__":
    main()
# ---- rck/utils/rck_input_refine.py ----
import argparse
import os

import sys
from copy import deepcopy

current_file_level = 2
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)

import rck
from rck.core.io import read_adjacencies_from_file, \
    get_logging_cli_parser, get_standard_logger_from_args, get_full_path, read_scnt_from_file, read_positions_from_source, \
    write_segments_to_file, write_scnt_to_file
from rck.core.structures import refined_scnt, refined_scnt_with_adjacencies_and_telomeres


def main():
    parser = argparse.ArgumentParser(prog="RCK-UTILS-input-refine", parents=[get_logging_cli_parser()])
    parser.add_argument("--version", action="version", version=rck.version)
    parser.add_argument("--scnt", required=True)
    parser.add_argument("--adjacencies", required=True)
    parser.add_argument("--clone-ids", default=None)
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--adjacencies-separator", default="\t")
parser.add_argument("--no-merge-fragments", action="store_false", dest="merge_fragments") 29 | parser.add_argument("--fragments-max-merge-gap", type=int, default=1000000000) 30 | parser.add_argument("--no-fill-gaps-fragments", action="store_false", dest="fill_gaps_fragments") 31 | parser.add_argument("--fragments-max-fill-gap", type=int, default=1000000000) 32 | parser.add_argument("--no-allow-unit-segments", action="store_false", dest="allow_unit_segments") 33 | parser.add_argument("--telomere-positions", type=argparse.FileType("rt")) 34 | parser.add_argument("--telomere-positions-separator", default="\t") 35 | parser.add_argument("--output-scnt", required=True) 36 | parser.add_argument("--output-fragments", required=True) 37 | args = parser.parse_args() 38 | logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-input-refine") 39 | clone_ids = args.clone_ids.split(",") if args.clone_ids is not None else None 40 | scnt_file = get_full_path(args.scnt_file) 41 | adj_file = get_full_path(args.adj) 42 | segments, scnt = read_scnt_from_file(file_name=scnt_file, clone_ids=clone_ids, separator=args.scnt_separator) 43 | clone_ids = sorted(set(scnt.keys())) 44 | segments, scnt, segments_ids_mapping = refined_scnt(segments=segments, scnt=scnt, 45 | merge_fragments=args.merge_fragments, max_merge_gap=args.fragments_max_merge_gap, 46 | fill_gaps=args.fill_gaps_fragments, max_fill_gap=args.fragments_max_fill_gap) 47 | 48 | adjacencies = read_adjacencies_from_file(file_name=adj_file, separator=args.adjacencies_separator) 49 | if args.telomere_positions is not None: 50 | telomere_positions = read_positions_from_source(source=args.telomere_positions, separator=args.telomere_positions_separator) 51 | else: 52 | telomere_positions = [] 53 | fragments = deepcopy(segments) 54 | segments, scnt = refined_scnt_with_adjacencies_and_telomeres(segments=segments, scnt=scnt, adjacencies=adjacencies, telomere_positions=telomere_positions) 55 | refined_scnt_file = 
os.path.expanduser(args.refined_scnt_file) 56 | refined_scnt_file = os.path.abspath(refined_scnt_file) 57 | fragments_file = get_full_path(path=args.output_fragments) 58 | 59 | write_segments_to_file(file_name=fragments_file, segments=fragments) 60 | write_scnt_to_file(file_name=refined_scnt_file, scnt=scnt, segments=segments) 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /rck/utils/scn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/utils/scn/__init__.py -------------------------------------------------------------------------------- /rck/utils/scn/convert.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import math 3 | 4 | import gffutils 5 | 6 | from rck.core.structures import SegmentCopyNumberProfile, Segment, Haplotype 7 | from rck.utils.adj.convert import strip_chr 8 | 9 | BATTENBERG_SAMPLE_NAME = "sample" 10 | BATTENBERG_CHROMOSOME = "chr" 11 | BATTENBERG_START_POSITION = "startpos" 12 | BATTENBERG_END_POSITION = "endpos" 13 | BATTENBERG_CLONE1_CN_A = "nMaj1_A" 14 | BATTENBERG_CLONE1_CN_B = "nMin1_A" 15 | BATTENBERG_CLONE2_CN_A = "nMaj2_A" 16 | BATTENBERG_CLONE2_CN_B = "nMin2_A" 17 | 18 | 19 | def battenberg_force_non_negativity(cn): 20 | return cn if cn >= 0 else 0 21 | 22 | 23 | def battenberg_get_subclonal_cn(subclonal_cn_string, clonal_cn_int): 24 | if subclonal_cn_string == "NA": 25 | return clonal_cn_int 26 | return battenberg_force_non_negativity(int(subclonal_cn_string)) 27 | 28 | 29 | def battenberg_get_scnt_from_battenberg_file(file_name, sample_name, separator="\t", chr_strip=True): 30 | with open(file_name, "rt") as source: 31 | return get_scnt_from_battenberg_source(source=source, sample_name=sample_name, separator=separator, chr_strip=chr_strip) 32 | 33 | 34 | 
def get_scnt_from_battenberg_source(source, sample_name, separator="\t", chr_strip=True): 35 | clone1_name = "1" 36 | clone2_name = "2" 37 | scnt = {clone1_name: SegmentCopyNumberProfile(), clone2_name: SegmentCopyNumberProfile()} 38 | segments = [] 39 | reader = csv.DictReader(source, delimiter=separator) 40 | for row in reader: 41 | if BATTENBERG_SAMPLE_NAME in row and row[BATTENBERG_SAMPLE_NAME] != sample_name: 42 | continue 43 | start_coordinate = int(row[BATTENBERG_START_POSITION]) 44 | end_coordinate = int(row[BATTENBERG_END_POSITION]) 45 | chromosome = row[BATTENBERG_CHROMOSOME] 46 | if chr_strip: 47 | chromosome = strip_chr(chr_string=chromosome) 48 | segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start_coordinate, end=end_coordinate) 49 | clone1_scnp = scnt[clone1_name] 50 | clone2_scnp = scnt[clone2_name] 51 | cn1a = battenberg_force_non_negativity(int(row[BATTENBERG_CLONE1_CN_A])) 52 | cn1b = battenberg_force_non_negativity(int(row[BATTENBERG_CLONE1_CN_B])) 53 | clone1_scnp.set_cn_record_for_segment(segment=segment, cn=cn1a, haplotype=Haplotype.A) 54 | clone1_scnp.set_cn_record_for_segment(segment=segment, cn=cn1b, haplotype=Haplotype.B) 55 | cn2a = battenberg_get_subclonal_cn(subclonal_cn_string=row[BATTENBERG_CLONE2_CN_A], clonal_cn_int=cn1a) 56 | cn2b = battenberg_get_subclonal_cn(subclonal_cn_string=row[BATTENBERG_CLONE2_CN_B], clonal_cn_int=cn1b) 57 | clone2_scnp.set_cn_record_for_segment(segment=segment, cn=cn2a, haplotype=Haplotype.A) 58 | clone2_scnp.set_cn_record_for_segment(segment=segment, cn=cn2b, haplotype=Haplotype.B) 59 | segments.append(segment) 60 | return segments, scnt 61 | 62 | 63 | def hatchet_get_clone_ids_from_file(file_name, sample_name, separator="\t", min_usage=0.01): 64 | result = set() 65 | candidates = [] 66 | with open(file_name, "rt") as source: 67 | for line_cnt, line in enumerate(source): 68 | line = line.strip() 69 | data = line.split(separator) 70 | clone_data = data[6:] 71 | if line_cnt == 
0: 72 | total_clone_cnt = int(len(clone_data) / 2) 73 | candidates = [str(cnt) for cnt in range(1, total_clone_cnt + 1)] 74 | if line.startswith("#"): 75 | continue 76 | sample = data[3] 77 | if sample != sample_name: 78 | continue 79 | for candidate_clone_id, clone_usage_str in zip(candidates, clone_data[1::2]): 80 | clone_usage = float(clone_usage_str) 81 | if clone_usage < min_usage: 82 | continue 83 | result.add(candidate_clone_id) 84 | if sorted(result) == candidates: 85 | return sorted(result) 86 | return sorted(result) 87 | 88 | 89 | def get_scnt_from_hatchet_file(file_name, sample_name, separator="\t", clone_ids=None, min_usage=0.01, chr_strip=True): 90 | if clone_ids is None: 91 | clone_ids = hatchet_get_clone_ids_from_file(file_name=file_name, sample_name=sample_name, separator=separator, min_usage=min_usage) 92 | with open(file_name, "rt") as source: 93 | return get_scnt_from_hatchet_source(source=source, separator=separator, clone_ids=clone_ids, chr_strip=chr_strip) 94 | 95 | 96 | def get_scnt_from_hatchet_source(source, sample_name, clone_ids, separator="\t", chr_strip=True): 97 | scnt = {clone_id: SegmentCopyNumberProfile() for clone_id in clone_ids} 98 | segments = [] 99 | clone_id_mappings = {} 100 | for line_cnt, line in enumerate(source): 101 | line = line.strip() 102 | data = line.split(separator) 103 | clone_data = data[6:] 104 | if line_cnt == 0: 105 | total_clone_cnt = int(len(clone_data) / 2) 106 | candidates = [str(cnt) for cnt in range(1, total_clone_cnt + 1)] 107 | for position_cnt, candidate in enumerate(candidates): 108 | if candidate in clone_ids: 109 | clone_id_mappings[candidate] = position_cnt 110 | clone_cn_strs = clone_data[::2] 111 | if line.startswith("#") or len(line) == 0: 112 | continue 113 | data_sample_name = data[3] 114 | if data_sample_name != sample_name: 115 | continue 116 | chromosome = data[0] 117 | if chr_strip: 118 | chromosome = strip_chr(chr_string=chromosome) 119 | start_coord = int(data[1]) 120 | end_coord = 
int(data[2]) - 1 121 | segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start_coord, end=end_coord) 122 | segments.append(segment) 123 | fid = segment.stable_id_non_hap 124 | for clone_id in clone_ids: 125 | cns_str = clone_cn_strs[clone_id_mappings[clone_id]] 126 | data = cns_str.split("|") 127 | cna = int(data[0]) 128 | cnb = int(data[1]) 129 | scnt[clone_id].set_cn_record(sid=fid, hap=Haplotype.A, cn=cna) 130 | scnt[clone_id].set_cn_record(sid=fid, hap=Haplotype.B, cn=cnb) 131 | return segments, scnt 132 | 133 | 134 | REMIXT_CHROMOSOME = "chromosome" 135 | REMIXT_START_POSITION = "start" 136 | REMIXT_END_POSITION = "end" 137 | REMIXT_CLONE1_CN_A = "major_1" 138 | REMIXT_CLONE1_CN_B = "minor_1" 139 | REMIXT_CLONE2_CN_A = "major_2" 140 | REMIXT_CLONE2_CN_B = "minor_2" 141 | 142 | 143 | def get_scnt_from_remixt_file(file_name, separator="\t", chr_strip=True): 144 | with open(file_name, "rt") as source: 145 | return get_scnt_from_remixt_source(source=source, separator=separator, chr_strip=chr_strip) 146 | 147 | 148 | def get_scnt_from_remixt_source(source, separator="\t", chr_strip=True): 149 | segments = [] 150 | clone1_id = "1" 151 | clone2_id = "2" 152 | scnt = {clone1_id: SegmentCopyNumberProfile(), clone2_id: SegmentCopyNumberProfile()} 153 | reader = csv.DictReader(source, delimiter=separator) 154 | for row in reader: 155 | chromosome = row[REMIXT_CHROMOSOME] 156 | if chr_strip: 157 | chromosome = strip_chr(chr_string=chromosome) 158 | start_coordinate = int(row[REMIXT_START_POSITION]) 159 | end_coordinate = int(row[REMIXT_END_POSITION]) - 1 160 | segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start_coordinate, end=end_coordinate) 161 | segments.append(segment) 162 | sid = segment.stable_id_non_hap 163 | clone_1_cn_a = int(row[REMIXT_CLONE1_CN_A]) 164 | clone_1_cn_b = int(row[REMIXT_CLONE1_CN_B]) 165 | clone_2_cn_a = int(row[REMIXT_CLONE2_CN_A]) 166 | clone_2_cn_b = int(row[REMIXT_CLONE2_CN_A]) 167 | 
clone1_scnp = scnt[clone1_id] 168 | clone2_scnp = scnt[clone2_id] 169 | clone1_scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=clone_1_cn_a) 170 | clone1_scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=clone_1_cn_b) 171 | clone2_scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=clone_2_cn_a) 172 | clone2_scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=clone_2_cn_b) 173 | return segments, scnt 174 | 175 | 176 | TITAN_CHROMOSOME = "Chromosome" 177 | TITAN_START_POSITION = "Start" 178 | TITAN_END_POSITION = "End" 179 | TITAN_MAJOR_CN = "MajorCN" 180 | TITAN_MINOR_CN = "MinorCN" 181 | TITAN_CLONE_ID = "Clonal_Cluster" 182 | TITAN_CORRECTED_CN = "Corrected_Copy_Number" 183 | TITAN_SAMPLE_NAME = "Sample" 184 | 185 | 186 | def titan_get_clone_ids_from_file(file_name, sample_name, separator="\t"): 187 | with open(file_name, "rt") as source: 188 | result = set() 189 | reader = csv.DictReader(source, delimiter=separator) 190 | for row in reader: 191 | if row[TITAN_SAMPLE_NAME] != sample_name: 192 | continue 193 | clone_id = row[TITAN_CLONE_ID] 194 | if clone_id != "NA": 195 | result.add(clone_id) 196 | return sorted(result) 197 | 198 | 199 | def get_scnt_from_titan_file(file_name, sample_name, clone_ids=None, separator="\t", corrected_cn_fix="None", chr_strip=True): 200 | if clone_ids is None: 201 | clone_ids = titan_get_clone_ids_from_file(file_name=file_name, sample_name=sample_name, separator=separator) 202 | with open(file_name, "rt") as source: 203 | return get_scnt_from_titan_source(source=source, sample_name=sample_name, clone_ids=clone_ids, separator=separator, corrected_cn_fix=corrected_cn_fix, chr_strip=chr_strip) 204 | 205 | 206 | def get_scnt_from_titan_source(source, sample_name, clone_ids, separator="\t", corrected_cn_fix="None", chr_strip=True): 207 | scnt = {clone_id: SegmentCopyNumberProfile() for clone_id in clone_ids} 208 | segments = [] 209 | reader = csv.DictReader(source, delimiter=separator) 210 | for row in reader: 211 | if row[TITAN_SAMPLE_NAME] != 
sample_name: 212 | continue 213 | chromosome = row[TITAN_CHROMOSOME] 214 | if chr_strip: 215 | chromosome = strip_chr(chr_string=chromosome) 216 | segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=int(row[TITAN_START_POSITION]), end=int(row[TITAN_END_POSITION])) 217 | sid = segment.stable_id_non_hap 218 | segments.append(segment) 219 | major_cn, minor_cn = int(row[TITAN_MAJOR_CN]), int(row[TITAN_MINOR_CN]) 220 | if minor_cn > major_cn: 221 | minor_cn, major_cn = major_cn, minor_cn 222 | titan_clone_id = row[TITAN_CLONE_ID] 223 | corrected_cn = int(row[TITAN_CORRECTED_CN]) 224 | for clone_id in clone_ids: 225 | scnp = scnt[clone_id] 226 | if titan_clone_id == clone_id: 227 | if major_cn + minor_cn != corrected_cn and corrected_cn_fix != "None": 228 | diff = corrected_cn - major_cn - minor_cn 229 | ### 230 | # initialize as 0 when corrected_cn_fix strategy does not match any known, yet is not "None" 231 | ### 232 | major_cn_addition = 0 233 | minor_cn_addition = 0 234 | if corrected_cn_fix == "equal": 235 | major_cn_addition = int(math.ceil(diff / 2)) 236 | minor_cn_addition = diff - major_cn_addition 237 | elif corrected_cn_fix == "relative-dist": 238 | relative_relation = minor_cn * 1.0 / major_cn 239 | major_cn_addition = int(math.ceil(diff / (1 + relative_relation))) 240 | minor_cn_addition = diff - major_cn_addition 241 | major_cn += major_cn_addition 242 | minor_cn += minor_cn_addition 243 | scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=major_cn) 244 | scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=minor_cn) 245 | else: 246 | scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=1) 247 | scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=1) 248 | return segments, scnt 249 | 250 | 251 | GINKGO_CHROMOSOME = "CHR" 252 | GINKGO_START_POSITION = "START" 253 | GINKGO_END_POSITION = "END" 254 | 255 | 256 | def get_scnt_from_ginkgo_file(file_name, sample_name, dummy_clone="1", separator="\t", chr_strip=True): 257 | with open(file_name, "rt") as 
source: 258 | return get_scnt_from_ginkgo_source(source=source, sample_name=sample_name, dummy_clone=dummy_clone, separator=separator, chr_strip=chr_strip) 259 | 260 | 261 | def get_scnt_from_ginkgo_source(source, sample_name, dummy_clone="1", separator="\t", chr_strip=True): 262 | scnp = SegmentCopyNumberProfile() 263 | segments = [] 264 | reader = csv.DictReader(source, delimiter=separator) 265 | for row in reader: 266 | chromosome = row[GINKGO_CHROMOSOME] 267 | if chr_strip: 268 | chromosome = strip_chr(chr_string=chromosome) 269 | start = int(row[GINKGO_START_POSITION]) 270 | end = int(row[GINKGO_END_POSITION]) 271 | try: 272 | cn = int(row[sample_name]) 273 | except KeyError: 274 | raise IOError("Could not obtain a segment copy value for sample {sample}. Make sure that --sample-name matches (including case) to the column header in the Ginkgo file") 275 | segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start, end=end) 276 | sid = segment.stable_id_non_hap 277 | segments.append(segment) 278 | scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=cn) 279 | scnt = {dummy_clone: scnp} 280 | return segments, scnt 281 | 282 | 283 | def get_segments_from_gff_file(file_name, chr_strip=True, chr_mapping=None, chr_mapping_missing_strategy="keep"): 284 | result = [] 285 | for record in gffutils.DataIterator(file_name): 286 | chr_name = record.chrom 287 | if chr_mapping is not None and chr_name not in chr_mapping and chr_mapping_missing_strategy == "skip": 288 | continue 289 | if chr_mapping is not None: 290 | chr_name = chr_mapping.get(chr_name, chr_name) 291 | if chr_strip: 292 | chr_name = strip_chr(chr_string=chr_name) 293 | extra = dict(record.attributes) 294 | new_extra = {} 295 | for key, value in extra.items(): 296 | if isinstance(value, list) and len(value) == 1: 297 | value = value[0] 298 | new_extra[key] = value 299 | segment = Segment.from_chromosome_coordinates(chromosome=chr_name, start=record.start, end=record.end) 300 | 
def _build_arg_parser():
    """Construct the CLI parser for the SCNB creation utility."""
    parser = argparse.ArgumentParser("Creating boundaries for a RCK formatted segment copy number tensor")
    parser.add_argument('--version', action='version', version=rck.version)
    parser.add_argument("scnt", type=argparse.FileType("rt"), default=sys.stdin)
    parser.add_argument("--bnd-strategy",
                        choices=[s.value for s in SCNBoundariesStrategies],
                        type=SCNBoundariesStrategies.from_string,
                        default=SCNBoundariesStrategies.UNIFORM_MIN_MAX.value)
    parser.add_argument("--uniform-spread-size", type=int, default=1)
    parser.add_argument("--length-spread-relation",
                        choices=[r.value for r in LengthSpreadRelationships],
                        type=LengthSpreadRelationships.from_string,
                        default=LengthSpreadRelationships.DUMMY.value)
    parser.add_argument("--uniform-min", type=int, default=0)
    parser.add_argument("--uniform-max", type=int, default=10)
    parser.add_argument("--missing-only", action="store_true", dest="missing_only")
    parser.add_argument("--min-allow-zero-for-positive", type=int, default=-1)
    parser.add_argument("--max-allow-zero-for-positive", type=int, default=1000000000)
    parser.add_argument("--min-allow-positive-for-zero", type=int, default=-1)
    parser.add_argument("--max-allow-positive-for-zero", type=int, default=1000000000)
    parser.add_argument("--clone-ids", default="")
    parser.add_argument("--is-male", action="store_false", dest="is_female")
    parser.add_argument("--no-allow-unit-segments", dest="allow_unit_segments", action="store_false")
    parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    parser.add_argument("--o-with-scnt", action="store_true", dest="output_scnt")
    parser.add_argument("--separator", default="\t")
    return parser


def main():
    """CLI entry point: derive clone-specific segment copy number boundaries for an RCK SCNT."""
    args = _build_arg_parser().parse_args()
    requested_clone_ids = args.clone_ids.split(",") if args.clone_ids else None
    segments, scnt = read_scnt_from_source(source=args.scnt, clone_ids=requested_clone_ids)
    clone_ids = requested_clone_ids if requested_clone_ids is not None else sorted(scnt.keys())
    # reuse boundaries already recorded on the segments when present; start fresh otherwise
    try:
        scnb = extract_scnb_from_segments(segments=segments, clone_ids=clone_ids)
    except ValueError:
        scnb = {clone_id: SegmentCopyNumberBoundaries() for clone_id in clone_ids}
    fill_kwargs = dict(
        segments=segments,
        missing_only=args.missing_only,
        strategy=args.bnd_strategy,
        min_allow_zero_for_positive=args.min_allow_zero_for_positive,
        max_allow_zero_for_positive=args.max_allow_zero_for_positive,
        min_allow_positive_for_zero=args.min_allow_positive_for_zero,
        max_allow_positive_for_zero=args.max_allow_positive_for_zero,
        uniform_spread_size=args.uniform_spread_size,
        length_spread_relation=args.length_spread_relation,
        uniform_min=args.uniform_min,
        uniform_max=args.uniform_max,
        is_female=args.is_female,
    )
    for clone_id in clone_ids:
        scnb[clone_id].fill(scnp=scnt[clone_id], **fill_kwargs)
    write_scnb_to_destination(destination=args.output, segments=segments, scnb=scnb, clone_ids=clone_ids)


if __name__ == "__main__":
    main()
def main():
    """CLI entry point for processing RCK segment copy number tensors (SCNTs).

    Subcommands:
      refine   -- merge/fill segment fragments in a single SCNT
      align    -- bring several SCNTs onto a common segmentation
      distance -- length-weighted CN distance between two SCNTs
      filter   -- keep/drop segments by region, extra-field regex, and size
      haploid  -- collapse segments' CN data to a haploid representation
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-process")
    cli_logging_parser = get_logging_cli_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ###
    refine_parser = subparsers.add_parser("refine", parents=[cli_logging_parser])
    refine_parser.add_argument('scnt', type=argparse.FileType("rt"), default=sys.stdin)
    refine_parser.add_argument("--separator", default="\t")
    refine_parser.add_argument("--no-allow-missing-clones", action="store_false", dest="allow_missing_clones")
    refine_parser.add_argument("--clone-ids", default=None)
    refine_parser.add_argument("--no-merge-fragments", action="store_false", dest="merge_fragments")
    refine_parser.add_argument("--max-merge-gap", type=int, default=1000000)
    refine_parser.add_argument("--no-fill-gaps", action="store_false", dest="fill_gaps")
    refine_parser.add_argument("--max-fill-gap", type=int, default=1000000)
    refine_parser.add_argument('--output', type=argparse.FileType("wt"), default=sys.stdout)
    ###
    align_parser = subparsers.add_parser("align", parents=[cli_logging_parser])
    align_parser.add_argument("scnt", nargs="+")
    align_parser.add_argument("--separator", default="\t")
    align_parser.add_argument("--output-suffix", default="aligned")
    align_parser.add_argument("--no-allow-unit-segments", action="store_false", dest="allow_unit_segments")
    align_parser.add_argument("--output-dir", default="")
    ###
    distance_parser = subparsers.add_parser("distance", parents=[cli_logging_parser])
    distance_parser.add_argument("--scnt1", type=argparse.FileType("rt"), required=True)
    distance_parser.add_argument("--scnt1-separator", default="\t")
    distance_parser.add_argument("--scnt1-extra-separator", default=";")
    distance_parser.add_argument("--scnt2", type=argparse.FileType("rt"), required=True)
    distance_parser.add_argument("--scnt2-separator", default="\t")
    distance_parser.add_argument("--scnt2-extra-separator", default=";")
    distance_parser.add_argument("--clone-ids", default=None)
    distance_parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    filter_parser = subparsers.add_parser("filter", parents=[cli_logging_parser])
    filter_parser.add_argument("scnt", type=argparse.FileType("rt"), default=sys.stdin)
    filter_parser.add_argument("--separator", default="\t")
    filter_parser.add_argument("--extra-separator", default=";")
    filter_parser.add_argument("--o-extra-fields", default="all")
    filter_parser.add_argument("--chrs-include", action="append", nargs=1)
    filter_parser.add_argument("--chrs-include-file", type=argparse.FileType("rt"))
    filter_parser.add_argument("--chrs-include-no-full", action="store_false", dest="include_full")
    filter_parser.add_argument("--chrs-exclude", action="append", nargs=1)
    filter_parser.add_argument("--chrs-exclude-file", type=argparse.FileType("rt"))
    filter_parser.add_argument("--chrs-exclude-full", action="store_true", dest="exclude_full")
    filter_parser.add_argument("--keep-extra-field-regex", nargs="+", default=None)
    filter_parser.add_argument("--keep-extra-field-regex-file", type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--keep-extra-field-missing-strategy", choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--remove-extra-field-regex", nargs="+", default=None)
    filter_parser.add_argument("--remove-extra-field-regex-file", type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--remove-extra-field-missing-strategy", choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--min-size", type=int, default=0)
    filter_parser.add_argument("--max-size", type=int, default=1000000000)
    filter_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    haploid_parser = subparsers.add_parser("haploid", parents=[cli_logging_parser])
    haploid_parser.add_argument("scnt", type=argparse.FileType("rt"), default=sys.stdin)
    haploid_parser.add_argument("--separator", default="\t")
    haploid_parser.add_argument("--extra-separator", default=";")
    haploid_parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-SCNT-process")

    if args.command == "refine":
        clone_ids = args.clone_ids.split(",") if args.clone_ids is not None else None
        # bugfix: ",".join(None) used to raise TypeError whenever --clone-ids was omitted
        logger.debug("Clone ids identified as {clone_ids}. If None -- all clone ids will be processed."
                     .format(clone_ids=",".join(clone_ids) if clone_ids is not None else "None"))
        logger.info("Reading Segment Copy Number Tensor from {file}".format(file=args.scnt))
        segments, scnt = read_scnt_from_source(source=args.scnt, clone_ids=clone_ids, separator=args.separator)
        logger.info("Refining Segment Copy Number Tensor from {file}".format(file=args.scnt))
        segments, scnt, _ = refined_scnt(segments=segments, scnt=scnt,
                                         merge_fragments=args.merge_fragments, max_merge_gap=args.max_merge_gap,
                                         fill_gaps=args.fill_gaps, max_fill_gap=args.max_fill_gap)
        logger.info("Writing refined Segment Copy Number Tensor to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, scnt=scnt, segments=segments, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "align":
        # derive a short tensor name from each input path (stripping ".scnt"/trailing dot)
        scnt_files = {}
        for path in args.scnt:
            full_path = get_full_path(path=path)
            name = os.path.splitext(os.path.basename(full_path))[0]
            if name.endswith(".scnt"):
                name = name[:-5]
            if name.endswith("."):
                name = name[:-1]
            scnt_files[name] = full_path
        logger.debug("Input Segment Copy Number Tensors (SCNT) identified as {input_scnts}".format(input_scnts=" , ".join(scnt_files.values())))
        scnts_by_name = {}
        segments_by_name = {}
        clone_ids_by_scnt = {}
        logger.info("Reading input SCNTs")
        for name, path in scnt_files.items():
            logger.debug("Reading SCNT from {file}".format(file=scnt_files[name]))
            segments, scnt = read_scnt_from_file(file_name=scnt_files[name], separator=args.separator)
            clone_ids_by_scnt[name] = sorted(scnt.keys())
            scnts_by_name[name] = scnt
            segments_by_name[name] = segments
        if len(scnts_by_name.values()) == 1:
            logger.warning("Only one input SCNT identified. Doing nothing with it, outputting as is.")
            aligned_segments_by_name, aligned_scnts_by_name = segments_by_name, scnts_by_name
        else:
            logger.info("Aligning input SCNTs.")
            aligned_segments_by_name, aligned_scnts_by_name = aligned_scnts(segments_by_sample_names=segments_by_name, scnts_by_sample_names=scnts_by_name)
        result_base_names = {}
        cnt = 0
        for name in sorted(scnt_files.keys()):
            new_name = name
            if name in result_base_names:
                new_name = name + str(cnt)
                cnt += 1
            new_name = new_name + "." + args.output_suffix
            result_base_names[name] = new_name
        output_dir = args.output_dir if args.output_dir != "" else os.getcwd()
        output_dir = get_full_path(path=output_dir)
        logger.info("Writing aligned SCNTs")
        for name, new_name in result_base_names.items():
            scnt = aligned_scnts_by_name[name]
            segments = aligned_segments_by_name[name]
            # bugfix: a missing "." used to glue the suffix to the extension ("X.alignedrck.scnt.tsv")
            scnt_path = os.path.join(output_dir, new_name + ".rck.scnt.tsv")
            logger.debug("Writing aligned SCNT {scnt_name} to {file}".format(scnt_name=name, file=scnt_path))
            write_scnt_to_file(file_name=scnt_path, segments=segments, scnt=scnt, separator=args.separator)
    elif args.command == "filter":
        logger.info("Filtering input segments from following sources {sources}".format(sources=args.scnt))
        segments = stream_segments_from_source(source=args.scnt, separator=args.separator, extra_separator=args.extra_separator)
        # collect region strings from both the CLI (comma-separated lists) and files
        include_chrs_regions_strings = []
        exclude_chrs_regions_strings = []
        if args.chrs_include is not None:
            for chrs_lists in args.chrs_include:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        include_chrs_regions_strings.append(chr_name)
        if args.chrs_include_file is not None:
            for chr_name in get_chrs_regions_string_lists_from_source(source=args.chrs_include_file):
                include_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude is not None:
            for chrs_lists in args.chrs_exclude:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        exclude_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude_file is not None:
            for chr_name in get_chrs_regions_string_list_from_file(file_name=args.chrs_exclude_file):
                exclude_chrs_regions_strings.append(chr_name)
        include_regions = [parse_segment_chr_region(string) for string in include_chrs_regions_strings]
        exclude_regions = [parse_segment_chr_region(string) for string in exclude_chrs_regions_strings]
        segments = filter_segments_by_chromosomal_regions(segments=segments, include=include_regions, exclude=exclude_regions,
                                                          include_full=args.include_full, exclude_full=args.exclude_full)
        keep_extra_field_entries = args.keep_extra_field_regex if args.keep_extra_field_regex is not None else []
        if args.keep_extra_field_regex_file is not None:
            keep_extra_field_entries.extend(list(iter_over_string_entries_from_source(source=args.keep_extra_field_regex_file)))
        remove_extra_field_entries = args.remove_extra_field_regex if args.remove_extra_field_regex is not None else []
        if args.remove_extra_field_regex_file is not None:
            remove_extra_field_entries.extend(list(iter_over_string_entries_from_source(source=args.remove_extra_field_regex_file)))
        keep_extra_field = get_extra_field_regexes(string_entries=keep_extra_field_entries)
        remove_extra_field = get_extra_field_regexes(string_entries=remove_extra_field_entries)
        segments = filter_segments_by_extra(segments=segments, keep_extra_field=keep_extra_field, keep_extra_field_missing_strategy=args.keep_extra_field_missing_strategy,
                                            remove_extra_field=remove_extra_field, remove_extra_field_missing_strategy=args.remove_extra_field_missing_strategy)
        segments = filter_segments_by_size(segments=segments, min_size=args.min_size, max_size=args.max_size)
        write_segments_to_destination(destination=args.output, segments=segments)
    elif args.command == "haploid":
        segments = stream_segments_from_source(source=args.scnt, separator=args.separator, extra_separator=args.extra_separator)
        haploid_segments = iter_haploid_segments(segments=segments, copy=False)
        write_segments_to_destination(destination=args.output, segments=haploid_segments)
    elif args.command == "distance":
        clone_ids = args.clone_ids
        if args.clone_ids is not None:
            clone_ids = args.clone_ids.split(",")
        segments1, scnt1 = read_scnt_from_source(source=args.scnt1, clone_ids=clone_ids, separator=args.scnt1_separator,
                                                 extra_separator=args.scnt1_extra_separator, remove_cn_data_from_segs=True)
        segments2, scnt2 = read_scnt_from_source(source=args.scnt2, clone_ids=clone_ids, separator=args.scnt2_separator,
                                                 extra_separator=args.scnt2_extra_separator, remove_cn_data_from_segs=True)
        # put both tensors on a common segmentation before computing the distance
        segments_by_sample_names = {"1": segments1, "2": segments2}
        scnts_by_sample_names = {"1": scnt1, "2": scnt2}
        segments_by_sample_names, scnts_by_sample_names = aligned_scnts(segments_by_sample_names=segments_by_sample_names,
                                                                       scnts_by_sample_names=scnts_by_sample_names)
        segments = segments_by_sample_names["1"]
        scnt1, scnt2 = scnts_by_sample_names["1"], scnts_by_sample_names["2"]
        distance = cn_distance_inter_scnt(tensor1=scnt1, tensor2=scnt2, segments=segments, check_clone_ids_match=True)
        print("distance = ", distance)

    logger.info("Success!")


if __name__ == "__main__":
    main()
def main():
    """CLI entry point converting RCK SCNT data to third-party formats.

    Subcommands:
      shatterseek -- haploid CN profile for a single clone, in ShatterSeek input format
      circa-dens  -- windowed amplification/deletion fractions for Circa plotting
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-rck2x")
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ####
    shatterseek_parser = subparsers.add_parser("shatterseek", parents=[cli_logging_parser, chr_strip_parser])
    shatterseek_parser.add_argument("rck_scnt", type=argparse.FileType("rt"), default=sys.stdin)
    shatterseek_parser.add_argument("--clone-id", required=True)
    shatterseek_parser.add_argument("--separator", default="\t")
    shatterseek_parser.add_argument("--extra-separator", default=";")
    shatterseek_parser.add_argument("--default-cn", type=int, default=0)
    shatterseek_parser.add_argument("--output-header", action="store_true", dest="output_header")
    shatterseek_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    circa_dens_parser = subparsers.add_parser("circa-dens", parents=[cli_logging_parser, chr_strip_parser])
    circa_dens_parser.add_argument("rck_scnt", type=argparse.FileType("rt"), default=sys.stdin)
    circa_dens_parser.add_argument("--clone-id", required=True)
    circa_dens_parser.add_argument("--separator", default="\t")
    circa_dens_parser.add_argument("--extra-separator", default=";")
    circa_dens_parser.add_argument("--cna-type", choices=["ampl", "del"], default="ampl")
    circa_dens_parser.add_argument("--haploid", action="store_true", dest="haploid")
    circa_dens_parser.add_argument("--inverse", action="store_true", dest="inverse")
    circa_dens_parser.add_argument("--window-size", type=int, default=10000000)
    circa_dens_parser.add_argument("--chr-sizes", type=argparse.FileType("rt"))
    circa_dens_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-SCNT")

    if args.command == "shatterseek":
        logger.info("Starting converting RCK Segment Copy Number Tensor data to ShatterSeek")
        logger.debug("Specified clone is {clone_id}".format(clone_id=args.clone_id))
        logger.info("Reading RCK formatted data from {file}".format(file=args.rck_scnt))
        segments, scnt = read_scnt_from_source(source=args.rck_scnt, separator=args.separator, extra_separator=args.extra_separator)
        logger.info("Read CN data is translated into a haploid (!!!) version of itself.")
        haploid_scnt = get_haploid_scnt(segments=segments, scnt=scnt)
        logger.info("Writing data for clone {clone_id} in a ShatterSeek suitable format to {file}".format(clone_id=args.clone_id, file=args.output))
        write_scnt_to_shatterseek_destination(destination=args.output, segments=segments, scnt=haploid_scnt, clone_id=args.clone_id,
                                              default=args.default_cn, output_header=args.output_header)
    elif args.command == "circa-dens":
        # typo fix in the log message: "RKC" -> "RCK"
        logger.info("Starting computing ampl/del statistics from RCK Segment Copy Number Tensor Format")
        logger.debug("Specified clone is {clone_id}".format(clone_id=args.clone_id))
        logger.info("Reading RCK formatted data from {file}".format(file=args.rck_scnt))
        segments, scnt = read_scnt_from_source(source=args.rck_scnt, separator=args.separator, extra_separator=args.extra_separator)
        chr_sizes = args.chr_sizes
        if args.chr_sizes is not None:
            chr_sizes = read_chr_sizes_from_source(source=args.chr_sizes)
        circa_segments_cna_fractions = get_circa_segments_cna_fractions(segments=segments, scnt=scnt, clone_id=args.clone_id,
                                                                        window_size=args.window_size, chr_sizes=chr_sizes, cna_type=args.cna_type,
                                                                        haploid=args.haploid)
        segments = []
        total_average = 0
        total_length = 0
        for segment, cna_fraction in circa_segments_cna_fractions.items():
            # fraction of the window covered by the CNA event; optionally inverted
            value = cna_fraction * segment.length / args.window_size
            if args.inverse:
                value = 1 - value
            segment.extra[args.cna_type + "_fraction"] = value
            total_length += segment.length
            total_average += cna_fraction * segment.length
            segments.append(segment)
        # bugfix: guard the length-weighted average against empty input (ZeroDivisionError)
        if total_length > 0:
            logger.info("Total average cna fraction is " + str(total_average / total_length))
        else:
            logger.info("No segments available to compute a total average cna fraction")
        write_segments_to_circa_destination(destination=args.output, segments=segments, extra=[args.cna_type + "_fraction"])
    logger.info("Success!")


if __name__ == "__main__":
    main()
def main():
    """CLI entry point for RCK SCNT statistics (currently: pairwise CN distance)."""
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-stats")
    cli_logging_parser = get_logging_cli_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    #####
    distance_parser = subparsers.add_parser("distance", parents=[cli_logging_parser])
    distance_parser.add_argument('--scnt1', type=argparse.FileType("rt"), required=True)
    distance_parser.add_argument("--scnt1-separator", default="\t")
    distance_parser.add_argument("--scnt1-extra-separator", default=";")
    distance_parser.add_argument("--scnt1-clone-ids", default=None)
    distance_parser.add_argument('--scnt2', type=argparse.FileType("rt"), required=True)
    distance_parser.add_argument("--scnt2-separator", default="\t")
    distance_parser.add_argument("--scnt2-extra-separator", default=";")
    distance_parser.add_argument("--scnt2-clone-ids", default=None)
    distance_parser.add_argument("--topn", type=int, default=3)
    distance_parser.add_argument("--verbose", action="store_true", dest="verbose")
    distance_parser.add_argument("--both-haplotype-specific", action="store_true", dest="both_haplotype_specific")
    distance_parser.add_argument('-o', '--output', type=argparse.FileType("wt"), default=sys.stdout)
    #####
    args = parser.parse_args()
    if args.command != "distance":
        return
    clone_ids1 = None if args.scnt1_clone_ids is None else args.scnt1_clone_ids.split(",")
    segments1, scnt1 = read_scnt_from_source(source=args.scnt1, separator=args.scnt1_separator,
                                             extra_separator=args.scnt1_extra_separator, clone_ids=clone_ids1)
    clone_ids2 = None if args.scnt2_clone_ids is None else args.scnt2_clone_ids.split(",")
    segments2, scnt2 = read_scnt_from_source(source=args.scnt2, separator=args.scnt2_separator,
                                             extra_separator=args.scnt2_extra_separator, clone_ids=clone_ids2)
    distances = cn_distance(segments1=segments1, scnt1=scnt1, segments2=segments2, scnt2=scnt2,
                            both_haplotype_specific=args.both_haplotype_specific)
    # rank candidate clone matchings by total (summed over clones) distance, best first
    ranked = sorted(distances.items(), key=lambda entry: sum(entry[1].values()))
    top_cases = ranked[:args.topn]
    if args.verbose:
        print(f'Length-weighted segment copy number distance for tensors in {args.scnt1.name} and {args.scnt2.name}', file=args.output)
    for cnt, (case, clone_specific_distance) in enumerate(top_cases, start=1):
        print(f'{cnt}. Best distance (total) of {sum(clone_specific_distance.values()):,} with clone-specific ones {clone_specific_distance}, for case {case}', file=args.output)


if __name__ == "__main__":
    main()
def main():
    """Entry point for the ``rck-scnt-x2rck`` utility.

    Converts clone- and allele-specific segment copy number data produced by
    third-party tools (TitanCNA, Battenberg, HATCHet, ReMixT, Ginkgo), as well
    as segment annotations in GFF format, into the RCK segment/SCNT format.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-x2rck")
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    #### TitanCNA
    titan_parser = subparsers.add_parser("titan", parents=[cli_logging_parser, chr_strip_parser])
    titan_parser.add_argument("titan_ichor_seg")
    titan_parser.add_argument("--sample-name", required=True)
    titan_parser.add_argument("--clone-ids", default=None)
    titan_parser.add_argument("--separator", default="\t")
    titan_parser.add_argument("--corrected-cn-fix", choices=["None", "equal", "relative-dist"], default="None")
    titan_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #### Battenberg
    battenberg_parser = subparsers.add_parser("battenberg", parents=[cli_logging_parser, chr_strip_parser])
    battenberg_parser.add_argument("battenberg", type=argparse.FileType("rt"), default=sys.stdin)
    battenberg_parser.add_argument("--separator", default="\t")
    battenberg_parser.add_argument("--sample-name", required=True)
    battenberg_parser.add_argument("--clone-ids", choices=["1", "2", "1,2"], default="1,2")
    battenberg_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #### HATCHet
    hatchet_parser = subparsers.add_parser("hatchet", parents=[cli_logging_parser, chr_strip_parser])
    hatchet_parser.add_argument("hatchet", type=str)
    hatchet_parser.add_argument("--separator", default="\t")
    hatchet_parser.add_argument("--min-usage", type=float, default=0.01)
    hatchet_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    # exactly one of --sample-name / --clone-ids has to be supplied for HATCHet
    group = hatchet_parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--sample-name", default=None)
    group.add_argument("--clone-ids", default=None)
    #### ReMixT
    remixt_parser = subparsers.add_parser("remixt", parents=[cli_logging_parser, chr_strip_parser])
    remixt_parser.add_argument("remixt", type=argparse.FileType("rt"), default=sys.stdin)
    remixt_parser.add_argument("--separator", default="\t")
    remixt_parser.add_argument("--clone-ids", choices=["1", "2", "1,2"], default="1,2")
    remixt_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #### Ginkgo
    ginkgo_parser = subparsers.add_parser("ginkgo", parents=[cli_logging_parser, chr_strip_parser])
    ginkgo_parser.add_argument("ginkgo", type=argparse.FileType("rt"), default=sys.stdin)
    ginkgo_parser.add_argument("--separator", default="\t")
    ginkgo_parser.add_argument("--sample-name", required=True)
    ginkgo_parser.add_argument("--dummy-clone-name", default="1")
    ginkgo_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #### GFF
    gff_parser = subparsers.add_parser("gff", parents=[cli_logging_parser, chr_strip_parser])
    gff_parser.add_argument("gff", type=str)
    gff_parser.add_argument("--chr-mapping-file", type=argparse.FileType("rt"))
    gff_parser.add_argument("--chr-mapping-missing-strategy", choices=["keep", "skip"], default="keep")
    gff_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-SCNT")

    if args.command == "titan":
        logger.info("Converting allele-specific segment copy values from TitanCNA format to RCK")
        titan_full_path = get_full_path(path=args.titan_ichor_seg)
        if args.clone_ids is None:
            logger.debug("Clone ids were not provided, extracting all clone ids from {file}".format(file=titan_full_path))
            clone_ids = titan_get_clone_ids_from_file(file_name=titan_full_path, sample_name=args.sample_name, separator=args.separator)
        else:
            clone_ids = sorted(set(args.clone_ids.split(",")))
        logger.debug("Clone ids are identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        # open via the resolved absolute path so logs and I/O refer to the same file
        with open(titan_full_path, "rt") as source:
            logger.info("Reading allele-specific segment copy number values from {file}".format(file=titan_full_path))
            segments, scnt = get_scnt_from_titan_source(source=source, sample_name=args.sample_name, clone_ids=clone_ids, separator=args.separator,
                                                        corrected_cn_fix=args.corrected_cn_fix, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "battenberg":
        logger.info("Converting allele-specific segment copy values from Battenberg format to RCK")
        clone_ids = args.clone_ids.split(",")
        logger.debug("Clone ids are identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        logger.info("Reading allele-specific segment copy number values from {file}".format(file=args.battenberg))
        segments, scnt = get_scnt_from_battenberg_source(source=args.battenberg, sample_name=args.sample_name, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, separator=args.separator, clone_ids=clone_ids)
    elif args.command == "hatchet":
        hatchet_full_path = get_full_path(path=args.hatchet)
        logger.info("Converting allele-specific segment copy values from HATCHet format to RCK")
        if args.clone_ids is None:
            # BUGFIX: the original interpolated the argparse parser object
            # (`hatchet_parser`) into this message instead of the input file path.
            logger.debug("Clone ids were not provided, extracting all clone ids from {file}".format(file=hatchet_full_path))
            clone_ids = hatchet_get_clone_ids_from_file(file_name=hatchet_full_path, sample_name=args.sample_name, separator=args.separator, min_usage=args.min_usage)
        else:
            clone_ids = sorted(set(args.clone_ids.split(",")))
        logger.debug("Clone ids were identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        with open(hatchet_full_path) as source:
            logger.info("Reading allele-specific segment copy number values from {file}".format(file=hatchet_full_path))
            segments, scnt = get_scnt_from_hatchet_source(source=source, sample_name=args.sample_name, clone_ids=clone_ids, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "remixt":
        logger.info("Converting allele-specific segment copy values from ReMixT format to RCK")
        clone_ids = args.clone_ids.split(",")
        logger.debug("Clone ids were identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        logger.info("Reading allele-specific segment copy number values from {file}".format(file=args.remixt))
        segments, scnt = get_scnt_from_remixt_source(source=args.remixt, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, separator=args.separator, clone_ids=clone_ids)
    elif args.command == "ginkgo":
        logger.info("Converting *haploid* segments copy values from Ginkgo format to RCK")
        logger.info("Reading *haploid* segments copy values from {file}".format(file=args.ginkgo))
        segments, scnt = get_scnt_from_ginkgo_source(source=args.ginkgo, sample_name=args.sample_name, dummy_clone=args.dummy_clone_name,
                                                     separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing *haploid* segments copy number values in RCK format to {file}".format(file=args.output))
        # BUGFIX: the original passed set(args.dummy_clone_name), which splits the
        # clone name into its individual characters (e.g. "10" -> {"1", "0"});
        # a single-element collection holding the full name is intended.
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids={args.dummy_clone_name}, separator=args.separator)
    elif args.command == "gff":
        logger.info("Converting segments data from GFF format to RCK")
        logger.info("Reading segments from {file}".format(file=args.gff))
        chr_mappings = None
        if args.chr_mapping_file is not None:
            # mapping file: tab-separated "source-chr<TAB>target-chr" lines
            chr_mappings = {}
            logger.info("Reading chromosome mapping data from {file}".format(file=args.chr_mapping_file))
            for line in args.chr_mapping_file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines in the mapping file
                data = line.split("\t")
                chr_mappings[data[0]] = data[1]
        segments = get_segments_from_gff_file(file_name=args.gff, chr_strip=args.strip_chr,
                                              chr_mapping=chr_mappings, chr_mapping_missing_strategy=args.chr_mapping_missing_strategy)
        logger.info("Writing segments in RCK format to {file}".format(file=args.output))
        write_segments_to_destination(destination=args.output, segments=segments)
    logger.info("Success!")


if __name__ == "__main__":
    main()
class CloneCollectionCNDistanceInstance(object):
    """A pairing of clone-id tuples from two tensors plus the index -> clone-id
    mappings used when computing the distance for that pairing."""

    def __init__(self, instances1, instances2, mapping1, mapping2):
        self.instances1 = instances1
        self.instances2 = instances2
        self.mapping1 = mapping1
        self.mapping2 = mapping2

    def __str__(self):
        left = ",".join(self.instances1)
        right = ",".join(self.instances2)
        return f'instance 1: ({left}); instance 2: ({right})'


def cn_distance(segments1, scnt1, segments2, scnt2, both_haplotype_specific=False):
    """Compute copy-number distances between two segment copy number tensors
    over every possible pairing of their clone ids.

    Both inputs are first refined onto a common set of segment boundaries so
    that clone-specific distances are computed over identical fragmentations.
    Returns a dict keyed by CloneCollectionCNDistanceInstance (one per clone-id
    pairing) with the clone-specific distances as values.

    NOTE(review): `both_haplotype_specific` is currently unused here — TODO
    confirm whether it is reserved for future use.
    """
    # gather every segment boundary position, per chromosome, from both inputs
    chr_positions = defaultdict(set)
    for segment in itertools.chain(segments1, segments2):
        chr_positions[segment.chromosome].add(segment.start_position)
        chr_positions[segment.chromosome].add(segment.end_position)
    # leftmost/rightmost observed position per chromosome, by coordinate
    outermost = {
        chr_name: {
            "start": min(positions, key=lambda p: p.coordinate),
            "end": max(positions, key=lambda p: p.coordinate),
        }
        for chr_name, positions in chr_positions.items()
    }
    segments1, scnt1, _ = refined_scnt(segments=segments1, scnt=scnt1, merge_fragments=False, fill_gaps=True, extend_outermost=True,
                                       outermost_positions=outermost, outermost_positions_margin=0)
    segments2, scnt2, _ = refined_scnt(segments=segments2, scnt=scnt2, merge_fragments=False, fill_gaps=True, extend_outermost=True,
                                       outermost_positions=outermost, outermost_positions_margin=0)
    # refine both tensors with the union of all boundaries as telomeres so the
    # two fragmentations line up segment-for-segment
    all_positions = set()
    for segment in itertools.chain(segments1, segments2):
        all_positions.add(segment.start_position)
        all_positions.add(segment.end_position)
    segments1, scnt1, _ = refined_scnt_with_adjacencies_and_telomeres(segments=segments1, scnt=scnt1, telomere_positions=all_positions)
    segments2, scnt2, _ = refined_scnt_with_adjacencies_and_telomeres(segments=segments2, scnt=scnt2, telomere_positions=all_positions)
    clone_ids1 = list(set(scnt1.keys()))
    clone_ids2 = list(set(scnt2.keys()))
    match_cnt = min(len(clone_ids1), len(clone_ids2))
    result = {}
    # unordered choices on one side, ordered arrangements on the other, so
    # every distinct clone-to-clone assignment is evaluated exactly once
    for ids1 in itertools.combinations(clone_ids1, match_cnt):
        for ids2 in itertools.permutations(clone_ids2, match_cnt):
            mapping1 = {str(cnt): clone_id for cnt, clone_id in enumerate(ids1)}
            mapping2 = {str(cnt): clone_id for cnt, clone_id in enumerate(ids2)}
            tmp_scnt1 = {key: scnt1[value] for key, value in mapping1.items()}
            tmp_scnt2 = {key: scnt2[value] for key, value in mapping2.items()}
            distances = cn_distance_inter_scnt(tensor1=tmp_scnt1, tensor2=tmp_scnt2, segments=segments1)
            case = CloneCollectionCNDistanceInstance(instances1=ids1, instances2=ids2,
                                                     mapping1=mapping1, mapping2=mapping2)
            result[case] = distances
    return result
"console_scripts": [ 23 | "rck = rck.rck_run:main", 24 | "rck-scnt-x2rck = rck.utils.scn.rck_scnt_x2rck:main", 25 | "rck-scnt-process = rck.utils.scn.rck_scnt_process:main", 26 | "rck-scnt-rck2x = rck.utils.scn.rck_scnt_rck2x:main", 27 | "rck-scnt-stats = rck.utils.scn.rck_scnt_stats:main", 28 | "rck-scnb = rck.utils.scn.rck_scnb:main", 29 | "rck-adj-x2rck = rck.utils.adj.rck_adj_x2rck:main", 30 | "rck-adj-rck2x = rck.utils.adj.rck_adj_rck2x:main", 31 | "rck-adj-process = rck.utils.adj.rck_adj_process:main", 32 | "rck-adj-stats = rck.utils.adj.rck_adj_stats:main", 33 | "rck-adg-infer = rck.utils.adj.rck_adg_infer:main", 34 | "rck-adg-process = rck.utils.adj.rck_adg_process:main", 35 | "rck-adg-stats = rck.utils.adj.rck_adg_stats:main", 36 | "rck-input-refine = rck.utils.rck_input_refine:main", 37 | "rck-kar-graph = rck.utils.karyotype.rck_kar_graph:main", 38 | "rck-kar-stats = rck.utils.karyotype.rck_kar_stats:main", 39 | ] 40 | }, 41 | install_requires=[ 42 | "networkx>=2", 43 | "scipy", 44 | "pyvcf", 45 | "pysam", 46 | "sortedcontainers", 47 | "pandas", 48 | "gffutils", 49 | ] 50 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_graph.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from rck.core.graph import IntervalAdjacencyGraph 4 | from rck.core.structures import Position, Strand, Segment, Adjacency, AdjacencyType 5 | 6 | 7 | class TestIntervalAdjacencyGraph(unittest.TestCase): 8 | def setUp(self): 9 | self.p1 = Position(chromosome="1", coordinate=1, strand=Strand.REVERSE) 10 | self.p2 = Position(chromosome="1", coordinate=2, strand=Strand.FORWARD) 11 | self.p3 = 
class TestIntervalAdjacencyGraph(unittest.TestCase):
    """Construction and consistency-check behavior of IntervalAdjacencyGraph."""

    def setUp(self):
        # three back-to-back segments on chromosome "1" ...
        self.p1 = Position(chromosome="1", coordinate=1, strand=Strand.REVERSE)
        self.p2 = Position(chromosome="1", coordinate=2, strand=Strand.FORWARD)
        self.p3 = Position(chromosome="1", coordinate=3, strand=Strand.REVERSE)
        self.p4 = Position(chromosome="1", coordinate=4, strand=Strand.FORWARD)
        self.p5 = Position(chromosome="1", coordinate=5, strand=Strand.REVERSE)
        self.p6 = Position(chromosome="1", coordinate=6, strand=Strand.FORWARD)

        # ... and three back-to-back segments on chromosome "2"
        self.p7 = Position(chromosome="2", coordinate=1, strand=Strand.REVERSE)
        self.p8 = Position(chromosome="2", coordinate=2, strand=Strand.FORWARD)
        self.p9 = Position(chromosome="2", coordinate=3, strand=Strand.REVERSE)
        self.p10 = Position(chromosome="2", coordinate=4, strand=Strand.FORWARD)
        self.p11 = Position(chromosome="2", coordinate=5, strand=Strand.REVERSE)
        self.p12 = Position(chromosome="2", coordinate=6, strand=Strand.FORWARD)

        self.s1 = Segment(start_position=self.p1, end_position=self.p2)
        self.s2 = Segment(start_position=self.p3, end_position=self.p4)
        self.s3 = Segment(start_position=self.p5, end_position=self.p6)

        self.s4 = Segment(start_position=self.p7, end_position=self.p8)
        self.s5 = Segment(start_position=self.p9, end_position=self.p10)
        self.s6 = Segment(start_position=self.p11, end_position=self.p12)

    def test_construction_no_adjacencies(self):
        # three segments -> six nodes, three segment edges, no adjacency edges
        segments = [self.s1, self.s2, self.s3]
        iag = IntervalAdjacencyGraph(segments=segments, adjacencies=[])
        nodes = list(iag.nodes(data=True))
        self.assertEqual(len(nodes), 6)
        self.assertEqual(len(list(iag.edges())), 3)
        segment_edges = list(iag.segment_edges(data=True))
        self.assertEqual(len(segment_edges), 3)
        self.assertEqual(len(list(iag.adjacency_edges(data=True))), 0)
        segments_edges_as_objects_on_edges = [e[2]["object"] for e in segment_edges]
        # every segment's endpoints become nodes and the segment itself is
        # attached as the "object" payload of its edge
        for s in segments:
            self.assertIn(s.start_position.idx, {n[0] for n in nodes})
            self.assertIn(s.end_position.idx, {n[0] for n in nodes})
            self.assertIn(s, segments_edges_as_objects_on_edges)

    def test_construction_only_ref_adjacencies(self):
        # two reference adjacencies chaining s1-s2-s3 add two adjacency edges
        segments = [self.s1, self.s2, self.s3]
        ra1 = Adjacency(position1=self.s1.end_position, position2=self.s2.start_position, adjacency_type=AdjacencyType.REFERENCE)
        ra2 = Adjacency(position1=self.s2.end_position, position2=self.s3.start_position, adjacency_type=AdjacencyType.REFERENCE)
        adjacencies = [ra1, ra2]
        iag = IntervalAdjacencyGraph(segments=segments, adjacencies=adjacencies)
        nodes = list(iag.nodes(data=True))
        self.assertEqual(len(nodes), 6)
        edges = list(iag.edges(data=True))
        segment_edges = list(iag.segment_edges(data=True))
        adjacency_edges = list(iag.adjacency_edges(data=True))
        self.assertEqual(len(edges), 5)
        self.assertEqual(len(segment_edges), 3)
        self.assertEqual(len(adjacency_edges), 2)
        for s in segments:
            self.assertIn(s.start_position.idx, {n[0] for n in nodes})
            self.assertIn(s.end_position.idx, {n[0] for n in nodes})
            self.assertIn(s, {e[2]["object"] for e in segment_edges})
        for a in adjacencies:
            self.assertIn(a, {e[2]["object"] for e in adjacency_edges})

    def test_construction_ref_and_nov_adjacencies(self):
        # mixed reference + novel adjacencies are partitioned by type
        segments = [self.s1, self.s2, self.s3]
        ra1 = Adjacency(position1=self.s1.end_position, position2=self.s2.start_position, adjacency_type=AdjacencyType.REFERENCE)
        ra2 = Adjacency(position1=self.s2.end_position, position2=self.s3.start_position, adjacency_type=AdjacencyType.REFERENCE)
        na1 = Adjacency(position1=self.s1.end_position, position2=self.s3.start_position, adjacency_type=AdjacencyType.NOVEL)
        adjacencies = [ra1, ra2, na1]
        iag = IntervalAdjacencyGraph(segments=segments, adjacencies=adjacencies)
        nodes = list(iag.nodes(data=True))
        edges = list(iag.edges(data=True))
        segment_edges = list(iag.segment_edges(data=True))
        adjacency_edges = list(iag.adjacency_edges(data=True))
        r_adjacency_edges = list(iag.ref_adjacency_edges(data=True))
        n_adjacency_edges = list(iag.nov_adjacency_edges(data=True))
        # ref + nov adjacency edges together equal all adjacency edges
        self.assertSetEqual({(e[0], e[1]) for e in adjacency_edges},
                            {(e[0], e[1]) for e in r_adjacency_edges}.union({(e[0], e[1]) for e in n_adjacency_edges}))
        self.assertSetEqual({e[2]["object"] for e in segment_edges}, set(segments))
        self.assertEqual(len(nodes), 6)
        self.assertEqual(len(edges), 6)
        self.assertEqual(len(segment_edges), 3)
        self.assertEqual(len(adjacency_edges), 3)
        self.assertEqual(len(r_adjacency_edges), 2)
        self.assertEqual(len(n_adjacency_edges), 1)

    def test_construction_invalid_consistency_check_ref_from_different_chromosomes(self):
        # a REFERENCE adjacency cannot span two chromosomes
        invalid_ra = Adjacency(position1=self.s2.start_position, position2=self.s4.end_position, adjacency_type=AdjacencyType.REFERENCE)
        segments = [self.s1, self.s2, self.s3, self.s4]
        adjacencies = [invalid_ra]
        with self.assertRaises(ValueError):
            IntervalAdjacencyGraph.check_consistency(segments=segments, adjacencies=adjacencies)
        with self.assertRaises(ValueError):
            IntervalAdjacencyGraph(segments=segments, adjacencies=adjacencies)

    def test_construction_invalid_consistency_check_adjacency_with_position_not_from_segments(self):
        # s4's endpoint does not belong to any of [s1, s2, s3], so the graph
        # must reject the adjacency.
        # BUGFIX: the original passed `position2=self.s1` — a Segment, not a
        # Position; the intended second endpoint is a genuine position of s1.
        adjacency = Adjacency(position1=self.s4.end_position, position2=self.s1.start_position, adjacency_type=AdjacencyType.NOVEL)
        segments = [self.s1, self.s2, self.s3]
        adjacencies = [adjacency]
        with self.assertRaises(ValueError):
            IntervalAdjacencyGraph.check_consistency(segments=segments, adjacencies=adjacencies)
        with self.assertRaises(ValueError):
            IntervalAdjacencyGraph(segments=segments, adjacencies=adjacencies)
class StrandTestCase(unittest.TestCase):
    """String round-trips for the Strand enum."""

    def test_strand_str(self):
        self.assertEqual(str(Strand.REVERSE), "-")
        self.assertEqual(str(Strand.FORWARD), "+")

    def test_from_pm_string(self):
        self.assertEqual(Strand.from_pm_string(string="-"), Strand.REVERSE)
        self.assertEqual(Strand.from_pm_string(string="+"), Strand.FORWARD)
        with self.assertRaises(ValueError):
            Strand.from_pm_string(string="?")


class PositionTestCase(unittest.TestCase):
    """Equality and ordering semantics of Position objects."""

    def setUp(self):
        self.position1 = Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD)
        self.position2 = Position(chromosome="chr1", coordinate=1, strand=Strand.REVERSE)
        self.position3 = Position(chromosome="chr1", coordinate=2, strand=Strand.FORWARD)
        self.position4 = Position(chromosome="chr2", coordinate=1, strand=Strand.FORWARD)

    def test_empty_extra_creation(self):
        # a freshly created position carries an empty "extra" payload
        fresh = Position(chromosome="chrom1", coordinate=1, strand=Strand.FORWARD)
        self.assertDictEqual(fresh.extra, {})

    def test_eq(self):
        duplicate_of_1 = Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD)
        # a differing strand, coordinate, chromosome, or type breaks equality
        for other in (self.position2, self.position3, self.position4, "?"):
            self.assertNotEqual(self.position1, other)
        self.assertEqual(self.position1, duplicate_of_1)

    def test_lt(self):
        # reverse strand sorts before forward strand at the same coordinate
        self.assertLess(self.position2, self.position1)
        self.assertGreater(self.position1, self.position2)
        self.assertLess(self.position1, self.position3)
        chr5_position = Position(chromosome="chr5", coordinate=5, strand=Strand.FORWARD)
        chr10_position = Position(chromosome="chr10", coordinate=1, strand=Strand.REVERSE)
        self.assertLess(self.position1, self.position4)
        # chromosome names compare in "natural" order: chr5 < chr10
        self.assertLess(chr5_position, chr10_position)


class SegmentTestCase(unittest.TestCase):
    """Creation constraints and naming behavior of Segment objects."""

    def setUp(self):
        self.position1 = Position(chromosome="chr1", coordinate=1, strand=Strand.REVERSE)
        self.position2 = Position(chromosome="chr1", coordinate=2, strand=Strand.FORWARD)

    def test_creation(self):
        invalid_ends = [
            Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD),
            Position(chromosome="chr2", coordinate=2, strand=Strand.FORWARD),
            Position(chromosome="chr1", coordinate=0, strand=Strand.FORWARD),
        ]
        # each candidate end violates a creation constraint and must be rejected
        for endpoint in invalid_ends:
            with self.assertRaises(ValueError):
                Segment(start_position=self.position1, end_position=endpoint)
        # a well-formed pair constructs without error
        Segment(start_position=self.position1, end_position=self.position2)

    def test_idx(self):
        segment = Segment(start_position=self.position1, end_position=self.position2)
        # without an explicit idx the "chr:start-end" form is derived lazily
        self.assertIsNone(segment._idx)
        self.assertEqual(segment.idx, "chr1:1-2")
        segment = Segment(start_position=self.position1, end_position=self.position2, idx="idx")
        self.assertEqual(segment.idx, "idx")
        segment.idx = "idx2"
        self.assertEqual(segment.idx, "idx2")

    def test_str(self):
        segment = Segment(start_position=self.position1, end_position=self.position2)
        self.assertEqual(str(segment), "chr1:1-2")
        segment.idx = "idx"
        self.assertEqual(str(segment), "idx")

    def test_chromosome(self):
        segment = Segment(start_position=self.position1, end_position=self.position2)
        self.assertEqual(segment.chromosome, self.position1.chromosome)
        self.assertEqual(segment.chromosome, self.position2.chromosome)


class AdjacencyTestCase(unittest.TestCase):
    """Position ordering and naming behavior of Adjacency objects."""

    def setUp(self):
        self.position1 = Position(chromosome="chr1", coordinate=1, strand=Strand.REVERSE)
        self.position2 = Position(chromosome="chr1", coordinate=2, strand=Strand.FORWARD)

    def test_creation(self):
        # positions are stored in sorted order regardless of argument order
        for first, second in ((self.position2, self.position1), (self.position1, self.position2)):
            adjacency = Adjacency(position1=first, position2=second)
            self.assertEqual(adjacency.position1, self.position1)
            self.assertEqual(adjacency.position2, self.position2)

    def test_idx(self):
        adjacency = Adjacency(position1=self.position1, position2=self.position2, idx="idx")
        self.assertEqual(adjacency.idx, "idx")
        # clearing the idx falls back to the "[pos1]-[pos2]" derived form
        adjacency.idx = None
        derived = "[" + str(self.position1) + "]-[" + str(self.position2) + "]"
        self.assertEqual(adjacency.idx, derived)


class PositionClusterTestCase(unittest.TestCase):
    """Ordering behavior of PositionCluster on creation."""

    def setUp(self):
        self.position1 = Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD)
        self.position2 = Position(chromosome="chr1", coordinate=1, strand=Strand.REVERSE)
        self.position3 = Position(chromosome="chr1", coordinate=2, strand=Strand.FORWARD)
        self.position4 = Position(chromosome="chr2", coordinate=1, strand=Strand.FORWARD)

    def test_interning_sorting_on_creation(self):
        # positions are sorted as part of cluster construction
        cluster = PositionCluster(positions=[self.position3, self.position1, self.position2])
        self.assertListEqual(cluster.positions, [self.position2, self.position1, self.position3])


if __name__ == '__main__':
    unittest.main()