├── .gitignore
├── .travis.yml
├── .travis
└── install.sh
├── LICENSE
├── MANIFEST.in
├── README.md
├── conda_rck
└── meta.yaml
├── docs
├── Adjacencies.md
├── AdjacencyGroups.md
├── Installation.md
├── README.md
├── Segments.md
├── Usage.md
└── img
│ └── RCK_Overview_vertical.png
├── rck
├── __init__.py
├── core
│ ├── __init__.py
│ ├── graph.py
│ ├── ilp_gurobi.py
│ ├── io.py
│ ├── process.py
│ └── structures.py
├── rck_run.py
└── utils
│ ├── __init__.py
│ ├── adj
│ ├── __init__.py
│ ├── adjacency_group_inference.py
│ ├── adjacency_group_process.py
│ ├── adjacency_group_stats.py
│ ├── analysis.py
│ ├── convert.py
│ ├── long_reads.py
│ ├── main_chrs.txt
│ ├── process.py
│ ├── rck_adg_infer.py
│ ├── rck_adg_process.py
│ ├── rck_adg_stats.py
│ ├── rck_adj_long_reads.py
│ ├── rck_adj_process.py
│ ├── rck_adj_rck2x.py
│ ├── rck_adj_stats.py
│ ├── rck_adj_x2rck.py
│ └── stats.py
│ ├── karyotype
│ ├── __init__.py
│ ├── analysis.py
│ ├── rck_kar_graph.py
│ └── rck_kar_stats.py
│ ├── rck_input_refine.py
│ └── scn
│ ├── __init__.py
│ ├── convert.py
│ ├── process.py
│ ├── rck_scnb.py
│ ├── rck_scnt_process.py
│ ├── rck_scnt_rck2x.py
│ ├── rck_scnt_stats.py
│ ├── rck_scnt_x2rck.py
│ └── stats.py
├── setup.cfg
├── setup.py
└── tests
├── __init__.py
├── test_graph.py
└── test_structures.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | .static_storage/
56 | .media/
57 | local_settings.py
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | analysis/
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 | .idea/
109 | test_data/
110 |
111 | **/dev_tmp*
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - 3.6
4 | - 3.7
5 |
6 | os:
7 | - linux
8 | dist: xenial
9 | sudo: required
10 |
11 | matrix:
12 | include:
13 | - os: osx
14 | osx_image: xcode10.1
15 | language: generic
16 | env:
17 | - PYTHON=3.6
18 | - os: osx
19 | osx_image: xcode10.1
20 | language: generic
21 | env:
22 | - PYTHON=3.7
23 |
24 |
25 | before_install:
26 | - sudo chmod +x .travis/install.sh && sudo chown $USER .travis/install.sh && /bin/bash .travis/install.sh
27 |
28 | install:
29 | - hash -r
30 | - export PATH="$HOME/miniconda/bin:$PATH"
31 | # - export PATH="$HOME/miniconda/envs/test-environment/bin:$PATH"
32 | - echo $PATH
33 | - source activate test-environment
34 | - ls -l $HOME/miniconda/envs/test-environment/bin
35 | - travis_wait 30 pip install -e .
36 | - conda install -c gurobi gurobi
37 |
38 | script:
39 | - which python
40 | - python -c "import gurobi"
41 | - rck --help
42 | - rck-adj-x2rck --help
43 | - rck-adj-x2rck sniffles --help
44 | - rck-adj-x2rck lumpy --help
45 | - rck-adj-x2rck longranger --help
46 | - rck-adj-x2rck naibr --help
47 | - rck-adj-x2rck manta --help
48 | - rck-adj-x2rck grocsv --help
49 | - rck-adj-x2rck delly --help
50 | - rck-adj-x2rck pbsv --help
51 | - rck-adj-x2rck remixt --help
52 | - rck-adj-process --help
53 | - rck-adj-process cat --help
54 | - rck-adj-process reciprocal --help
55 | - rck-adj-process filter --help
56 | - rck-scnt-x2rck --help
57 | - rck-scnt-x2rck titan --help
58 | - rck-scnt-x2rck battenberg --help
59 | - rck-scnt-x2rck hatchet --help
60 | - rck-scnt-x2rck remixt --help
--------------------------------------------------------------------------------
/.travis/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | if [ "$TRAVIS_OS_NAME" = 'osx' ]; then
4 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O ~/miniconda.sh
5 | bash ~/miniconda.sh -b -p $HOME/miniconda
6 | export PATH="$HOME/miniconda/bin:$PATH"
7 | hash -r
8 | conda config --set always_yes yes --set changeps1 no
9 | conda update -q conda
10 | conda info -a
11 | conda create -n test-environment python=$PYTHON
12 | source activate test-environment
13 | fi
14 |
15 | if [ "$TRAVIS_OS_NAME" = 'linux' ]; then
16 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
17 | bash miniconda.sh -b -p $HOME/miniconda
18 | export PATH="$HOME/miniconda/bin:$PATH"
19 | hash -r
20 | conda config --set always_yes yes --set changeps1 no
21 | conda update -q conda
22 | # Useful for debugging any issues with conda
23 | conda info -a
24 | conda create -n test-environment python=$TRAVIS_PYTHON_VERSION
25 | source activate test-environment
26 | fi
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2010-2017 Raphael Research Group, Princeton University, Princeton, NJ, USA.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE README.md setup.py
2 | recursive-include rck/utils *
3 | recursive-include docs *
4 | global-exclude __pycache__
5 | global-exclude *.py[co]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RCK
Reconstruction of clone- and haplotype-specific Cancer Karyotypes
2 |
3 | [](https://raw.githubusercontent.com/hyperium/hyper/master/LICENSE)
4 | [](https://www.python.org/downloads/)
5 | [](https://travis-ci.com/aganezov/RCK)
6 |
7 |
8 | **RCK** - is a method for **R**econstruction of clone- and haplotype-specific **C**ancer **K**aryotypes from tumor mixtures, distributed both as a standalone software package and as a Python library under the MIT licence.
9 |
10 | RCK has been initially designed and developed by Sergey Aganezov in the group of prof. Ben Raphael at Princeton University ([group site](http://compbio.cs.brown.edu/)).
11 | Current development of RCK is continued by Sergey Aganezov in the group of prof. Michael Schatz at Johns Hopkins University ([group site](http://schatz-lab.org/)).
12 |
13 | The full description of the algorithm and its application on published cancer datasets are described in:
14 |
15 | [Sergey Aganezov and Benjamin J. Raphael, 2019](https://www.biorxiv.org/content/10.1101/560839v1)
16 |
17 | ### Contents:
18 | 1. [Algorithm overview](#algorithm-overview)
19 | 2. [Installation](#installation)
20 | 3. [Input preprocessing](#input-preprocessing)
21 | 1. [Novel Adjacencies](#novel-adjacencies)
22 | 2. [Segment copy numbers](#segment-copy-numbers)
23 | 4. [High-level RCK data processing recipe](#RCK-data-processing-recipe)
24 | 5. [Running RCK](#running-rck)
25 | 6. [Results](#results)
26 | 7. [Citation](#citation)
27 | 8. [Issues](#issues)
28 |
29 | ### Algorithm Overview
30 |
31 | 
32 |
33 | RCK infers clone- and haplotype-specific cancer genome karyotypes from tumor mixtures.
34 |
35 | RCK assumes that:
36 | * the reference human genome is diploid (except for sex chromosomes)
37 | * somatic evolution is propagated by large scale rearrangements (any type, quantity, etc) that respect the infinite sites assumption (i.e., no genomic location, on either copy of the homologous chromosome, participates in the double-stranded breakage, which are required for a rearrangement to happen, more than once throughout the entire somatic evolutionary history of the tumor);
38 | this can be relaxed for extremity-exclusivity constraint, if in the high confident input novel adjacencies some genomic location is shared.
39 | * no novel genomic locations (unless explicitly specified) can play a role of telomeres in the derived chromosomes
40 | * (approximate) clone- and allele-specific fragment/segment copy numbers are inferred by 3rd-party tools and are part of the input (see more in the [segments docs](docs/Segments.md))
41 | * (noisy) unlabeled (i.e., without haplotype labels) novel adjacencies (aka structural variants) are inferred by 3rd-party tools and are part of the input (see more in the [adjacencies docs](docs/Adjacencies.md))
42 |
43 | RCK uses a Diploid Interval Adjacency Graph to represent all possible segments and transitions between them (across all clones and the reference).
44 | RCK then solves an optimization problem of inferring clone- and haplotype-specific karyotypes (i.e., finding clone-specific edge multiplicity functions in the constructed DIAG) as an MILP program.
45 | Several constraints are taken into consideration (some of which are listed below) during the inference:
46 | * infinite sites compliance (across all clones in the tumor)
47 | * adjacencies grouping (is part of the input, optional)
48 | * false positive for novel adjacencies presence in reconstructed karyotypes
49 | * maximum divergence from input (approximate) allele-specific segment/fragment copy number profile
50 | * preservation of allele-separation across clones in tumor
51 | * telomere locations
52 |
53 | We note, that in contrast to some other cancer karyotype inference methods, RCK model has several advantages, that all work in a unifying computation framework and some/all of which differentiate RCK from other methods:
54 | * any level of sample heterogeneity (on the karyotype level): from homogeneous samples with a single derived clone, to tumor samples comprised of `n` derived genomes
55 | * support for any type of novel adjacencies signature (SV types), including copy-number-neutral ones, as well as the complicated ones arising from chromoplexy/chromothripsis events
56 | * model of diploid reference/non-haploid derived genomes
57 | * explicit control over telomere location during the inference
58 | * explicit fine-grain control over false positive in the novel adjacencies in the input and respectively their utilization in the inference
59 | * haplotype-specific (aka phased) inference both for segments and adjacencies across all clones in the tumor sample
60 | * support for (optional) 3rd-generation sequencing additional information
61 |
62 | ### Installation
63 |
64 | RCK shall work on latest macOS, and main Linux distribution.
65 | RCK is implemented in Python and designed to work with Python 3.7+.
66 | We highly recommend creating an independent python virtual environment for RCK usage.
67 |
68 | RCK itself can be installed in three different ways:
69 | * [conda](https://conda.io/docs/user-guide/overview.html) `conda install -c aganezov rck`
70 | * [pip (PyPI)](https://pip.pypa.io/en/stable/) `pip install rck`
71 | * source `python setup.py install`
72 |
73 | RCK requires an ILP solver installed on the system, as well as python bindings for it.
74 | Currently only Gurobi ILP solver is supported.
75 |
76 | For more details about installation please refer to the [installation documentation](docs/Installation.md).
77 |
78 | ### Input (preprocessing)
79 |
80 | The minimum input for RCK is comprised of two parts:
81 | 1. Unlabeled novel adjacencies (aka structural variations in the tumor sample)
82 | 2. Clone- and allele-specific segment copy numbers
83 |
84 | Additional input can contain:
85 | * Additional telomere locations
86 | * Segment-, clone-, and allele-specific boundaries (both lower and upper) on inferred copy numbers
87 | * Grouping information about clone-specific novel adjacencies (usually informed by 3rd-generation sequencing data), with individual False Positive rates per each group
88 | * False Positive rates for any subgroup of input novel adjacencies.
89 |
90 | RCK expects the input data to be in a (C/T)SV (Comma/Tab Separated Values) format.
91 | We provide a set of utility tools to convert input data obtained from a lot of state-of-the-art methods outputs into the RCK suitable format.
92 |
93 | #### Novel Adjacencies
94 | Obtaining unlabeled (i.e., without allele-information) novel adjacencies (aka Structural Variants) is not a part of the RCK workflow, as there exist a lot of tools for obtaining those.
95 | We provide a `rck-adj-x2rck` utility to convert output from output format of SV detection tools to the RCK suitable format.
96 | We currently support converting the output of the following 3rd-party SV detection tools:
97 | * *short-reads*
98 | * **Delly** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3436805/) | [code](https://github.com/dellytools/delly)]
99 | * **Manta** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/26647377) | [code](https://github.com/Illumina/manta)]
100 | * **Lumpy** [[paper](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2014-15-6-r84) | [code](https://github.com/arq5x/lumpy-sv)]
101 | * **BreakDancer** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138716/) | [code](https://github.com/genome/breakdancer)]
102 | * *linked/barcode reads*
103 | * **LongRanger** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4786454/) | [code](https://github.com/10XGenomics/longranger)]
104 | * **GROC-SVs** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/28714986) | [code](https://github.com/grocsvs/grocsvs)]
105 | * **NAIBR** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/29112732) | [code](https://github.com/raphael-group/NAIBR)]
106 | * *long reads*
107 | * **Sniffles** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5990442/) | [code](https://github.com/fritzsedlazeck/Sniffles)]
108 | * **PBSV** [paper | [code](https://github.com/PacificBiosciences/pbsv)]
109 | * *generic*
110 | * **SURVIVOR** [[paper](https://www.nature.com/articles/ncomms14061) | [code](https://github.com/fritzsedlazeck/SURVIVOR)]
111 |
112 | For more information about adjacencies, formats, converting, reciprocality, etc, please refer to [adjacencies documentation](docs/Adjacencies.md)
113 |
114 | #### Segment copy numbers
115 | Obtaining clone- and allele-specific segment copy numbers is not a part of the RCK workflow, as there exist a lot of tools for obtaining those.
116 | We provide a `rck-scnt-x2rck` utility to convert output from output format of other tools that infer clone- and allele-specific segment copy numbers to the RCK suitable format.
117 | We currently support converting the output of the following 3rd-party tools:
118 | * **HATCHet** [[paper](https://www.biorxiv.org/content/early/2018/12/17/496174) | [code](https://github.com/raphael-group/hatchet)] (*recommended* as it has fewest limitation w.r.t. tumor heterogeneity)
119 | * **TitanCNA** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/25060187) | [code](https://github.com/gavinha/TitanCNA)]
120 | * **Battenberg** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/22608083) | [code](https://github.com/cancerit/cgpBattenberg)]
121 | * **ReMixT** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/28750660) | [code](https://bitbucket.org/dranew/remixt)]
122 | * **Ginkgo** [[paper](https://www.nature.com/articles/nmeth.3578) | [code](https://github.com/robertaboukhalil/ginkgo)] (Attention! *haploid* mode only)
123 |
124 | ## RCK data processing recipe
125 | For the most cases the cancer sample of interest is initially represented via a set `cancer.sr.fastq` of reads obtained via a sequencer.
126 | Additionally, a sequenced reads `normal.sr.fastq` from a matching normal sample need to be available.
127 | Most often case of analysis consists of having a standard Illumina paired-end sequenced reads for both the tumor and the matching normal.
128 | Increasingly 3rd-generation sequencing technologies are being utilized in cancer analysis.
129 | Let us assume that there may optionally be a set `cancer.lr.fastq` of reads for the cancer sample in question obtained via 3rd-generation sequencing technology.
130 |
131 | 1. Align sequenced reads (with your aligner of choice) `cancer.sr.fastq` and `normal.sr.fastq` for cancer and a matching normal samples to obtain `cancer.sr.bam` and `normal.sr.bam`
132 | 1. Optionally align sequenced long reads `cancer.lr.fastq` to obtain `cancer.lr.bam`
133 | 2. Run a tool of your choosing on `cancer.sr.fastq` to obtain a novel adjacencies VCF file `cancer.sr.vcf`
134 | 1. Optionally infer novel adjacencies on long-read dataset obtaining `cancer.lr.vcf`
135 | 2. Merge short- and long-read novel adjacencies into a unified set `cancer.vcf` (we suggest using SURVIVOR tool [[code](https://github.com/fritzsedlazeck/SURVIVOR) | [paper](https://www.nature.com/articles/ncomms14061)] for this task)
136 | 3. Convert novel adjacencies from VCF file `cancer.vcf` to the `RCK` input format via `rck-adj-x2rck x cancer.vcf -o input.rck.adj.tsv`, where `x` stands for the novel adjacency inference tool.
137 | Please, see [adjacencies docs](docs/Adjacencies.md) for list of supported tools and more detailed instructions on conversion.
138 | 4. Run any of the supported tools (HATCHet, TitanCNA, Battenberg, ReMixT) of choice to infer large-scale clone- and allele-specific fragment copy numbers `CN.data` (generic name of the tool-specific result)
139 | 5. Convert tool-specific copy-number data `CN.data` into `RCK` format via `rck-scnt-x2rck x CN-data -o input.rck.scnt.tsv`, where `x` stands for copy number inference tool.
140 | Please, see [segments docs](docs/Segments.md) for link to specific methods, as well as details on how to run conversion.
141 | 6. Run `RCK`
142 |
143 | ### Running RCK
144 | We provide the `rck` tool to run the main RCK algorithm for clone- and haplotype specific cancer karyotypes reconstruction.
145 |
146 | With the minimum input for RCK the following is the example of running RCK:
147 |
148 | ````bash
149 | rck --scnt input.rck.scnt.tsv --adjacencies input.rck.adj.tsv
150 | ````
151 |
152 | where:
153 | * `--scnt` corresponds to the clone- and allele-specific segments copy number input
154 | * `--adjacencies` corresponds to the unlabeled novel adjacencies input
155 |
156 | Additionally one can specify the `--workdir` working directory, where the input, preprocessing, and the output will be stored.
157 | For more on the `rck` command usage please refer to [usage documentation](docs/Usage.md).
158 |
159 | ### Results
160 | Here is the description of the results produced by `rck` main method for cancer karyotype reconstruction.
161 | For results on segment/adjacency conversion/processing, please refer to respective [segment](docs/Segments.md)/[adjacency](docs/Adjacencies.md) documentations.
162 |
163 | RCK's cancer karyotype reconstruction is stored in the `output` subdirectory in the working directory (the `--workdir`).
164 | The following two files depict the inferred clone- and haplotype-specific karyotypes:
165 | * `rck.scnt.tsv` - clone- and haplotype-specific segments copy numbers;
166 | * `rck.acnt.tsv` - clone- and haplotype-specific adjacencies copy numbers;
167 |
168 | For information about the format of the inferred clone- and haplotype-specific copy numbers on segments/adjacencies please refer to [segment](docs/Segments.md)/[adjacency](docs/Adjacencies.md) documentations
169 |
170 | Results in the original [manuscript](https://www.biorxiv.org/content/10.1101/560839v1) can be found in the [dedicated Github repository](https://github.com/aganezov/RCK-pub-data).
171 |
172 | ### Citation
173 | When using RCK's cancer karyotype reconstruction algorithm or any of RCK's utilities, please cite the following paper:
174 |
175 | [Sergey Aganezov and Benjamin J. Raphael, 2019](https://www.biorxiv.org/content/10.1101/560839v1)
176 |
177 | ### Issues
178 | If you experience any issues with RCK installation, usage, or results or want to see RCK enhanced in any way, shape or form, please create an issue on RCK [issue tracker](https://github.com/aganezov/RCK/issues).
179 | Please, make sure to specify the RCK's, Python's, and Gurobi's versions in question, and, if possible, provide (minimized) data, on which the issue(s) occur(s).
180 |
181 | If you want to discuss any avenues of collaboration, custom RCK applications, etc, please contact Sergey Aganezov at *aganezov(at)jhu.edu* or *sergeyaganezovjr(at)gmail.com*
182 |
--------------------------------------------------------------------------------
/conda_rck/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "RCK" %}
2 | {% set version = "1.1.0" %}
3 |
4 | package:
5 | name: "{{ name|lower }}"
6 | version: "{{ version }}"
7 |
8 | source:
9 | url: https://files.pythonhosted.org/packages/fd/85/69c8ba6c6e80e9d9acbb40365601ee0632f8ce7c8b61e051249a8db70595/RCK-1.1.0.tar.gz
10 | sha256: 522e7965be2eb3ed7089b37e05bb795846a7f78ec658815ad229779cd4874ca5
11 |
12 | build:
13 | number: 0
14 | script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vvv "
15 |
16 | requirements:
17 | host:
18 | - networkx >=2
19 | - scipy
20 | - pyvcf
21 | - pysam
22 | - sortedcontainers
23 | - pandas
24 | - gurobi
25 | - pip
26 | - gffutils
27 | - python >=3.6
28 | run:
29 | - networkx >=2
30 | - scipy
31 | - pyvcf
32 | - pysam
33 | - sortedcontainers
34 | - pandas
35 | - gurobi
36 | - pip
37 | - gffutils
38 | - python >=3.6
39 |
40 | test:
41 | imports:
42 | - rck
43 | - rck.core
44 | - rck.utils
45 | - rck.utils.adj
46 | - rck.utils.scn
47 | commands:
48 | - rck --help # [not win]
49 | - rck-adj-x2rck --help # [not win]
50 | - rck-adj-x2rck sniffles --help # [not win]
51 | - rck-adj-x2rck lumpy --help # [not win]
52 | - rck-adj-x2rck longranger --help # [not win]
53 | - rck-adj-x2rck naibr --help # [not win]
54 | - rck-adj-x2rck manta --help # [not win]
55 | - rck-adj-x2rck grocsvs --help # [not win]
56 | - rck-adj-x2rck delly --help # [not win]
57 | - rck-adj-x2rck pbsv --help # [not win]
58 | - rck-adj-x2rck remixt --help # [not win]
59 | - rck-adj-process --help # [not win]
60 | - rck-adj-process cat --help # [not win]
61 | - rck-adj-process reciprocal --help # [not win]
62 | - rck-adj-process filter --help # [not win]
63 | - rck-scnt-x2rck --help # [not win]
64 | - rck-scnt-x2rck titan --help # [not win]
65 | - rck-scnt-x2rck battenberg --help # [not win]
66 | - rck-scnt-x2rck hatchet --help # [not win]
67 | - rck-scnt-x2rck remixt --help # [not win]
68 | - rck-scnt-process --help # [not win]
69 | - rck-scnt-process refine --help # [not win]
70 | - rck-scnt-process align --help # [not win]
71 | - rck-scnt-process distance --help # [not win]
72 | - rck-scnb --help # [not win]
73 | - rck-input-refine --help # [not win]
74 |
75 | about:
76 | home: https://github.com/aganezov/rck
77 |
78 | extra:
79 | recipe-maintainers:
80 | - aganezov
81 |
--------------------------------------------------------------------------------
/docs/Adjacencies.md:
--------------------------------------------------------------------------------
1 | # Adjacencies
2 |
3 | ### Contents:
4 | * [Adjacencies overview](#adjacencies-overview)
5 | * [RCK adjacency format](#rck-adjacency-format)
6 | * [inferred clone- and haplotype-specific adjacency copy numbers](#inferred-clone--and-haplotype-specific-adjacency-copy-numbers)
7 | * [Converting to RCK format from SV detection tools](#converting-to-rck-format-from-sv-detection-tools)
8 | * [Processing RCK adjacencies](#processing-rck-adjacencies)
9 |
10 | ### Adjacencies overview
11 | One of the key concepts in the RCK model is the notion of *adjacency*.
12 | Adjacency is a transition between segment's extremities.
13 | There are two kinds of adjacencies:
14 | * **reference** (present in the reference genome, or inherited by the derived cancer genome(s))
15 | * **novel** (present in derived genomes only).
16 |
17 | Every adjacency `{(q,x,+|-),(p,y,+|-)}` describes a transition from (right|left) side of loci at coordinate `x` on chromosome `q` to (right|left) side of the loci at coordinate `y` on the chromosome `p`.
18 | Reference adjacencies naturally have a form of `{(chr,x,+),(chr,x+1,-)}` (i.e., on the same chromosome, neighbouring positions, and respective extremities via strands orientation).
19 |
20 | We call two adjacencies reciprocal if some pair of their extremities resemble a reference adjacency.
21 | For example, adjacencies `{(1,123450,+),(1,4567890,+)}` and `{(1,123451,-),(1,876534,+)}` are reciprocal, because extremity `(1,123450,+)` from the first adjacency, and extremity `(1,123451,-)` from the second adjacency resemble a reference adjacency `{(1,123450,+),(1,123451,-)}`
22 |
23 | While sometimes novel adjacencies are classified as insertion, deletion, duplication, reversals (aka inversion), translocation, etc.
24 | This is usually done by looking at chromosomes, coordinates, and strands of involved extremities.
25 | For example insertion and deletion has the same *signature* (i.e., same chromosome, `+` strand on the leftmost extremity, and `-` on the rightmost extremity).
26 | Duplication has a signature of `-` strand followed by the `-` strand on the same chromosome.
27 | Reversal (event) usually involves two reciprocal novel adjacencies, with `+`,`+` strand signature on one, and `-`, `-` signature on another adjacency.
28 |
29 | While, indeed, aforementioned annotation correspond to cases, where respective rearrangement events would produce such novel adjacencies,
30 | it can also be the case that novel adjacencies that resemble signatures described above can be produced by more complex rearrangement events, such as *chromoplexy* and *chromothripsis*.
31 |
32 | ### RCK adjacency format
33 | RCK works with adjacencies in the following (T/C)SV (Tab/Comma Separated Values) text format:
34 |
35 | ````
36 | aid chr1 coord1 strand1 chr2 coord2 strand2 extra
37 | ````
38 | where every entry thus describes an adjacency `{(chr1, coord1, strand1), (chr2, coord2, strand2)}` with an id of `aid`.
39 | The `extra` field is designed to store `key=value` pairs of additional information about adjacencies, with entries being separated by `;`.
40 |
41 | There are several special extra fields, that RCK relies upon, when working:
42 | * `aid` -- copy of the `aid`. When reading adjacencies, id for the adjacency will be based on the column, not the extra field.
43 | * `cn` -- copy number values (refer to the following [subsection](#inferred-clone--and-haplotype-specific-adjacency-copy-numbers))
44 | * `at` -- adjacency type (either `N` for novel (default), or `R` for reference). By default all adjacencies are considered to be novel, unless the adjacency id starts with the lower-case `r`.
45 |
46 | #### inferred clone- and haplotype-specific adjacency copy numbers
47 | The results of the main RCK algorithm (via `rck` executable) contain the `rck.acnt.tsv` file, with entries following the RCK adjacencies format.
48 |
49 | Both novel and reference adjacencies are output in the result, and depending on the ``--o-acnt-mix-novel-and-reference`` novel and reference adjacencies are either going to be mixed together, or separated with novel adjacencies followed by the reference ones.
50 | While the adjacencies themselves are self-explanatory, the main important piece of information about them is the `cn` field in the `extra` column, that encodes the clone- and haplotype-specific copy number values.
51 |
52 | The `cn` value is a python/JSON dict with the following structure:
53 |
54 | ```
55 | {
56 | 'clone_id': {
57 | 'AA': int,
58 | 'AB': int,
59 | 'BA': int,
60 | 'BB': int
61 | },
62 | ...
63 | }
64 | ```
65 | where `clone_id` corresponds to the clone, for which haplotype-specific copy numbers are provided, and the
66 | `AA`, `AB`, `BA`, `BB` entries encode the copy number of the (haplotype) labeled versions of the adjacency (where the first position is labeled with the first haplotype letter, and the second position is labeled with the second haplotype letter).
67 |
68 | In the following example:
69 | ````
70 | aid chr1 coord1 strand1 chr2 coord2 strand2 extra
71 | id1 1 123450 + 1 123760 - cn={'c1':{'AA': 1, 'AB': 0, 'BA': 0, 'BB':0}, 'c2': {'AA': 2, 'AB': 0, 'BA': 0, 'BB':0}}
72 | ````
73 | where for the novel adjacency `{(1,123450,+),(1,123760,-)}` with id `id1` the following labeled adjacency `{(1,123450,+,A),(1,123760,-,A)}` has a copy number 1 in clone `c1` and copy number 2 in clone `c2`.
74 |
75 | ### Converting to RCK format from SV detection tools
76 | RCK installation adds `rck-adj-x2rck` adjacency-conversion executable tool to the `PATH` of your installation environment.
77 | With the help of `rck-adj-x2rck` one can convert (unlabeled) novel adjacency predictions from the following tools:
78 |
79 | * *short-reads*
80 | * **Delly** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3436805/) | [code](https://github.com/dellytools/delly)]
81 | * **Manta** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/26647377) | [code](https://github.com/Illumina/manta)]
82 | * **Lumpy** [[paper](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2014-15-6-r84) | [code](https://github.com/arq5x/lumpy-sv)]
83 | * **BreakDancer** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138716/) | [code](https://github.com/genome/breakdancer)]
84 | * *linked/barcode reads*
85 | * **LongRanger** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4786454/) | [code](https://github.com/10XGenomics/longranger)]
86 | * **GROC-SVs** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/28714986) | [code](https://github.com/grocsvs/grocsvs)]
87 | * **NAIBR** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/29112732) | [code](https://github.com/raphael-group/NAIBR)]
88 | * *long reads*
89 | * **Sniffles** [[paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5990442/) | [code](https://github.com/fritzsedlazeck/Sniffles)]
90 | * **PBSV** [paper | [code](https://github.com/PacificBiosciences/pbsv)]
91 |
92 | The input is converted to the RCK adjacency format (described [above](#rck-adjacency-format)).
93 | `rck-adj-x2rck` tries to retain as much as possible of extra information from the SV detection tools input during the conversion, and such information is stored in the `extra` column.
94 |
95 | For the help message of `rck-adj-x2rck` run:
96 | ````bash
97 | rck-adj-x2rck --help
98 | ````
99 |
100 | To get help in converting adjacency prediction from a specific tool `x` run:
101 | ````bash
102 | rck-adj-x2rck x --help
103 | ````
104 |
105 | The following optional command line arguments are shared for all of the input sources and can be beneficial to use:
106 | * `-o` | `--output` -- output file (default is `stdout`)
107 | * `--id-suffix` -- a suffix, that will be appended to every input adjacency id, when transforming to RCK format.
108 | This can be beneficial when working with several input sources of adjacencies, and one would want to differentiate based on the source
109 |
110 | An example of converting adjacency prediction by `Sniffles` tool to the RCK suitable format:
111 | ````bash
112 | rck-adj-x2rck sniffles SV_predictions.vcf --id-suffix sample-technology-sniffles -o sample-technology-sniffles.rck.adj.tsv
113 | ````
114 | Which will convert SV prediction in `SV_predictions.vcf` produced by Sniffles in sample `sample` that was sequenced with technology `technology` into the RCK formatted adjacency calls in the file `sample-technology-sniffles.rck.adj.tsv`.
115 |
116 | All converted adjacencies will have `id_sample-technology-sniffles`, where `id` is the VCF id provided by Sniffles.
117 |
118 | Note that the `--id-suffix` value here is provided as an example and is not mandatory to (i) be present at all (default value is empty string) (ii) be in the form of `sample-technology-method`, though we found it useful on several occasions.
119 | Depending on your needs a different suffix values may be more useful.
120 |
121 | ### Processing RCK adjacencies
122 | RCK installation adds `rck-adj-process` adjacency processing executable tool to `PATH` of your installation environment.
123 | For `rck-adj-process` the following commands are available:
124 | * `cat` -- combining adjacencies from 1+ inputs into a single one
125 | * `reciprocal` -- updating extremities of adjacencies in the input, so that pairs of extremities of distinct adjacencies that resemble reciprocality, but are not exactly 1 bp apart, are brought together.
126 | This option ran by default in the main `rck` executable, unless explicitly suppressed.
127 |
128 | Running `rck-adj-process command --help` provides one with the help on usage of each particular command.
129 |
130 |
--------------------------------------------------------------------------------
/docs/AdjacencyGroups.md:
--------------------------------------------------------------------------------
1 | # Adjacencies Groups
2 |
3 | ### Contents:
4 | * [Adjacencies Groups overview](#adjacencies-groups-overview)
5 | * [RCK Adjacencies Groups format](#rck-adjacencies-groups-format)
6 | * [Molecule Adjacencies Group](#molecule-adjacencies-group)
7 |
8 |
9 | ### Adjacencies Groups overview
10 | 3rd-generation sequencing experiments can produce groups of novel adjacencies for which we can infer additional, useful, information.
11 | We assume that all (novel) adjacencies are provided in the RCK input, but then we allow additional "grouping" information.
12 |
13 | ### RCK Adjacencies Groups format
14 | Adjacencies groups information is accepted into RCK workflow via the `--adjacency-groups` option that must point to a file with RCK formatted adjacencies groups.
15 | RCK works with adjacencies groups in the following (T/C)SV (Tab/Comma Separated Values) text format:
16 | ```
17 | gid aids extra
18 | ```
19 | where every entry describes a subset of the adjacencies, that are part of the RCK input, with the group id of `gid`, and adjacencies ids in the group listed in a comma-separated fashion in the `aids` column.
20 | Comma-separated values in the `aids` column must match entries in the `aid` column in the RCK input adjacencies file.
21 | The extra field is designed to store `key=value` pairs of additional information about each adjacency groups, with entries being separated by `;`.
22 |
23 | There are several special extra fields, that RCK relies upon, when working:
24 |
25 | * `agt` -- adjacencies group type (`M` for [molecule groups](#molecule-adjacencies-group)).
26 | * `fp` -- maximum false positive (fraction) value for the adjacencies group
27 |
28 | ### Molecule Adjacencies Group
29 | This type of adjacencies groups usually comes from the 3rd-generation sequencing experiments as either a group of adjacencies supported by a single long read (long read sequencing),
30 | or a predicted by short reads with the same barcode (10x Genomics sequencing data), or coming from a single cell experiment.
31 | One way or the other, all of the adjacencies in the molecule group come from a single clone: either the single (part) of the derived chromosome (long reads + barcoded cases), which is in a single cell, which represents a single clone,
32 | or from a real single cell (single cell sequencing source), which, again, represents a single clone.
33 |
34 | Every group in the input adjacencies groups file with the entry `agt=M;` in the `extra` column is treated as the molecule adjacency group.
35 |
36 | So for every molecule group `U` comprised of `|U|` input adjacencies with a False Positive values of `f`, RCK forces that in at least one of the reconstructed clones, there will be at least `(1-f)*|U|` labeled representations of novel adjacencies from `U` present.
37 | We note, that such a constraint does not imply that only in one clone labeled realizations of adjacencies from `U` can be present.
38 | There may be several clones, in which different subsets of adjacencies from `U` have their labeled realizations present, but the constraint guarantees that in at least one clone, there will be `(1-f)|U|` of them.
39 |
40 |
--------------------------------------------------------------------------------
/docs/Installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 |
4 | ### Contents:
5 | * [Virtual environment](#virtual-environment)
6 | * [conda](#conda-recommended)
7 | * [pip](#pip)
8 | * [source](#source)
9 | * [Gurobi](#gurobi)
10 | * [Python bindings for Gurobi](#python-bindings-for-gurobi)
11 | * [Executables](#executables)
12 |
13 | ### virtual environment
14 | We recommend that RCK is installed in the isolated virtual environments.
15 | Virtual environments can be created via `anaconda` and `python` (w.r.t. RCK, which is written in Python).
16 |
17 | To create a virtual environment (named `rck-env`) with anaconda, run the following command:
18 | ````bash
19 | conda create --name rck-env python=3.7
20 | ````
21 |
22 | To create a virtual environment (named `rck-env`) with python, run the following command:
23 | ````bash
24 | python -m venv rck-env
25 | ````
26 |
27 | If virtual environments are used (which, again, we recommend), we assume that the environment is activated.
28 |
29 | ### conda (recommended)
30 | Run the following conda command, to install RCK:
31 | ````bash
32 | conda install -c aganezov rck
33 | ````
34 |
35 | Installation via conda automatically takes care of Gurobi python bindings (refer to respective [subsection](#python-bindings-for-gurobi)), and everything shall work from this part (assuming that Gurobi is correctly installed and working).
36 |
37 | ### pip
38 |
39 | Run the following command, to install RCK:
40 | ````bash
41 | pip install rck
42 | ````
43 |
44 | **WARNING**: this installation does **not** take care of python bindings for Gurobi. Please, refer to the respective [subsection](#python-bindings-for-gurobi) on how that can be addressed.
45 |
46 | ### source
47 |
48 | First, download the source code. Example is shown below:
49 | ````bash
50 | git clone https://github.com/aganezov/RCK.git
51 | ````
52 |
53 | then run the following command from the RCK source folder:
54 | ````bash
55 | python setup.py install
56 | ````
57 |
58 | **WARNING**: this installation does **not** take care of python bindings for Gurobi. Please, refer to the respective [subsection](#python-bindings-for-gurobi) on how that can be addressed.
59 |
60 | ### Gurobi
61 | [Gurobi](http://www.gurobi.com/) solver can be obtained from the official web site and installation procedure is also described there.
62 | Gurobi requires a valid license to run.
63 | Licensing [information](http://www.gurobi.com/downloads/licenses/license-center) is provided on the official website, and is available for free for individual academic users.
64 | More complicated setups with multi-user and cluster licenses are also available (and described on the official Gurobi website).
65 | Contact your university IT support for more information about any complication with Gurobi licensing and setup.
66 |
67 | RCK expects that Gurobi is installed on the machine in question.
68 | RCK requires python bindings be installed (in the virtual environment, if you use it (which we recommend)).
69 | Refer to the next [subsection](#python-bindings-for-gurobi) for details on how this can be addressed.
70 |
71 | ##### Python bindings for Gurobi
72 | RCK requires python bindings be installed (in the virtual environment, if you use it (which we recommend)).
73 | The following [documentation](https://www.gurobi.com/documentation/8.1/quickstart_windows/the_gurobi_python_interfac.html) of the Gurobi website explains how an installation of such bindings can be done.
74 |
75 | Recommended way is via anaconda.
76 | Regardless of whether conda is used for the virtual environment, or just in general, the following command will install Python Gurobi bindings:
77 | ````bash
78 | conda install -c gurobi gurobi
79 | ````
80 |
81 | If not using conda, one needs to go to the Gurobi installation dir and locate the `setup.py` file and run the following command:
82 | ````bash
83 | python setup.py install
84 | ````
85 | Note that this way is deprecated by Gurobi.
86 |
87 |
88 | ### Executables
89 | Installation of RCK adds several executables to your `PATH` (if using a virtual environment, these executables will be accessible only when the environment is activated):
90 | * `rck` - main executable that runs the RCK inference algorithm
91 | * `rck-adj-x2rck` - conversion of SV prediction from several 3rd-party SV prediction tools (refer to respective [docs section](Adjacencies.md#converting-to-rck-format-from-sv-detection-tools) for more details)
92 | * `rck-adj-process` - various processing options for RCK formatted adjacencies
93 | * `rck-scnt-x2rck` - conversion of the clone- and allele-specific segment copy number predictions by 3rd-party tools (refer to respective [docs section](Segments.md#converting-to-rck-format-from-clone--and-allele-specific-inference-tools) for more details)
94 | * `rck-scnt-process` - various processing options for RCK formatted segments, copy number, boundaries, etc.
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | Documentation for RCK is comprised of several documents.
2 | Information addresses the basic concepts, provides some examples, and covers usage of tools in the RCK distribution.
3 |
4 | ### Contents:
5 | * [Adjacencies](Adjacencies.md)
6 | * [Adjacencies Groups](AdjacencyGroups.md)
7 | * [Segments](Segments.md)
8 | * [Installation](Installation.md)
9 | * [Usage](Usage.md)
--------------------------------------------------------------------------------
/docs/Segments.md:
--------------------------------------------------------------------------------
1 | # Segments
2 |
3 | ### Contents:
4 | * [Segments overview](#segments-overview)
5 | * [RCK segments format](#rck-segments-format)
6 | * [inferred clone- and haplotype-specific segment copy numbers](#inferred-clone--and-haplotype-specific-segment-copy-numbers)
7 |     * [input clone- and allele-specific segment copy numbers](#input-allele--and-clone-specific-segment-copy-numbers)
8 | * [Converting to RCK format from clone- and allele-specific inference tools](#converting-to-rck-format-from-clone--and-allele-specific-inference-tools)
9 | * [Processing RCK segments](#processing-rck-segments)
10 | * [Segments vs fragments](#segments-vs-fragments)
11 |
12 | ### Segments overview
13 |
14 | One of the key concepts in the RCK model is the notion of *segment*.
15 | A segment `(chr,start,end)` represents a continuous part of the reference genome's chromosome `chr` starting at `start` and ending at `end` (both inclusive).
16 | A segment `s=(chr,start,end)` naturally has two extremities `(chr,start,-)` and `(chr,end,+)` corresponding to tail `s^t` and head `s^h` of the segment `s`.
17 | In a diploid reference genome every segment `(chr,start,end)` (except for segment on sex chromosomes) has two haplotype-specific copies `(chr,start,end,A)` and `(chr,start,end,B)` respectively.
18 |
19 | Reference adjacencies correspond to pairs of adjacent extremities of consecutive segments.
20 | For example, two consecutive segments `a=(chr,10001,20000)` and `b=(chr,20001,30000)` determine a reference adjacency `{(chr,20000,+),(chr,20001,-)}`.
21 | Naturally for every chromosome that has two homologous copies, for every unlabeled reference adjacency `{(chr,20000,+),(chr,20001,-)}` there are two labeled reference adjacency counterparts:
22 | `{(chr,20000,+,A),(chr,20001,-,A)}` and `{(chr,20000,+,B),(chr,20001,-,B)}`.
23 |
24 | ### RCK segments format
25 | RCK works with segments in the following (T/C)SV (Tab/Comma Separated Values) text format (similar to that of bedpe):
26 | ````
27 | chr start end extra
28 | ````
29 | where every entry describes a segment `(chr,start,end)`.
30 | The `extra` field is designated to store `key=value` pairs of additional information about segments, with entries being separated by `;`.
31 |
32 | There are several special extra fields, that RCK relies upon, when working:
33 | * `cn` -- clone and allele/haplotype-specific copy number values of the segment (refer to the following [subsection](#inferred-clone--and-haplotype-specific-segment-copy-numbers))
34 | * `cnb` -- clone and allele/haplotype-specific copy number boundaries of the segment (refer to the respective [subsection](#copy-number-boundaries))
35 |
36 | #### inferred clone- and haplotype-specific segment copy numbers
37 | The result of the main RCK algorithm (via `rck` executable) contains the `rck.scnt.tsv` file, with entries following the RCK segments format.
38 | While the segments themselves are self-explanatory, the main important piece of information about them is the `cn` field in the `extra` column, that encodes clone- and haplotype-specific copy number values.
39 |
40 | The `cn` value is a python/JSON dict with the following structure:
41 | ```
42 | {
43 | 'clone_id' : {
44 | 'A': int,
45 |         'B': int
46 | },
47 | ...
48 | }
49 | ```
50 | where `clone_id` corresponds to the clone, for which haplotype-specific copy numbers are provided, with the `A` and `B` entries encoding the copy number of the multiplicity of the corresponding haplotype-specific segments.
51 |
52 | In the following example:
53 | ````
54 | chr start end extra
55 | 1	10000	20000	cn={'c1':{'A': 1, 'B': 2}, 'c2':{'A': 3, 'B': 2}}
56 | ````
57 | where for the segment `(1,10000,20000)` its `A` haplotype-specific version has `1` copy in clone `c1` and `3` copies in clone `c2`, and its `B` haplotype-specific version has `2` copies in clone `c1` and `2` copies on clone `c2`.
58 |
59 | #### input allele- and clone-specific segment copy numbers
60 | The input for the `RCK` method expects clone- and *allele*-specific (approximate) segment copy numbers.
61 | While most methods follow the notion of `major` and `minor` alleles (based on the segment copy numbers), we employ the same format of `cn` field, as was described in the previous [subsection](#inferred-clone--and-haplotype-specific-segment-copy-numbers)
62 | The only and major difference is, that while the RCK output is haplotype-specific (i.e., `A` and `B` are matching and the same for every segment), the input is allele-specific (i.e., `A` and `B` entries do not necessarily match and can be "flipped").
63 |
64 | ### Converting to RCK format from clone- and allele-specific inference tools
65 | RCK installation adds `rck-scnt-x2rck` segment copy number conversion executable tool to the `PATH` of your installation environment.
66 | With the help of `rck-scnt-x2rck` one can convert clone- and allele-specific prediction from the following tools:
67 | * **HATCHet** [[paper](https://www.biorxiv.org/content/early/2018/12/17/496174) | [code](https://github.com/raphael-group/hatchet)] (*recommended* as it has fewest limitation w.r.t. tumor heterogeneity)
68 | * **TitanCNA** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/25060187) | [code](https://github.com/gavinha/TitanCNA)]
69 | * **Battenberg** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/22608083) | [code](https://github.com/cancerit/cgpBattenberg)]
70 | * **ReMixT** [[paper](https://www.ncbi.nlm.nih.gov/pubmed/28750660) | [code](https://bitbucket.org/dranew/remixt)]
71 | * **Ginkgo** [[paper](https://www.nature.com/articles/nmeth.3578) | [code](https://github.com/robertaboukhalil/ginkgo)] (Attention! *haploid* mode only)
72 |
73 | For help message of `rck-scnt-x2rck` run
74 | ````bash
75 | rck-scnt-x2rck --help
76 | ````
77 |
78 | To get help in converting clone- and allele-specific predictions from a specific tool `x` run:
79 | ````bash
80 | rck-scnt-x2rck x --help
81 | ````
82 |
83 | ### Processing RCK segments
84 | RCK installation adds `rck-scnt-process` segment copy number processing executable tool to the `PATH` of your installation environment.
85 | For `rck-scnt-process` the following commands are available:
86 | * `align` -- aligning segments (and corresponding segment copy number tensors) from 1+ segment copy number tensors
87 | * `refine` -- filling the missing spans in entries, of merging consecutive entries that have the same clone- and allele/haplotype-specific copy numbers.
88 | This option ran by default in the main `rck` executable, unless explicitly suppressed.
89 |
90 | Running `rck-scnt-process command --help` provides one with the help on usage of each particular command.
91 |
92 |
93 | ### Segments vs fragments
94 | A segment and a fragment are of the same nature: a consecutive part of the reference chromosome.
95 | Input clone- and allele-specific copy number are usually inferred on rather large spans, which we treat as fragments.
96 | Such fragments are further fragmented into smaller, actual segments, based on the novel adjacencies, in such a way that novel adjacencies involve segments' extremities.
97 | With smaller segments, we still retain information about allele-separation based on fragments, which span smaller segments.
98 |
--------------------------------------------------------------------------------
/docs/Usage.md:
--------------------------------------------------------------------------------
1 | # Usage
2 |
3 | ### Contents:
4 | * [Input data](#input-data)
5 | * [Mandatory](#mandatory)
6 | * [Optional](#optional)
7 | * [Running RCK](#running-rck)
8 | * [preprocessing options](#preprocessing-options)
9 | * [running options](#running-options)
10 | * [Examples](#examples)
11 |
12 | ### Input data
13 |
14 | #### Mandatory
15 | RCK expects two mandatory pieces of the input:
16 | * `--scnt` - clone- and allele-specific segment copy number predictions, obtained from 3rd-party tools (see [segments docs](Segments.md#converting-to-rck-format-from-clone--and-allele-specific-inference-tools) for more details).
17 | * `--adjacencies` (unlabeled) novel adjacencies (aka SVs) obtained from 3rd-party tools (see [adjacencies docs](Adjacencies.md#converting-to-rck-format-from-sv-detection-tools) for more details).
18 |
19 | Both inputs must be in the RCK format (refer to [segments](Segments.md#rck-segments-format) and [adjacencies](Adjacencies.md#rck-adjacency-format) docs on the formatting issues.)
20 |
21 | #### Optional
22 |
23 | * `--adjacency-groups` - Adjacencies groups (see [adjacencies groups docs](AdjacencyGroups.md) for more details).
24 | * `--clone-ids` - a comma-separated list of clone ids (as present in the `scnt` input).
25 | * **advanced** `--telomere-positions` and `--telomere-segments` - Telomeres (either via exact locations, or via segments, for which all spanned extremities will be considered as possible additional telomeres)
26 | * **advanced** `--fragments` - Fragments that span segments. Only works properly if no preprocessing on input `scnt` is performed.
27 |
28 | ### Running RCK
29 |
30 | Running RCK inference algorithm is achieved through the `rck` executable (which is automatically added to your `PATH` with RCK installation).
31 |
32 | #### preprocessing options
33 | When running RCK, a number of input preprocessing options are available; they mirror functionality provided by the RCK utilities.
34 |
35 | All preprocessing can be turned off via the `--no-pre` flag, but this is an advanced option, use with caution.
36 |
37 | All the `--pre-scnt-xxx` flags refer to `--scnt` input clone- and allele-specific segment copy number values (in RCK format) preprocessing, that is similar to the `refine` command on the `rck-scnt-process` tool.
38 |
39 | All the `--pre-scnb-xxx` flags refer to creating copy number boundaries for the inferred copy number values.
40 | By default the strategy for obtaining the copy number boundaries is the uniform min-max one, where regardless of the input clone- and allele-specific copy number, the lower bound is set to the `--pre-scnb-uniform-min` value (default 0) and the upper is set to the `--pre-scnb-uniform-max` value (default 10).
41 | When working with genomes with known highly amplified segments, one can think about altering the default value for the `--pre-scnb-uniform-max`.
42 |
43 | Preprocessing of the input adjacencies concerns the reciprocality adjustments, and is similar to the `rck-adj-process reciprocal` command and the `--pre-adj-xxx` option mirror those of the `rck-adj-process reciprocal`.
44 | Adjacency preprocessing can be turned off by specifying the `--pre-no-adj`.
45 |
46 | #### running options
47 |
48 | One of the main arguments in running `rck` is the `--workdir` option, specifying the working directory in which the three following directories are created:
49 | * `raw_input` - contains exact copies of the input files
50 | * `input` - contains fully preprocessed data
51 | * `output` - contains inference results from the RCK algorithm
52 |
53 | Running options for `rck` start with the `--run-` prefix.
54 | Running RCK without actually executing the inference algorithm can be achieved by using the `--no-run` flag.
55 | This will prevent the actual gurobi based ilp solving and respective karyotype inference, but will preprocess (unless disabled) of all the input, and putting the preprocessed data into the `workdir/input` directory.
56 |
57 | The `--run-g-` are the flags corresponding to setting Gurobi related options:
58 | * `--run-g-mip-gap` - the gap between the best bound and best objective, after which the Gurobi solver will stop crunching numbers (default: 0.015, or 1.5% difference)
59 | * `--run-g-time-limit` - the maximum time (in seconds) for gurobi to run, before stopping execution and taking the current best objective as the result (default: 28800, aka 8 hours)
60 | * `--run-g-threads` - number of threads gurobi will use (default: 4)
61 | * `--run-g-allow-interrupted` - allow for gurobi run to be interrupted and still use the best obtained objective for the inference result
62 |
63 | Other flags:
64 | * `--run-nas-fp` - default False Positive upper bound (i.e., at most a `--run-nas-fp` fraction of input novel adjacencies can be *not* used in the inferred karyotypes). Default is 0.1
65 | * `--run-group-m-default-fp` - default False Positive values for *molecule* adjacencies groups (unless explicitly specified in with the `fp` value in the `extra` field). Default is 0.1
66 | * `--run-segment-length-attr` - a choice-based attribute that is used to get the segments length. Default is `length_100` which means that for every segment of length `l` a `ceil(l/100)` value is used in the inference minimization.
67 |
68 |
69 | ### Examples
70 |
71 | The following command runs RCK inference on the clone- and allele-specific segment copy number tensor (stored in the `input.rck.scnt.tsv`)
72 |
--------------------------------------------------------------------------------
/docs/img/RCK_Overview_vertical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/docs/img/RCK_Overview_vertical.png
--------------------------------------------------------------------------------
/rck/__init__.py:
--------------------------------------------------------------------------------
# RCK package version string.
version = "1.1.0"
2 |
--------------------------------------------------------------------------------
/rck/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/core/__init__.py
--------------------------------------------------------------------------------
/rck/core/process.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 | from rck.core.io import EXTERNAL_NA_ID
4 |
5 |
class ClusteringStrategy(Enum):
    """Enumeration of supported clustering strategies."""
    # Only strategy currently defined; presumably an iterative sliding-window
    # clustering approach (inferred from the name -- confirm against usage sites).
    IterativeSlider = 0
8 |
9 |
def positions_aligned(segments_positions, other_positions):
    """Check that every position in `other_positions` also occurs among `segments_positions`.

    Positions are compared via their non-haplotype stable ids, so haplotype
    labeling differences do not affect the outcome.

    :param segments_positions: iterable of positions (e.g. segment extremities)
    :param other_positions: iterable of positions to validate against the first collection
    :return: True if the ids of `other_positions` form a subset of the ids of
        `segments_positions`, False otherwise
    """
    available_ids = {position.stable_id_non_hap for position in segments_positions}
    return all(position.stable_id_non_hap in available_ids for position in other_positions)
15 |
16 |
def adj_groups_concur(adj_groups, adjacencies):
    """Verify that every adjacency id referenced by the groups is present among the adjacencies.

    Adjacencies are identified by their external id (the `EXTERNAL_NA_ID` entry
    of their `extra` dict) when available, falling back to their `idx` attribute.

    :param adj_groups: iterable of adjacency groups, each exposing `adjacencies_ids`
    :param adjacencies: iterable of adjacencies the groups are expected to refer to
    :return: True if all group-referenced ids are known, False otherwise
    """
    known_ids = {adjacency.extra.get(EXTERNAL_NA_ID, adjacency.idx) for adjacency in adjacencies}
    return all(aid in known_ids for group in adj_groups for aid in group.adjacencies_ids)
24 |
--------------------------------------------------------------------------------
/rck/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/utils/__init__.py
--------------------------------------------------------------------------------
/rck/utils/adj/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/utils/adj/__init__.py
--------------------------------------------------------------------------------
/rck/utils/adj/adjacency_group_inference.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections import defaultdict
3 |
4 | import pysam
5 |
6 | from rck.core.io import EXTERNAL_NA_ID, AG_LABELING
7 | from rck.core.structures import AdjacencyGroup, AdjacencyGroupType, Strand
8 |
9 |
def infer_sniffles_molecule_groups(adjacencies, extra_rnames_field="rnames", gid_suffix=""):
    """Build molecule adjacency groups from Sniffles supporting-read annotations.

    Every adjacency is attributed to each of its supporting reads (taken from the
    comma-separated `extra_rnames_field` entry in the adjacency's `extra` dict);
    a read supporting two or more distinct adjacencies yields one MOLECULE-type
    adjacency group.

    :param adjacencies: iterable of adjacencies with Sniffles-style read-name annotations
    :param extra_rnames_field: name of the `extra` entry holding comma-separated supporting read names
    :param gid_suffix: optional suffix appended (via "_") to every produced group id
    :return: list of `AdjacencyGroup` objects of type MOLECULE
    """
    adjacency_ids_by_read = defaultdict(set)
    for adjacency in adjacencies:
        supporting_reads = adjacency.extra.get(extra_rnames_field, "").split(",")
        # a missing/empty annotation splits into a single empty string -> nothing to attribute
        if len(supporting_reads) == 1 and not supporting_reads[0]:
            continue
        adjacency_id = adjacency.extra.get(EXTERNAL_NA_ID, adjacency.stable_id_non_phased)
        for read_name in supporting_reads:
            adjacency_ids_by_read[read_name].add(adjacency_id)
    groups = []
    for counter, (read_name, adjacency_ids) in enumerate(adjacency_ids_by_read.items()):
        # a read supporting fewer than two adjacencies carries no grouping information
        if len(adjacency_ids) < 2:
            continue
        group_id = str(counter)
        if gid_suffix:
            group_id += "_" + gid_suffix
        groups.append(AdjacencyGroup(gid=group_id, aids=list(adjacency_ids),
                                     group_type=AdjacencyGroupType.MOLECULE,
                                     extra={"source": read_name}))
    return groups
29 |
30 |
def infer_short_nas_labeling_groups(adjacencies, gid_suffix="", max_size=1000, allow_intermediate_same=False, allow_intermediate_tra=False,
                                    allow_inv_signatures=True):
    """Infer "labeling" adjacency groups from short intra-chromosomal novel adjacencies.

    For every adjacency whose two extremities lie on the same chromosome and are at
    most `max_size` apart, a LABELING-type group pairing the adjacency's two
    extremities (labeling indexes 0 and 1) is produced, provided the extremities are
    immediate neighbours among all adjacency extremities on that chromosome, or any
    intermediate extremities are permitted via the `allow_intermediate_*` flags.

    :param adjacencies: iterable of (novel) adjacencies to consider
    :param gid_suffix: optional suffix appended (via "_") to every produced group id
    :param max_size: maximum distance between adjacency extremities for it to count as "short"
    :param allow_intermediate_same: tolerate intermediate extremities belonging to intra-chromosomal adjacencies
    :param allow_intermediate_tra: tolerate intermediate extremities belonging to inter-chromosomal (translocation) adjacencies
    :param allow_inv_signatures: when False, skip adjacencies whose extremities share a strand (inversion-like signatures)
    :return: list of `AdjacencyGroup` objects of type LABELING
    """
    positions_to_adjacencies = defaultdict(list)
    positions_by_chrs = defaultdict(list)
    result = []
    # index every adjacency extremity by exact position and by chromosome
    for adj in adjacencies:
        p1, p2 = adj.position1, adj.position2
        positions_to_adjacencies[p1].append(adj)
        positions_to_adjacencies[p2].append(adj)
        positions_by_chrs[p1.chromosome].append(p1)
        positions_by_chrs[p2.chromosome].append(p2)
    positions_by_chr_to_index = {}
    # per-chromosome coordinate-sorted order of extremities, plus a position -> rank lookup
    for chr_name in list(positions_by_chrs.keys()):
        positions_by_chrs[chr_name] = sorted(positions_by_chrs[chr_name], key=lambda p: (p.coordinate, p.strand))
        positions_by_chr_to_index[chr_name] = {p: cnt for cnt, p in enumerate(positions_by_chrs[chr_name])}
    processed_adj_ids = set()
    cnt = 0
    for adj in adjacencies:
        # capture the id of the adjacency under consideration up front; it is used
        # both for the skip check here and for the bookkeeping at the bottom
        stable_aid = adj.stable_id_non_phased
        if stable_aid in processed_adj_ids:
            continue
        p1, p2 = adj.position1, adj.position2
        p1_chr, p2_chr = p1.chromosome, p2.chromosome
        if p1_chr != p2_chr:
            continue
        if not allow_inv_signatures and adj.position1.strand == adj.position2.strand:
            continue
        adj_size = adj.distance_non_hap
        if adj_size > max_size:
            continue
        positions = positions_by_chrs[p1_chr]
        p1_index, p2_index = positions_by_chr_to_index[p1_chr][p1], positions_by_chr_to_index[p1_chr][p2]
        aid = adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased)
        # both group entries refer to the same adjacency: its two extremity labelings
        aids = [aid, aid]
        gid = str(cnt)
        if len(gid_suffix) > 0:
            gid += "_" + gid_suffix
        extra = {AG_LABELING: [0, 1]}
        ag = AdjacencyGroup(gid=gid, aids=aids, group_type=AdjacencyGroupType.LABELING, extra=extra)
        if abs(p1_index - p2_index) == 1:
            # the two extremities are immediate neighbours on the chromosome
            result.append(ag)
            cnt += 1
        else:
            if not (allow_intermediate_same or allow_intermediate_tra):
                continue
            # NOTE(review): assumes p1_index < p2_index; if sorting reverses them the
            # intermediate range is empty and the group is dropped -- confirm intent.
            intermediate_indexes = list(range(p1_index + 1, p2_index))
            has_same = False
            has_tra = False
            allow = False
            for index in intermediate_indexes:
                position = positions[index]
                # BUGFIX: use a dedicated loop variable instead of rebinding `adj`;
                # the original inner `for adj in ...` clobbered the outer adjacency,
                # causing the wrong id to be recorded in `processed_adj_ids` below.
                for intermediate_adj in positions_to_adjacencies[position]:
                    has_same |= intermediate_adj.position1.chromosome == intermediate_adj.position2.chromosome
                    has_tra |= intermediate_adj.position1.chromosome != intermediate_adj.position2.chromosome
                allow |= has_same and allow_intermediate_same
                allow |= has_tra and allow_intermediate_tra
            if allow:
                result.append(ag)
                cnt += 1
        processed_adj_ids.add(stable_aid)
    return result
93 |
94 |
def get_mode_str(format="bam", input=False):
    """Compose a pysam-style file mode string.

    :param format: alignment file format; "bam" and "cram" add the "b"/"c"
        format suffix, any other value (e.g. "sam") yields a plain text mode
    :param input: True for a read mode ("r..."), False for a write mode ("w...")
    :return: mode string such as "rb", "wc", or "w"
    """
    format_suffixes = {"bam": "b", "cram": "c"}
    base_mode = "r" if input else "w"
    return base_mode + format_suffixes.get(format, "")
102 |
103 |
def get_labeling_groups(read_alignments, read_adjacencies, strategy="skip", delta=500, neighbour_selection="first"):
    """Build LABELING adjacency groups for a single read from its alignments and adjacencies.

    Every adjacency position is matched to the read's alignments whose reference span
    (padded by ``delta`` bp on both sides) covers it.  On each such alignment the scan
    looks for an opposite-strand neighbour position and groups the adjacencies incident
    to both positions into one LABELING group.

    :param read_alignments: pysam alignment records of one read; sorted here by
        (query_alignment_start, query_alignment_end)
    :param read_adjacencies: adjacencies supported by that read
    :param strategy: NOTE(review): currently unused in this function body
    :param delta: padding (bp) when matching a position to an alignment's reference span
    :param neighbour_selection: "first" stops at the first opposite-strand candidate in
        scan order; any other value keeps scanning and retains the last candidate found
    :return: list of ``AdjacencyGroup`` objects of type LABELING (gid is a running int;
        callers are expected to overwrite it)
    """
    result = []
    read_alignments = sorted(read_alignments, key=lambda e: (e.query_alignment_start, e.query_alignment_end))
    processed_positions = set()
    # NOTE(review): positions_by_chrs is populated and sorted below but never read
    # afterwards -- looks like dead code left from an earlier version.
    positions_by_chrs = defaultdict(list)
    positions_to_alignments = defaultdict(list)
    alignments_to_positions = defaultdict(list)
    positions_to_adjacencies = defaultdict(list)
    for adj in read_adjacencies:
        p1 = adj.position1
        p2 = adj.position2
        positions_by_chrs[p1.chromosome].append(p1)
        positions_by_chrs[p2.chromosome].append(p2)
        # link each adjacency position to every alignment whose padded reference
        # span covers it (cross-mapping used for neighbour lookups below)
        for alignment in read_alignments:
            for p in [p1, p2]:
                if p.chromosome == alignment.reference_name and \
                        (alignment.reference_start - delta <= p.coordinate <= alignment.reference_end + delta):
                    positions_to_alignments[p].append(alignment)
                    alignments_to_positions[alignment].append(p)
        positions_to_adjacencies[p1].append(adj)
        positions_to_adjacencies[p2].append(adj)
    # positions on each alignment are ordered by reference coordinate (ties by strand)
    for alignment in alignments_to_positions.keys():
        alignments_to_positions[alignment] = sorted(alignments_to_positions[alignment], key=lambda p: (p.coordinate, p.strand))
    for chr_name in list(positions_by_chrs.keys()):
        positions_by_chrs[chr_name] = sorted(positions_by_chrs[chr_name], key=lambda p: (p.coordinate, p.strand))
    cnt = 0
    for adj in read_adjacencies:
        aid = adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased)
        for p in [adj.position1, adj.position2]:
            # each position contributes to at most one group
            if p in processed_positions:
                continue
            processed_positions.add(p)
            alignments = positions_to_alignments[p]
            for alignment in alignments:
                positions_on_alignment = alignments_to_positions[alignment]
                p_index = positions_on_alignment.index(p)
                # candidate neighbours lie "behind" a forward-strand position and
                # "ahead" of a reverse-strand one (in reference-coordinate order)
                direction_neighbours = positions_on_alignment[:p_index] if p.strand == Strand.FORWARD else positions_on_alignment[p_index + 1:]
                if len(direction_neighbours) == 0:
                    continue
                # scan order puts the coordinate-closest candidate first for both strands
                ordered_neighbours = direction_neighbours if p.strand == Strand.REVERSE else direction_neighbours[::-1]
                neighbour = None
                for candidate in ordered_neighbours:
                    # only opposite-strand positions qualify as neighbours
                    if candidate.strand != p.strand:
                        neighbour = candidate
                        if neighbour_selection == "first":
                            break
                if neighbour is None:
                    continue
                # neighbour = direction_neighbours[-1] if p.strand == Strand.FORWARD else direction_neighbours[0]
                processed_positions.add(neighbour)
                neighbour_ids = [adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased) for adj in positions_to_adjacencies[neighbour]]
                # 0/1 labeling index records whether a position is the adjacency's first or second end
                adj_index = 0 if p == adj.position1 else 1
                neighbour_indexes = [0 if neighbour == adj.position1 else 1 for adj in positions_to_adjacencies[neighbour]]
                extra = {
                    "alignment": alignment.query_name,
                    AG_LABELING: [adj_index] + neighbour_indexes
                }
                ag = AdjacencyGroup(gid=cnt, aids=[aid] + neighbour_ids, group_type=AdjacencyGroupType.LABELING, extra=extra)
                result.append(ag)
                cnt += 1
                # at most one group per position: stop after the first alignment that yields one
                break
    return result
166 |
167 |
def infer_alignment_labeling_groups(adjacencies, alignment_file_name, alignment_format="bam",
                                    extra_rnames_field="rnames", gid_suffix="", inconsistent_traversal_strategy="skip"):
    """Infer LABELING adjacency groups by streaming a queryname-sorted alignment file.

    Adjacencies are first bucketed by supporting read name (comma-separated names in
    the ``extra_rnames_field`` extra entry).  The alignment file is then streamed; for
    every streak of records belonging to one read that supports at least one adjacency,
    ``get_labeling_groups`` produces the groups, which are re-numbered with a running
    counter (plus ``gid_suffix``).

    :raises ValueError: when the alignment header lacks "SO:queryname" -- the
        read-streak accumulation below requires queryname sorting
    """
    result = []
    reads_to_adjacencies_ids = defaultdict(set)
    adjacencies_by_aids = {}
    for adj in adjacencies:
        aid = adj.stable_id_non_phased
        read_names = adj.extra.get(extra_rnames_field, "").split(",")
        # an empty/missing rnames field splits into [""] -- skip such adjacencies
        if len(read_names) == 1 and len(read_names[0]) == 0:
            continue
        for read_name in read_names:
            reads_to_adjacencies_ids[read_name].add(aid)
        adjacencies_by_aids[aid] = adj
    mode = get_mode_str(format=alignment_format, input=True)
    current_read_name = None
    current_entries = []
    cnt = 0
    # NOTE(review): alignment_1k_counter is advanced every 1000 records but never
    # used otherwise -- presumably a leftover progress-reporting hook.
    alignment_1k_counter = 0
    with pysam.AlignmentFile(alignment_file_name, mode) as i_stream:
        if "SO:queryname" not in i_stream.text:
            raise ValueError("Input alignment file needs to be sorted by read (i.e., query) name. It is not.")
        for alignment_cnt, entry in enumerate(i_stream):
            if alignment_cnt / 1000 >= alignment_1k_counter:
                alignment_1k_counter += 1
            if entry.qname != current_read_name:
                # read name changed: process the finished streak if its read supports adjacencies
                if len(current_entries) > 0 and current_read_name in reads_to_adjacencies_ids:
                    # NOTE: rebinding the `adjacencies` parameter here is intentional-looking but shadows the input
                    adjacencies = [adjacencies_by_aids[aid] for aid in reads_to_adjacencies_ids[current_read_name]]
                    groups = get_labeling_groups(read_alignments=current_entries, read_adjacencies=adjacencies, strategy=inconsistent_traversal_strategy)
                    for group in groups:
                        gid = str(cnt)
                        if len(gid_suffix) > 0:
                            gid += "_" + gid_suffix
                        group.gid = gid
                        cnt += 1
                    result.extend(groups)
                current_read_name = entry.qname
                current_entries = [entry]
            else:
                current_entries.append(entry)
        # flush the last read streak (duplicates the block above)
        if len(current_entries) > 0 and current_read_name in reads_to_adjacencies_ids:
            adjacencies = [adjacencies_by_aids[aid] for aid in reads_to_adjacencies_ids[current_read_name]]
            groups = get_labeling_groups(read_alignments=current_entries, read_adjacencies=adjacencies, strategy=inconsistent_traversal_strategy)
            for group in groups:
                gid = str(cnt)
                if len(gid_suffix) > 0:
                    gid += "_" + gid_suffix
                group.gid = gid
                cnt += 1
            result.extend(groups)
    return result
218 |
219 |
def filter_alignment(adjacencies, alignment_file_name, output_alignment_file_name, alignment_format="bam", extra_rnames_field="rnames", output_alignment_format="bam"):
    """Copy to the output alignment file only the records of reads that support adjacencies.

    Supporting read names are taken from the comma-separated ``extra_rnames_field``
    extra entry of every adjacency; records whose query name is not among them are
    dropped.
    """
    supporting_read_names = set()
    for adjacency in adjacencies:
        names = adjacency.extra.get(extra_rnames_field, "").split(",")
        # an empty/missing rnames field splits into [""] -- contributes no names
        if len(names) == 1 and len(names[0]) == 0:
            continue
        supporting_read_names.update(names)
    input_mode = get_mode_str(format=alignment_format, input=True)
    output_mode = get_mode_str(format=output_alignment_format, input=False)
    with pysam.AlignmentFile(alignment_file_name, input_mode) as source:
        with pysam.AlignmentFile(output_alignment_file_name, output_mode, template=source) as target:
            for record in source:
                if record.qname in supporting_read_names:
                    target.write(record)
235 |
--------------------------------------------------------------------------------
/rck/utils/adj/adjacency_group_process.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from copy import deepcopy
3 |
4 | import networkx as nx
5 |
6 | from rck.core.structures import AdjacencyGroup, AdjacencyGroupType
7 | from rck.core.io import AG_LABELING, EXTERNAL_NA_ID
8 |
9 |
def refine_labeling_groups_old(adj_groups, gid_suffix="", retain_source_gids=False, iag=None):
    """Merge labeling adjacency groups that share ``(adjacency id, labeling index)`` entries.

    Consecutive entries of every group are chained as edges of a graph whose nodes are
    ``(aid, index)`` pairs; each connected component is collapsed into one merged
    LABELING group.  Legacy variant kept alongside ``refined_labeling_groups``.

    :param adj_groups: iterable of ``AdjacencyGroup`` objects (LABELING expected)
    :param gid_suffix: appended (after "_") to the running-integer gid of every result
    :param retain_source_gids: store sorted source gids under the "source_gids" extra key
    :param iag: unused here; kept for interface compatibility
    :return: list of merged ``AdjacencyGroup`` objects of type LABELING
    """
    graph = nx.Graph()
    entries_to_adj_groups = defaultdict(list)
    groups_by_ids = {}
    for group in adj_groups:
        groups_by_ids[group.gid] = group
        entries = [(aid, index) for aid, index in zip(group.adjacencies_ids, group.extra.get(AG_LABELING, []))]
        # singleton groups cannot link anything and are dropped
        if len(entries) < 2:
            continue
        for entry in entries:
            entries_to_adj_groups[entry].append(group)
        for l, r in zip(entries[:-1], entries[1:]):
            graph.add_edge(l, r)
    result = []
    cnt = 0
    # FIX: nx.connected_component_subgraphs was removed in networkx 2.4; only the
    # node sets are needed here, so iterate over connected components directly.
    for component in nx.connected_components(graph):
        entries = list(component)
        gids = set()
        for entry in entries:
            groups = entries_to_adj_groups[entry]
            for group in groups:
                gids.add(group.gid)
        groups = [groups_by_ids[gid] for gid in gids]
        aids = [entry[0] for entry in entries]
        indexes = [entry[1] for entry in entries]
        # collect the distinct non-empty source alignment names
        alignments = list(set(group.extra.get("alignment", "") for group in groups if len(group.extra.get("alignment", "")) > 0))
        extra = {
            AG_LABELING: indexes,
        }
        if len(alignments) > 0:
            extra["alignment"] = alignments
        if retain_source_gids:
            extra["source_gids"] = sorted(gids)
        gid = str(cnt)
        if len(gid_suffix) > 0:
            gid += "_" + gid_suffix
        ag = AdjacencyGroup(gid=gid, aids=aids, group_type=AdjacencyGroupType.LABELING, extra=extra)
        result.append(ag)
        cnt += 1
    return result
50 |
51 |
def refine_labeling_groups_without_iag(adj_groups, gid_suffix="", retain_source_gids=False):
    """Merge labeling adjacency groups sharing ``(adjacency id, labeling index)`` entries.

    NOTE(review): this duplicates ``refine_labeling_groups_old`` minus the unused
    ``iag`` parameter; consider consolidating.

    :param adj_groups: iterable of ``AdjacencyGroup`` objects (LABELING expected)
    :param gid_suffix: appended (after "_") to the running-integer gid of every result
    :param retain_source_gids: store sorted source gids under the "source_gids" extra key
    :return: list of merged ``AdjacencyGroup`` objects of type LABELING
    """
    graph = nx.Graph()
    entries_to_adj_groups = defaultdict(list)
    groups_by_ids = {}
    for group in adj_groups:
        groups_by_ids[group.gid] = group
        entries = [(aid, index) for aid, index in zip(group.adjacencies_ids, group.extra.get(AG_LABELING, []))]
        # singleton groups cannot link anything and are dropped
        if len(entries) < 2:
            continue
        for entry in entries:
            entries_to_adj_groups[entry].append(group)
        for l, r in zip(entries[:-1], entries[1:]):
            graph.add_edge(l, r)
    result = []
    cnt = 0
    # FIX: nx.connected_component_subgraphs was removed in networkx 2.4; only the
    # node sets are needed here, so iterate over connected components directly.
    for component in nx.connected_components(graph):
        entries = list(component)
        gids = set()
        for entry in entries:
            groups = entries_to_adj_groups[entry]
            for group in groups:
                gids.add(group.gid)
        groups = [groups_by_ids[gid] for gid in gids]
        aids = [entry[0] for entry in entries]
        indexes = [entry[1] for entry in entries]
        # collect the distinct non-empty source alignment names
        alignments = list(set(group.extra.get("alignment", "") for group in groups if len(group.extra.get("alignment", "")) > 0))
        extra = {
            AG_LABELING: indexes,
        }
        if len(alignments) > 0:
            extra["alignment"] = alignments
        if retain_source_gids:
            extra["source_gids"] = sorted(gids)
        gid = str(cnt)
        if len(gid_suffix) > 0:
            gid += "_" + gid_suffix
        ag = AdjacencyGroup(gid=gid, aids=aids, group_type=AdjacencyGroupType.LABELING, extra=extra)
        result.append(ag)
        cnt += 1
    return result
92 |
93 |
def refined_labeling_groups(adj_groups, iag=None, adjacencies=None, gid_suffix="", retain_source_gids=False):
    """Merge labeling adjacency groups into maximal ones, optionally via an IAG.

    Without ``iag``/``adjacencies`` the linking nodes are ``(aid, labeling index)``
    pairs, as in ``refine_labeling_groups_without_iag``.  When both ``iag`` and
    ``adjacencies`` are supplied, groups are linked through the actual genomic
    positions of their adjacencies, and positions additionally connected by the
    IAG's reference adjacency edges fall into the same component.

    :param adj_groups: iterable of ``AdjacencyGroup`` objects (LABELING expected)
    :param iag: interval adjacency graph exposing ``ref_adjacency_edges``; optional
    :param adjacencies: adjacencies referenced by the groups (keyed by external id);
        required for the IAG-based linking to take effect
    :param gid_suffix: appended (after "_") to the running-integer gid of every result
    :param retain_source_gids: store sorted source gids under the "source_gids" extra key
    :return: list of merged ``AdjacencyGroup`` objects of type LABELING
    """
    graph = nx.Graph()
    if adjacencies is not None:
        adjacencies_by_external_ids = {adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased): adj for adj in adjacencies}
    else:
        adjacencies_by_external_ids = {}
    adjacencies_by_positions = defaultdict(list)
    groups_by_ids = {}
    entries_to_adj_groups = defaultdict(list)
    for group in adj_groups:
        groups_by_ids[group.gid] = group
        internal_entries = [(aid, index) for aid, index in zip(group.adjacencies_ids, group.extra.get(AG_LABELING, []))]
        # singleton groups cannot link anything and are dropped
        if len(internal_entries) < 2:
            continue
        entries = []
        if iag is not None and adjacencies is not None:
            # IAG mode: link via genomic positions instead of (aid, index) pairs
            for aid, index in internal_entries:
                adjacency = adjacencies_by_external_ids[aid]
                position = adjacency.position1 if index == 0 else adjacency.position2
                adjacencies_by_positions[position].append(adjacency)
                entries.append(position)
        else:
            entries = internal_entries
        for entry in entries:
            entries_to_adj_groups[entry].append(group)
        for l, r in zip(entries[:-1], entries[1:]):
            graph.add_edge(l, r)
        # NOTE(review): this block sits OUTSIDE the pair loop above, so only the last
        # (l, r) pair of each group contributes its reference edges; it may have been
        # intended inside the loop -- behavior preserved, confirm intent.
        if iag is not None and adjacencies is not None:
            l_ref_edges = list(iag.ref_adjacency_edges(nbunch=l, data=False))
            r_ref_edges = list(iag.ref_adjacency_edges(nbunch=r, data=False))
            ref_edges = l_ref_edges + r_ref_edges
            for (u, v) in ref_edges:
                graph.add_edge(u, v)
    result = []
    cnt = 0
    # FIX: nx.connected_component_subgraphs was removed in networkx 2.4; only the
    # node sets are needed here, so iterate over connected components directly.
    for component in nx.connected_components(graph):
        internal_entries = list(component)
        gids = set()
        for entry in internal_entries:
            groups = entries_to_adj_groups[entry]
            for group in groups:
                gids.add(group.gid)
        entries = []
        if iag is not None and adjacencies is not None:
            # translate positions back to (aid, index) pairs; component nodes coming
            # only from IAG reference edges carry no adjacencies and are skipped
            for entry in internal_entries:
                if entry in adjacencies_by_positions:
                    for adjacency in adjacencies_by_positions[entry]:
                        aid = adjacency.extra.get(EXTERNAL_NA_ID, adjacency.stable_id_non_phased)
                        index = 0 if adjacency.position1 == entry else 1
                        entries.append((aid, index))
        else:
            entries = internal_entries
        groups = [groups_by_ids[gid] for gid in gids]
        aids = [entry[0] for entry in entries]
        indexes = [entry[1] for entry in entries]
        # collect the distinct non-empty source alignment names
        alignments = list(set(group.extra.get("alignment", "") for group in groups if len(group.extra.get("alignment", "")) > 0))
        extra = {
            AG_LABELING: indexes,
        }
        if len(alignments) > 0:
            extra["alignment"] = alignments
        if retain_source_gids:
            extra["source_gids"] = sorted(gids)
        gid = str(cnt)
        if len(gid_suffix) > 0:
            gid += "_" + gid_suffix
        ag = AdjacencyGroup(gid=gid, aids=aids, group_type=AdjacencyGroupType.LABELING, extra=extra)
        result.append(ag)
        cnt += 1
    return result
164 |
165 |
def projected_groups(groups, adjacencies, adjacencies_by_external_ids=None, gid_suffix=""):
    """Project adjacency groups onto a restricted adjacency set.

    Each group is deep-copied and reduced to the adjacency ids present in
    ``adjacencies_by_external_ids`` (built from ``adjacencies`` when not supplied).
    Groups left without adjacencies are dropped; LABELING groups need at least two
    remaining entries and keep only the matching labeling indexes.  Partially
    projected non-GENERAL groups get ``gid_suffix`` appended to their gid.
    """
    if adjacencies_by_external_ids is None:
        adjacencies_by_external_ids = {adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased): adj for adj in adjacencies}
    result = []
    for group in groups:
        keep_flags = [aid in adjacencies_by_external_ids for aid in group.adjacencies_ids]
        retained_aids = [aid for aid, keep in zip(group.adjacencies_ids, keep_flags) if keep]
        if len(retained_aids) == 0:
            continue
        if group.adjacencies is not None:
            retained_adjacencies = [adj for adj, keep in zip(group.adjacencies, keep_flags) if keep]
        else:
            retained_adjacencies = None
        new_group = deepcopy(group)
        new_group.adjacencies_ids = retained_aids
        new_group.adjacencies = retained_adjacencies
        if group.group_type == AdjacencyGroupType.LABELING:
            if len(retained_aids) < 2:
                continue
            new_group.extra[AG_LABELING] = [index for index, keep in zip(group.extra[AG_LABELING], keep_flags) if keep]
        if not all(keep_flags) and len(gid_suffix) > 0 and group.group_type != AdjacencyGroupType.GENERAL:
            new_group.gid += "_" + gid_suffix
        result.append(new_group)
    return result
191 |
--------------------------------------------------------------------------------
/rck/utils/adj/adjacency_group_stats.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | from rck.core.structures import AdjacencyGroup
4 |
5 |
def groups_size_tally(adjacency_groups):
    """Tally adjacency groups by size.

    :param adjacency_groups: iterable of ``AdjacencyGroup`` objects
    :return: defaultdict mapping ``len(group.adjacencies_ids)`` to the number of
        groups of that size
    """
    tally = defaultdict(int)
    for group in adjacency_groups:
        tally[len(group.adjacencies_ids)] += 1
    return tally
12 |
--------------------------------------------------------------------------------
/rck/utils/adj/analysis.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, List
2 |
3 | import networkx as nx
4 |
5 | from rck.core.structures import Adjacency, Position, Strand, AdjacencyType
6 |
7 |
class ComplexRearrSignature(object):
    """A complex rearrangement signature: novel adjacencies plus the reference
    adjacencies at their breakend positions (``k`` counts the latter)."""

    def __init__(self, adjacencies: Iterable[Adjacency], ref_locations: Iterable[Adjacency] = None):
        self.adjacencies: List[Adjacency] = list(adjacencies)
        if ref_locations is None:
            # no explicit reference locations: derive them from the breakends
            self.ref_adjacencies: List[Adjacency] = self.infer_ref_adjacencies(self.adjacencies)
        else:
            self.ref_adjacencies: List[Adjacency] = list(ref_locations)
        self.k = len(self.ref_adjacencies)

    @classmethod
    def infer_ref_adjacencies(cls, adjacencies: Iterable[Adjacency]) -> List[Adjacency]:
        """Build deduplicated REFERENCE adjacencies pairing every breakend
        position of *adjacencies* with its reciprocal position."""
        unique = {
            Adjacency(position1=position,
                      position2=Position.get_reciprocal(position=position),
                      adjacency_type=AdjacencyType.REFERENCE)
            for adjacency in adjacencies
            for position in (adjacency.position1, adjacency.position2)
        }
        return list(unique)
22 |
23 |
def get_complex_rearrangements_signatures(adjacencies: Iterable[Adjacency]) -> Iterable[ComplexRearrSignature]:
    """Group adjacencies into complex-rearrangement signatures.

    Adjacency breakends (haplotype-stripped, normalized to forward strand via their
    reciprocal) become nodes of a multigraph with one edge per adjacency; every
    connected component yields one ``ComplexRearrSignature``.
    """
    cr_graph = nx.MultiGraph()
    for adjacency in adjacencies:
        p1 = adjacency.position1.get_non_hap_copy()
        p2 = adjacency.position2.get_non_hap_copy()
        if p1.strand == Strand.REVERSE:
            p1 = Position.get_reciprocal(position=p1)
        if p2.strand == Strand.REVERSE:
            p2 = Position.get_reciprocal(position=p2)
        cr_graph.add_edge(p1, p2, adjacency=adjacency)
    result: List[ComplexRearrSignature] = []
    # FIX: nx.connected_component_subgraphs was removed in networkx 2.4; build each
    # component subgraph explicitly (subgraph views keep parallel multigraph edges).
    for component_nodes in nx.connected_components(cr_graph):
        cc = cr_graph.subgraph(component_nodes)
        result.append(ComplexRearrSignature(adjacencies=[edge[2]["adjacency"] for edge in cc.edges(data=True)]))
    return result
38 |
--------------------------------------------------------------------------------
/rck/utils/adj/long_reads.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from collections import defaultdict
3 | import networkx as nx
4 | import pysam
5 |
6 | from rck.core.io import read_adjacencies_from_source
7 |
8 | #####
9 |
10 |
def get_mode_str(format="bam", input=False):
    """Compose the pysam ``AlignmentFile`` open-mode string.

    "r"/"w" selects reading vs. writing; "bam" adds a "b" suffix and "cram" a "c"
    suffix, while any other format (e.g. "sam") adds nothing.
    """
    suffix = ""
    if format == "bam":
        suffix = "b"
    elif format == "cram":
        suffix = "c"
    return ("r" if input else "w") + suffix
18 |
19 | #####
20 |
21 |
def extract_long_reads():
    """Placeholder: long-read extraction is not implemented yet."""
    pass
24 |
25 |
def filter_alignment():
    """Placeholder: alignment filtering is not implemented in this module
    (a working implementation lives in ``adjacency_group_inference``)."""
    pass
28 |
29 |
def infer_labeling_constraints(rck_nas_source, alignment_file, i_alignment_format, lr_field, min_sv_cnt, logger=None):
    """Infer labeling constraints from NAs and a queryname-sorted long-read alignment file.

    NOTE(review): work in progress -- the read->NAs mapping is built and the alignment
    file is streamed in per-read streaks, but ``add_lr_labeling_constraints`` is a stub,
    the final read streak is not processed (the trailing ``if`` ends in ``pass``), and
    nothing is returned yet.

    :raises Exception: when the alignment header lacks "SO:queryname"
    """
    logger = logger or logging.getLogger('dummy')
    logger.info("RCK NAs file object {file_name}".format(file_name=str(rck_nas_source)))
    nas = read_adjacencies_from_source(source=rck_nas_source)
    # bucket novel adjacencies by the supporting read names listed in `lr_field`
    reads_to_nas = defaultdict(list)
    for na in nas:
        reads_str = na.extra.get(lr_field, "")
        reads = reads_str.split(",")
        for read in reads:
            if len(read) == 0:
                continue
            reads_to_nas[read].append(na)
    logger.debug("{reads_cnt} -- number of reads".format(reads_cnt=len(reads_to_nas)))
    # only reads spanning at least `min_sv_cnt` NAs can contribute constraints
    reads = {read for read in reads_to_nas if len(reads_to_nas[read]) >= min_sv_cnt}
    logger.debug("{reads_cnt} -- number of reads that each span {min_sv_cnt}+ NAs".format(reads_cnt=len(reads), min_sv_cnt=min_sv_cnt))
    # NOTE(review): location_graph is passed to the (stub) constraint builder; nothing reads it yet
    location_graph = nx.Graph()
    mode = get_mode_str(format=i_alignment_format, input=True)
    current_read_name = None
    current_entries = []
    with pysam.AlignmentFile(alignment_file, mode) as i_stream:
        if "SO:queryname" not in i_stream.text:
            logger.critical("Input alignment file {alignment_file} is not sorted by read (i.e., query) name".format(alignment_file=alignment_file))
            raise Exception("Input bam file needs to be sorted by read (i.e., query) name")
        for entry in i_stream:
            # queryname sorting guarantees records of one read arrive as a contiguous streak
            if entry.qname != current_read_name:
                if len(current_entries) > 0:
                    reads_novel_adjacencies = reads_to_nas[current_read_name]
                    add_lr_labeling_constraints(location_graph=location_graph, alignment_entries=current_entries, nas=reads_novel_adjacencies)
                current_read_name = entry.qname
                current_entries = [entry]
            else:
                current_entries.append(entry)
        # last reads streak has to be processed as well
        if current_read_name in reads_to_nas:
            pass
65 |
66 |
def add_lr_labeling_constraints(location_graph, alignment_entries, nas):
    # NOTE(review): unfinished -- `entries` and `internal` are computed but unused,
    # and no constraints are added to `location_graph` yet.
    entries = sorted(alignment_entries, key=lambda e: (e.query_alignment_start, e.query_alignment_end))
    internal = len(nas) != len(entries) - 1

    pass
72 |
73 |
def label_constraints_combining():
    """Placeholder: combining of labeling constraints is not implemented yet."""
    pass
76 |
77 |
--------------------------------------------------------------------------------
/rck/utils/adj/main_chrs.txt:
--------------------------------------------------------------------------------
1 | 1
2 | 2
3 | 3
4 | 4
5 | 5
6 | 6
7 | 7
8 | 8
9 | 9
10 | 10
11 | 11
12 | 12
13 | 13
14 | 14
15 | 15
16 | 16
17 | 17
18 | 18
19 | 19
20 | 20
21 | 21
22 | 22
23 | X
24 | Y
--------------------------------------------------------------------------------
/rck/utils/adj/rck_adg_infer.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import os
4 | import sys
5 | from collections import defaultdict
6 |
# Make the repository root importable when this script is executed directly:
# climb 3 directory levels (utils/adj -> utils -> rck package dir -> repo root)
# and append the result to sys.path.
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
12 |
13 | import rck
14 | from rck.core.io import get_standard_logger_from_args, read_adjacencies_from_source, write_adjacency_groups_to_destination, get_logging_cli_parser
15 | from rck.utils.adj.adjacency_group_inference import infer_sniffles_molecule_groups, infer_short_nas_labeling_groups, infer_alignment_labeling_groups, filter_alignment
16 | from rck.utils.adj.adjacency_group_process import refined_labeling_groups
17 |
18 |
def main():
    """CLI entry point for inferring adjacency groups (RCK-UTILS-ADJ-GROUPS-infer).

    Sub-commands:
        * ``sniffles-m`` -- molecule groups from Sniffles RNAMES support info;
        * ``short-l`` -- labeling groups from short novel adjacencies;
        * ``sniffles-l`` -- labeling groups from adjacencies and their
          reads-of-origin alignments;
        * ``filter-alignment`` -- retain only alignment records of reads that
          support the input adjacencies.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-GROUPS-infer")
    parser.add_argument("--version", action="version", version=rck.version)
    cli_logging_parser = get_logging_cli_parser()

    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ### molecule groups from Sniffles RNAMES support info
    sniffles_molecule_group_parser = subparsers.add_parser("sniffles-m", parents=[cli_logging_parser])
    sniffles_molecule_group_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    sniffles_molecule_group_parser.add_argument("--i-separator", default="\t")
    sniffles_molecule_group_parser.add_argument("--i-extra-separator", default=";")
    sniffles_molecule_group_parser.add_argument("--extra-rnames-field", default="rnames")
    sniffles_molecule_group_parser.add_argument("--fp", type=float, default=0.5)
    sniffles_molecule_group_parser.add_argument("--gid-suffix", dest="gid_suffix", default="sniffles-M")
    sniffles_molecule_group_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    sniffles_molecule_group_parser.add_argument("--o-separator", default="\t")
    sniffles_molecule_group_parser.add_argument("--o-aids-separator", default=",")
    sniffles_molecule_group_parser.add_argument("--o-extra-separator", default=";")
    ### labeling groups from short novel adjacencies
    short_nas_labeling_group_parser = subparsers.add_parser("short-l", parents=[cli_logging_parser])
    short_nas_labeling_group_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    short_nas_labeling_group_parser.add_argument("--i-separator", default="\t")
    short_nas_labeling_group_parser.add_argument("--i-extra-separator", default=";")
    short_nas_labeling_group_parser.add_argument("--max-size", type=int, default=50000000)
    short_nas_labeling_group_parser.add_argument("--allow-intermediate-same", action="store_true", dest="allow_intermediate_same")
    short_nas_labeling_group_parser.add_argument("--allow-intermediate-tra", action="store_true", dest="allow_intermediate_tra")
    short_nas_labeling_group_parser.add_argument("--no-inv-signatures", action="store_false", dest="allow_inv_signature")
    short_nas_labeling_group_parser.add_argument("--no-refine", action="store_false", dest="refine")
    short_nas_labeling_group_parser.add_argument("--fp", type=float, default=1)
    short_nas_labeling_group_parser.add_argument("--gid-suffix", dest="gid_suffix", default="short-nas-L")
    short_nas_labeling_group_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    short_nas_labeling_group_parser.add_argument("--o-separator", default="\t")
    short_nas_labeling_group_parser.add_argument("--o-aids-separator", default=",")
    short_nas_labeling_group_parser.add_argument("--o-extra-separator", default=";")
    ### labeling groups from adjacencies plus reads-of-origin alignments
    sniffles_labeling_group_parser = subparsers.add_parser("sniffles-l", parents=[cli_logging_parser])
    sniffles_labeling_group_parser.add_argument("--rck-adj", type=argparse.FileType("rt"), required=True)
    sniffles_labeling_group_parser.add_argument("--i-separator", default="\t")
    sniffles_labeling_group_parser.add_argument("--i-extra-separator", default=";")
    sniffles_labeling_group_parser.add_argument("--alignment", required=True)
    sniffles_labeling_group_parser.add_argument("--alignment-format", choices=["sam", "bam", "cram"], default="bam")
    sniffles_labeling_group_parser.add_argument("--extra-rnames-field", default="rnames")
    sniffles_labeling_group_parser.add_argument("--no-refine", action="store_false", dest="refine")
    sniffles_labeling_group_parser.add_argument("--fp", type=float, default=1)
    sniffles_labeling_group_parser.add_argument("--gid-suffix", default="sniffles-L")
    sniffles_labeling_group_parser.add_argument("-o", "--output", default=sys.stdout, type=argparse.FileType("wt"))
    sniffles_labeling_group_parser.add_argument("--o-separator", default="\t")
    sniffles_labeling_group_parser.add_argument("--o-aids-separator", default=",")
    sniffles_labeling_group_parser.add_argument("--o-extra-separator", default=";")
    ### alignment filtering by supporting read names
    filter_alignment_parser = subparsers.add_parser("filter-alignment", parents=[cli_logging_parser])
    filter_alignment_parser.add_argument("--rck-adj", type=argparse.FileType("rt"), required=True)
    filter_alignment_parser.add_argument("--i-separator", default="\t")
    filter_alignment_parser.add_argument("--i-extra-separator", default=";")
    filter_alignment_parser.add_argument("--extra-rnames-field", default="rnames")
    filter_alignment_parser.add_argument("--alignment", required=True)
    filter_alignment_parser.add_argument("--alignment-format", choices=["sam", "bam", "cram"], default="bam")
    filter_alignment_parser.add_argument("-o", "--output", required=True)
    filter_alignment_parser.add_argument("--output-format", choices=["sam", "bam", "cram"], default="bam")
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-ADJ-GROUPS-infer")
    if args.command == "sniffles-m":
        logger.info("Inferring molecule adjacency groups from adjacencies with Sniffles RNAMES support extra info.")
        logger.info("Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(source=args.rck_adj, separator=args.i_separator, extra_separator=args.i_extra_separator)
        logger.info("Inferring molecule adjacency groups from read adjacencies")
        adj_groups = infer_sniffles_molecule_groups(adjacencies=adjacencies, extra_rnames_field=args.extra_rnames_field, gid_suffix=args.gid_suffix)
        logger.info("Inferred {cnt} molecule adjacency groups".format(cnt=len(adj_groups)))
        logger.info("Writing inferred molecule adjacency groups to {file}".format(file=args.output))
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups,
                                              separator=args.o_separator, extra_separator=args.o_extra_separator, aids_separator=args.o_aids_separator,
                                              extra_fill="")
    elif args.command == "short-l":
        # typo fix: message previously read "... from adjacencies from adjacencies."
        logger.info("Inferring labeling adjacency groups from adjacencies.")
        logger.info("Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(source=args.rck_adj, separator=args.i_separator, extra_separator=args.i_extra_separator)
        logger.info("Inferring labeling adjacency groups from read adjacencies")
        adj_groups = infer_short_nas_labeling_groups(adjacencies=adjacencies, gid_suffix=args.gid_suffix, max_size=args.max_size,
                                                     allow_intermediate_same=args.allow_intermediate_same,
                                                     allow_intermediate_tra=args.allow_intermediate_tra, allow_inv_signatures=args.allow_inv_signature)
        logger.info("Inferred {cnt} labeling adjacency groups".format(cnt=len(adj_groups)))
        if args.refine:
            logger.info("Refining inferred labeling adjacency groups")
            adj_groups = refined_labeling_groups(adj_groups=adj_groups, gid_suffix=args.gid_suffix)
            logger.info("A total of {cnt} refined labeling adjacency groups remain".format(cnt=len(adj_groups)))
        # typo fix: "group s" -> "groups"
        logger.info("Writing inferred labeling adjacency groups to {file}".format(file=args.output))
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups,
                                              separator=args.o_separator, aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator,
                                              extra_fill="")
    elif args.command == "sniffles-l":
        logger.info("Inferring labeling adjacency groups from adjacencies, and their reads-of-origin alignments")
        logger.info("Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(source=args.rck_adj, extra_separator=args.i_extra_separator, separator=args.i_separator)
        logger.info("Inferring labeling adjacency groups from read adjacencies and their reads-of-origin alignments")
        adj_groups = infer_alignment_labeling_groups(adjacencies=adjacencies, alignment_file_name=args.alignment, alignment_format=args.alignment_format,
                                                     extra_rnames_field=args.extra_rnames_field, gid_suffix=args.gid_suffix)
        logger.info("Inferred {cnt} labeling adjacency groups. There can be many duplicates, refinement shall take care of it.".format(cnt=len(adj_groups)))
        if args.refine:
            logger.info("Refining inferred labeling adjacency groups")
            adj_groups = refined_labeling_groups(adj_groups=adj_groups, gid_suffix=args.gid_suffix)
            logger.info("A total of {cnt} refined labeling adjacency groups remain".format(cnt=len(adj_groups)))
        # typo fix: "group s" -> "groups"
        logger.info("Writing inferred labeling adjacency groups to {file}".format(file=args.output))
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups,
                                              separator=args.o_separator, aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator,
                                              extra_fill="")
    elif args.command == "filter-alignment":
        logger.info("Filtering input read alignment to retain only reads mentioned as supporting adjacencies from the input")
        logger.info("Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(source=args.rck_adj, extra_separator=args.i_extra_separator, separator=args.i_separator)
        # typo fix: "form file" -> "from file"
        logger.info("Filtering input alignment from file {file} and writing result in {o_file}".format(file=args.alignment, o_file=args.output))
        filter_alignment(adjacencies=adjacencies, alignment_file_name=args.alignment, alignment_format=args.alignment_format, extra_rnames_field=args.extra_rnames_field,
                         output_alignment_file_name=args.output, output_alignment_format=args.output_format)
    exit(0)
134 |
135 |
# Script entry point when executed directly (not imported).
if __name__ == "__main__":
    main()
138 |
--------------------------------------------------------------------------------
/rck/utils/adj/rck_adg_process.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import itertools
3 |
4 | import os
5 | import sys
6 |
# Make the repository root importable when this script is executed directly:
# climb 3 directory levels (utils/adj -> utils -> rck package dir -> repo root)
# and append the result to sys.path.
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
12 |
13 | import rck
14 | from rck.core.io import get_logging_cli_parser, get_standard_logger_from_args, stream_adjacency_groups_from_source, write_adjacency_groups_to_destination, \
15 | read_adjacency_groups_from_source, read_adjacencies_from_source
16 | from rck.core.structures import AdjacencyGroupType
17 | from rck.utils.adj.adjacency_group_process import refined_labeling_groups, projected_groups
18 |
19 |
def main():
    """CLI entry point: cat / refine / project operations over RCK adjacency groups.

    Reads RCK-formatted adjacency groups, processes them according to the chosen
    subcommand, and writes RCK-formatted adjacency groups to the output stream.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-GROUPS-process")
    parser.add_argument("--version", action="version", version=rck.version)
    cli_logging_parser = get_logging_cli_parser()

    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ### "cat": concatenate groups from one or more sources
    cat_parser = subparsers.add_parser("cat", parents=[cli_logging_parser])
    cat_parser.add_argument("rck_adg", type=argparse.FileType("rt"), nargs="+", default=[sys.stdin])
    cat_parser.add_argument("--i-separator", default="\t")
    cat_parser.add_argument("--i-extra-separator", default=";")
    cat_parser.add_argument("--i-aids-separator", default=",")
    cat_parser.add_argument("--enforce-unique-ids", action="store_true", dest="enforce_unique_ids")
    cat_parser.add_argument("--id-collision-strategy", choices=["skip", "error"], default="error")
    cat_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    cat_parser.add_argument("--o-separator", default="\t")
    cat_parser.add_argument("--o-aids-separator", default=",")
    cat_parser.add_argument("--o-extra-separator", default=";")
    ### "refine": refine labeling groups; molecule/general groups pass through unchanged
    refine_parser = subparsers.add_parser("refine", parents=[cli_logging_parser])
    refine_parser.add_argument("rck_adg", nargs="?", type=argparse.FileType("rt"), default=sys.stdin)
    refine_parser.add_argument("--i-separator", default="\t")
    refine_parser.add_argument("--i-extra-separator", default=";")
    refine_parser.add_argument("--i-aids-separator", default=",")
    refine_parser.add_argument("--gid-suffix", default="refined")
    refine_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    refine_parser.add_argument("--o-separator", default="\t")
    refine_parser.add_argument("--o-aids-separator", default=",")
    refine_parser.add_argument("--o-extra-separator", default=";")
    ### "project": project groups onto a provided set of adjacencies
    project_parser = subparsers.add_parser("project", parents=[cli_logging_parser])
    project_parser.add_argument("rck_adg", type=argparse.FileType("rt"), default=sys.stdin)
    project_parser.add_argument("--i-separator", default="\t")
    project_parser.add_argument("--i-extra-separator", default=";")
    project_parser.add_argument("--i-aids-separator", default=",")
    project_parser.add_argument("--adjacencies", required=True, type=argparse.FileType("rt"))
    project_parser.add_argument("--adj-separator", default="\t")
    project_parser.add_argument("--adj-extra-separator", default=";")
    project_parser.add_argument("--gid-suffix", default="projected")
    project_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    project_parser.add_argument("--o-separator", default="\t")
    project_parser.add_argument("--o-aids-separator", default=",")
    project_parser.add_argument("--o-extra-separator", default=";")
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-ADJ-GROUPS-process")
    if args.command == "cat":
        adj_groups = itertools.chain(*(stream_adjacency_groups_from_source(source=adj_group_source, separator=args.i_separator,
                                                                           aids_separator=args.i_aids_separator, extra_separator=args.i_extra_separator)
                                       for adj_group_source in args.rck_adg))
        if args.enforce_unique_ids:
            # Bug fix: this flag used to be a silent no-op (`pass`). It is now honored by
            # deduplicating on group id, mirroring the id-collision handling of the
            # adjacency-level "cat" command in rck_adj_process.
            def _unique_groups(groups):
                seen_gids = set()
                for group in groups:
                    gid = getattr(group, "gid", None)  # NOTE(review): assumes AdjacencyGroup exposes `gid` -- confirm
                    if gid is not None:
                        if gid in seen_gids:
                            logger.debug("Adjacency group id {gid} has been encountered more than once".format(gid=gid))
                            if args.id_collision_strategy == "skip":
                                continue
                            raise ValueError("More than one adjacency group with id {gid}".format(gid=gid))
                        seen_gids.add(gid)
                    yield group

            adj_groups = _unique_groups(adj_groups)
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups, separator=args.o_separator,
                                              aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator)
    elif args.command == "refine":
        logger.info("Refining input adjacency groups")
        logger.info("Reading adjacency groups from {file}".format(file=args.rck_adg))
        adg_groups = read_adjacency_groups_from_source(source=args.rck_adg, separator=args.i_separator,
                                                       extra_separator=args.i_extra_separator, aids_separator=args.i_aids_separator)
        logger.info("A total of {cnt} adjacency groups has been read".format(cnt=len(adg_groups)))
        molecule_groups = [ag for ag in adg_groups if ag.group_type == AdjacencyGroupType.MOLECULE]
        logger.info("A total of {cnt} molecule adjacency groups has been read".format(cnt=len(molecule_groups)))
        labeling_groups = [ag for ag in adg_groups if ag.group_type == AdjacencyGroupType.LABELING]
        logger.info("A total of {cnt} labeling adjacency groups has been read".format(cnt=len(labeling_groups)))
        general_groups = [ag for ag in adg_groups if ag.group_type == AdjacencyGroupType.GENERAL]
        logger.info("A total of {cnt} general adjacency groups has been read".format(cnt=len(general_groups)))
        logger.info("Refining molecule adjacency groups")
        refined_molecule_groups = molecule_groups  # molecule-group refinement not implemented yet; passed through unchanged
        logger.info("A total of {cnt} refined molecule adjacency groups remains".format(cnt=len(refined_molecule_groups)))
        logger.info("Refining labeling adjacency groups")
        r_labeling_groups = refined_labeling_groups(adj_groups=labeling_groups, gid_suffix="" if len(args.gid_suffix) == 0 else args.gid_suffix + "-L",
                                                    retain_source_gids=True)
        logger.info("A total of {cnt} refined labeling adjacency groups remains".format(cnt=len(r_labeling_groups)))
        logger.info("Refining general adjacency groups")
        refined_general_groups = general_groups  # general-group refinement not implemented yet; passed through unchanged
        logger.info("A total of {cnt} refined labeling general adjacency groups remains".format(cnt=len(refined_general_groups)))
        adj_groups = itertools.chain(refined_molecule_groups, r_labeling_groups, refined_general_groups)
        logger.info("Writing refined adjacency groups to {file}".format(file=args.output))
        # Bug fix: extra_separator was previously omitted here, so the refined output
        # silently ignored --o-extra-separator (unlike the "cat" and "project" branches).
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=adj_groups, separator=args.o_separator,
                                              aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator)
    elif args.command == "project":
        logger.info("Projecting input adjacency groups based on input adjacencies")
        logger.info("Reading adjacency groups from {file}".format(file=args.rck_adg))
        adg_groups = read_adjacency_groups_from_source(source=args.rck_adg, separator=args.i_separator, extra_separator=args.i_extra_separator,
                                                       aids_separator=args.i_aids_separator)
        logger.info("A total of {cnt} adjacency groups has been read".format(cnt=len(adg_groups)))
        adjacencies = read_adjacencies_from_source(source=args.adjacencies, separator=args.adj_separator, extra_separator=args.adj_extra_separator)
        p_groups = projected_groups(groups=adg_groups, adjacencies=adjacencies, gid_suffix=args.gid_suffix)
        logger.info("A total of {cnt} projected groups remained".format(cnt=len(p_groups)))
        logger.info("Writing projected adjacency groups to {file}".format(file=args.output))
        write_adjacency_groups_to_destination(destination=args.output, adjacency_groups=p_groups, separator=args.o_separator,
                                              aids_separator=args.o_aids_separator, extra_separator=args.o_extra_separator)


if __name__ == "__main__":
    main()
119 |
--------------------------------------------------------------------------------
/rck/utils/adj/rck_adg_stats.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 |
4 | import rck
5 | from rck.core.io import get_logging_cli_parser, read_adjacency_groups_from_source
6 | from rck.core.structures import AdjacencyGroupType
7 | from rck.utils.adj.adjacency_group_stats import groups_size_tally
8 |
9 |
def main():
    """CLI entry point: statistics over RCK adjacency groups.

    Currently provides a single subcommand, "size-l", which prints a CSV tally
    (size,count) of labeling-group sizes, aggregating everything below --min into
    one "<min" row and everything at or above --max into one ">=max" row.
    """
    cli_logging_parser = get_logging_cli_parser()
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADG-STATS")
    parser.add_argument('--version', action='version', version=rck.version)
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    #######
    labeling_group_size_parser = subparsers.add_parser("size-l", parents=[cli_logging_parser], help="Group size for RCK AdjGROUP in input file")
    labeling_group_size_parser.add_argument("rck_adg", type=argparse.FileType("rt"), nargs="?", default=sys.stdin)
    labeling_group_size_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    labeling_group_size_parser.add_argument("--no-allow-zero-values", action="store_false", dest="allow_zero_values")
    labeling_group_size_parser.add_argument("--min", type=int, default=-1)
    labeling_group_size_parser.add_argument("--max", type=int, default=-1)
    #######
    args = parser.parse_args()
    if args.command == "size-l":
        adj_groups = read_adjacency_groups_from_source(source=args.rck_adg)
        labeling_adg = [ag for ag in adj_groups if ag.group_type == AdjacencyGroupType.LABELING]
        tally = groups_size_tally(adjacency_groups=labeling_adg)
        if not tally:
            # Bug fix: min()/max() over an empty tally raised ValueError when the input
            # contained no labeling groups; an empty input now produces no output rows.
            return
        min_key, max_key = min(tally.keys()), max(tally.keys())
        # -1 means "not set"; otherwise the CLI bounds override the observed extremes
        if args.max != -1:
            max_key = args.max
        if args.min != -1:
            min_key = args.min
        # everything strictly below the lower bound is aggregated into a single "<min" row
        min_value = sum(cnt for key, cnt in tally.items() if key < min_key)
        print("<{min_key}".format(min_key=min_key), min_value, sep=",", file=args.output)
        for value in range(min_key, max_key):
            if value not in tally and not args.allow_zero_values:
                continue
            print(value, tally.get(value, 0), sep=",", file=args.output)
        # everything at or above the upper bound is aggregated into a single ">=max" row
        max_value = sum(cnt for key, cnt in tally.items() if key >= max_key)
        print(">={max_key}".format(max_key=max_key), max_value, sep=",", file=args.output)


if __name__ == "__main__":
    main()
53 |
--------------------------------------------------------------------------------
/rck/utils/adj/rck_adj_long_reads.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import sys
4 | from collections import defaultdict
5 | import pysam
6 | import networkx as nx
7 |
8 | from rck.core.io import read_adjacencies_from_source, get_logging_cli_parser, get_standard_logger_from_args
9 | from rck.utils.adj.long_reads import infer_labeling_constraints
10 |
11 |
def get_mode_str(format="bam", input=False):
    """Build a pysam open-mode string: 'r'/'w' base plus 'b' (BAM) or 'c' (CRAM).

    SAM (or any unrecognized format) gets no suffix, i.e. plain text mode.
    NOTE: `format` and `input` shadow builtins, but they are part of the keyword
    interface used by callers and are therefore kept as-is.
    """
    base = "r" if input else "w"
    suffix = {"bam": "b", "cram": "c"}.get(format, "")
    return base + suffix
19 |
20 |
def get_reads_set_from_source(source):
    """Collect read names from an iterable of lines into a set.

    Each line is whitespace-stripped; empty lines and '#'-prefixed comment lines
    are skipped.
    """
    stripped = (raw_line.strip() for raw_line in source)
    return {entry for entry in stripped if entry and not entry.startswith("#")}
29 |
30 |
def main():
    """CLI entry point for long-read utilities over RCK adjacencies.

    Subcommands:
      extract-lr       -- print names of reads supporting at least --min-sv-cnt SVs
      filter-alignment -- keep only alignment records whose read name is in --reads
      label-const-inf  -- infer labeling constraints from an alignment (output TODO)
      label-const-com  -- combine labeling constraints (not implemented)
    """
    parser = argparse.ArgumentParser()
    logging_parser = get_logging_cli_parser()
    ########
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ########
    lr_extraction_parser = subparsers.add_parser("extract-lr", parents=[logging_parser])
    lr_extraction_parser.add_argument("rck_nas", type=argparse.FileType("rt"), default=sys.stdin)
    lr_extraction_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    lr_extraction_parser.add_argument("--min-sv-cnt", type=int, default=2)
    lr_extraction_parser.add_argument("--lr-field", default="support_read_names")
    #########
    lr_alignment_filter_parser = subparsers.add_parser("filter-alignment", parents=[logging_parser])
    lr_alignment_filter_parser.add_argument("alignment", nargs="?", type=str, default="-")
    lr_alignment_filter_parser.add_argument("--i-alignment-format", type=str, choices=["bam", "sam", "cram"], default="bam")
    lr_alignment_filter_parser.add_argument("-r", "--reads", type=argparse.FileType("rt"), required=True)
    lr_alignment_filter_parser.add_argument("--r-separator", default="\t")
    lr_alignment_filter_parser.add_argument("--s-separator", default="\t")
    lr_alignment_filter_parser.add_argument("-o", "--output", type=str, default="-")
    lr_alignment_filter_parser.add_argument("--o-alignment-format", type=str, choices=["bam", "sam", "cram"], default="bam")
    #########
    labeling_constraint_inference_parser = subparsers.add_parser("label-const-inf", parents=[logging_parser])
    labeling_constraint_inference_parser.add_argument("alignment", type=str, default="-")
    labeling_constraint_inference_parser.add_argument("--i-alignment-format", type=str, choices=["bam", "sam", "cram"], default="bam")
    labeling_constraint_inference_parser.add_argument("--rck-nas", type=argparse.FileType("rt"), required=True)
    labeling_constraint_inference_parser.add_argument("--min-sv-cnt", type=int, default=2)
    labeling_constraint_inference_parser.add_argument("--lr-field", default="support_read_names")
    # Bug fix: the output stream was opened with FileType("rt") (read mode).
    labeling_constraint_inference_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #########
    labeling_constraint_combine_parser = subparsers.add_parser("label-const-com", parents=[logging_parser])
    # Bug fix: positional renamed "label-constr" -> "label_constr"; a hyphenated positional
    # creates an attribute that cannot be accessed as args.<name>. CLI usage is unaffected.
    labeling_constraint_combine_parser.add_argument("label_constr", type=argparse.FileType("rt"), nargs="+")
    # Bug fix: the output stream was opened with FileType("rt") (read mode).
    labeling_constraint_combine_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #########
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-LR")
    if args.command == "extract-lr":
        nas = read_adjacencies_from_source(source=args.rck_nas)
        # invert adjacency -> supporting reads into read -> supported adjacencies
        reads_to_nas = defaultdict(list)
        for na in nas:
            reads_str = na.extra.get(args.lr_field, "")
            reads = reads_str.split(",")
            for read in reads:
                if len(read) == 0:
                    continue
                reads_to_nas[read].append(na)
        extracted_read_names = {read for read in reads_to_nas if len(reads_to_nas[read]) >= args.min_sv_cnt}
        for read_name in extracted_read_names:
            print(read_name, file=args.output)
    elif args.command == "filter-alignment":
        reads = get_reads_set_from_source(source=args.reads)
        imode = get_mode_str(format=args.i_alignment_format, input=True)
        omode = get_mode_str(format=args.o_alignment_format, input=False)
        with pysam.AlignmentFile(args.alignment, imode) as i_stream:
            with pysam.AlignmentFile(args.output, omode, template=i_stream) as o_stream:
                for entry in i_stream:
                    if entry.qname in reads:
                        o_stream.write(entry)
    elif args.command == "label-const-inf":
        constraints = infer_labeling_constraints(rck_nas_source=args.rck_nas, alignment_file=args.alignment, i_alignment_format=args.i_alignment_format,
                                                 lr_field=args.lr_field, min_sv_cnt=args.min_sv_cnt, logger=logger)
        # TODO: inferred constraints are computed but never written to args.output yet
    elif args.command == "label-const-com":
        # Bug fix: this branch previously tested "label-constr-com", which does not match
        # the registered subcommand name, so it could never be reached.
        pass


if __name__ == "__main__":
    main()
98 |
--------------------------------------------------------------------------------
/rck/utils/adj/rck_adj_process.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import itertools
3 | import os
4 | import sys
5 | from collections import defaultdict
6 |
# Make this script runnable directly from a source checkout (without installation):
# walk `current_file_level` directories up from this file (rck/utils/adj -> repo root)
# and append the repo root to sys.path so the absolute `rck.*` imports below resolve.
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
12 |
13 | import rck
14 | from rck.core.io import read_adjacencies_from_source, write_adjacencies_to_destination, EXTERNAL_NA_ID, stream_adjacencies_from_source, get_logging_cli_parser, \
15 | get_standard_logger_from_args
16 | from rck.utils.adj.process import get_shared_nas_parser, Merger, iter_over_string_entries_from_source, get_extra_field_regexes, \
17 | filter_adjacencies_by_extra, \
18 | KEEP, REMOVE, refined_adjacencies_reciprocal, update_adjacencies
19 | from rck.utils.adj.convert import get_chrs_regions_string_lists_from_source, get_chrs_regions_string_list_from_file, parse_segment_chr_region
20 | from rck.utils.adj.process import filter_adjacencies_by_chromosomal_regions, filter_adjacencies_by_size, iter_haploid_adjacencies
21 |
22 |
def main():
    """CLI entry point: filter / cat / reciprocal / haploid / update operations over RCK adjacencies."""
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-process")
    parser.add_argument('--version', action='version', version=rck.version)
    ####
    # options shared by every subcommand (output stream + sorting toggle)
    shared_parser = get_shared_nas_parser()
    cli_logging_parser = get_logging_cli_parser()
    shared_parser.add_argument("--output", "-o", dest="rck_adj_file", type=argparse.FileType("wt"), default=sys.stdout)
    shared_parser.add_argument("--no-sort", action="store_false", dest="sort")
    ####
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ####
    filter_parser = subparsers.add_parser("filter", parents=[shared_parser, cli_logging_parser])
    filter_parser.add_argument("rck_adj", type=argparse.FileType("rt"), nargs="+", default=[sys.stdin])
    filter_parser.add_argument("--keep-extra-field-regex", action="append", default=None)
    filter_parser.add_argument("--keep-extra-field-regex-file", type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--keep-extra-field-missing-strategy", choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--keep-annotate", action="store_true", dest="annotate_retained")
    filter_parser.add_argument("--keep-annotate-s-extra-field", default=None, dest="annotate_seg_extra_field")
    filter_parser.add_argument("--keep-annotate-short-circ", action="store_true", dest="annotate_shirt_circ")
    filter_parser.add_argument("--keep-annotate-extra-prefix", dest="annotate_extra_prefix")
    filter_parser.add_argument("--remove-extra-field-regex", action="append", default=None)
    filter_parser.add_argument("--remove-extra-field-regex-file", type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--remove-extra-field-missing-strategy", choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--min-size", type=int, default=0)
    filter_parser.add_argument("--max-size", type=int, default=1000000000)
    filter_parser.add_argument("--no-allow-inter-chr", action="store_false", dest="allow_inter_chr")
    filter_parser.add_argument("--no-allow-intra-chr", action="store_false", dest="allow_intra_chr")
    filter_parser.add_argument("--size-extra-field", default="svlen")
    filter_parser.add_argument("--size-extra-field-no-abs", action="store_false", dest="size_extra_field_abs")
    filter_parser.add_argument("--size-extra-seq-field")
    ####
    cat_parser = subparsers.add_parser("cat", parents=[shared_parser, cli_logging_parser], help="Concatenate Adjacencies in input files (NOTE: different from \"merge\")")
    cat_parser.add_argument("rck_adj", type=argparse.FileType("rt"), nargs="+", default=[sys.stdin])
    cat_parser.add_argument("--enforce-unique-ids", action="store_true", dest="enforce_unique_ids")
    cat_parser.add_argument("--id-collision-strategy", choices=['skip', 'error'], default='error')
    ####
    reciprocal_parser = subparsers.add_parser("reciprocal", parents=[shared_parser, cli_logging_parser], help="ensure that reciprocal novel adjacencies are treated as such")
    reciprocal_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    reciprocal_parser.add_argument("--max-distance", type=int, default=50)
    ####
    haploid_parser = subparsers.add_parser("haploid", parents=[shared_parser, cli_logging_parser], help="collapse any info that is allele/haplotype-specific into a haploid mode")
    haploid_parser.add_argument("rck_adj", type=argparse.FileType("rt"), nargs="+", default=[sys.stdin])
    ####
    update_parser = subparsers.add_parser("update", parents=[shared_parser, cli_logging_parser],
                                          help="Updates adjacencies in the 'adj' with the info from --source based on aid matches. Outputs updated --target entries")
    update_parser.add_argument("rck_adj", type=argparse.FileType("rt"))
    update_parser.add_argument("--source", type=argparse.FileType("rt"), required=True)
    update_parser.add_argument("--exclude-extra-fields", default="")
    update_parser.add_argument("--include-extra-fields", default="")
    update_parser.add_argument("--no-include-missing", action="store_false", dest="include_missing")
    update_parser.add_argument("--no-coords-update", action="store_false", dest="coord_update")
    update_parser.add_argument("--no-coord1-update", action="store_false", dest="coord1_update")
    update_parser.add_argument("--no-coord2-update", action="store_false", dest="coord2_update")
    update_parser.add_argument("--no-strands-update", action="store_false", dest="strands_update")
    update_parser.add_argument("--no-strand1-update", action="store_false", dest="strand1_update")
    update_parser.add_argument("--no-strand2-update", action="store_false", dest="strand2_update")
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-ADK-process")
    processed_adjacencies = []
    # "--o-extra-fields": empty/","-valued -> no extras; "all" -> passed through verbatim;
    # otherwise a comma-separated list of field names
    if args.o_extra_fields is None or len(args.o_extra_fields) == 0 or args.o_extra_fields == ",":
        extra = None
    elif args.o_extra_fields != "all":
        extra = args.o_extra_fields.split(",")
    else:
        extra = args.o_extra_fields
    if args.command == "cat":
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(source=rck_adj_source) for rck_adj_source in args.rck_adj))
        if args.enforce_unique_ids:
            # Bug fix: the input stream used to be rebound to [] right before the loop
            # iterated it, so deduplication silently produced empty output. Collect the
            # deduplicated adjacencies into a separate list instead.
            processed_ids = set()
            unique_adjacencies = []
            for adj in adjacencies:
                aid = adj.extra.get(EXTERNAL_NA_ID, adj.idx)
                if aid in processed_ids:
                    logger.debug("Adjacency id {aid} has been encountered more than once".format(aid=aid))
                    if args.id_collision_strategy == "skip":
                        continue
                    elif args.id_collision_strategy == "error":
                        raise ValueError("More than one adjacency with id {aid}".format(aid=aid))
                unique_adjacencies.append(adj)
                processed_ids.add(aid)
            adjacencies = unique_adjacencies
        write_adjacencies_to_destination(destination=args.rck_adj_file, adjacencies=adjacencies, extra=extra, sort_adjacencies=args.sort)
        exit(0)
    elif args.command == "filter":
        logger.info("Filtering input adjacencies from following sources {sources}".format(sources=",".join(map(str, args.rck_adj))))
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(source=rck_adj_source) for rck_adj_source in args.rck_adj))
        # collect chromosome/region include & exclude specs from CLI lists and files
        include_chrs_regions_strings = []
        exclude_chrs_regions_strings = []
        if args.chrs_include is not None:
            for chrs_lists in args.chrs_include:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        include_chrs_regions_strings.append(chr_name)
        if args.chrs_include_file is not None:
            for chr_name in get_chrs_regions_string_lists_from_source(source=args.chrs_include_file):
                include_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude is not None:
            for chrs_lists in args.chrs_exclude:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        exclude_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude_file is not None:
            for chr_name in get_chrs_regions_string_list_from_file(file_name=args.chrs_exclude_file):
                exclude_chrs_regions_strings.append(chr_name)
        include_regions = [parse_segment_chr_region(string) for string in include_chrs_regions_strings]
        exclude_regions = [parse_segment_chr_region(string) for string in exclude_chrs_regions_strings]
        # filtering pipeline: regions -> extra-field regexes -> size constraints
        adjacencies = filter_adjacencies_by_chromosomal_regions(adjacencies=adjacencies, include=include_regions, exclude=exclude_regions,
                                                                include_both=args.include_both, exclude_both=args.exclude_both,
                                                                include_spanning=args.include_spanning, exclude_spanning=args.exclude_spanning,
                                                                annotate_retained=args.annotate_retained, annotate_retained_extra_field_prefix=args.annotate_extra_prefix,
                                                                annotated_retained_segments_extra_field=args.annotate_seg_extra_field, annotate_short_circ=args.annotate_shirt_circ)
        keep_extra_field_entries = args.keep_extra_field_regex if args.keep_extra_field_regex is not None else []
        if args.keep_extra_field_regex_file is not None:
            keep_extra_field_entries.extend(list(iter_over_string_entries_from_source(source=args.keep_extra_field_regex_file)))
        remove_extra_field_entries = args.remove_extra_field_regex if args.remove_extra_field_regex is not None else []
        if args.remove_extra_field_regex_file is not None:
            remove_extra_field_entries.extend(list(iter_over_string_entries_from_source(source=args.remove_extra_field_regex_file)))
        keep_extra_field = get_extra_field_regexes(string_entries=keep_extra_field_entries)
        remove_extra_field = get_extra_field_regexes(string_entries=remove_extra_field_entries)
        adjacencies = filter_adjacencies_by_extra(adjacencies=adjacencies,
                                                  keep_extra_field=keep_extra_field, keep_extra_field_missing_strategy=args.keep_extra_field_missing_strategy,
                                                  remove_extra_field=remove_extra_field, remove_extra_field_missing_strategy=args.remove_extra_field_missing_strategy)
        adjacencies = filter_adjacencies_by_size(adjacencies=adjacencies, min_size=args.min_size, max_size=args.max_size, size_extra_field=args.size_extra_field,
                                                 size_extra_seq_field=args.size_extra_seq_field, allow_inter_chr=args.allow_inter_chr,
                                                 size_extra_field_abs=args.size_extra_field_abs, allow_intra_chr=args.allow_intra_chr,)
        write_adjacencies_to_destination(destination=args.rck_adj_file, adjacencies=adjacencies, sort_adjacencies=False, extra=extra)
        exit(0)
    elif args.command == "reciprocal":
        adjacencies = read_adjacencies_from_source(source=args.rck_adj)
        processed_adjacencies = refined_adjacencies_reciprocal(novel_adjacencies=adjacencies, max_distance=args.max_distance, inplace=True)
    elif args.command == "haploid":
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(source=rck_adj_source) for rck_adj_source in args.rck_adj))
        haploid_adjacencies = iter_haploid_adjacencies(adjacencies=adjacencies, copy=False)
        write_adjacencies_to_destination(destination=args.rck_adj_file, adjacencies=haploid_adjacencies, sort_adjacencies=False, extra=extra)
        exit(0)
    elif args.command == "update":
        adjacencies = read_adjacencies_from_source(source=args.rck_adj)
        source_adjacencies = read_adjacencies_from_source(source=args.source)
        extra_include = {v for v in args.include_extra_fields.split(",") if len(v) > 0}
        extra_exclude = {v for v in args.exclude_extra_fields.split(",") if len(v) > 0}
        # Bug fix: attribute names must match the dest= values declared on update_parser
        # (coord_update, strands_update, ...); the previous args.update_coords /
        # args.update_strand1 / etc. never existed and raised AttributeError at runtime.
        processed_adjacencies = update_adjacencies(target_adjacencies=adjacencies, source_adjacencies=source_adjacencies,
                                                   update_coords=args.coord_update, update_coord1=args.coord1_update, update_coord2=args.coord2_update,
                                                   update_strands=args.strands_update, update_strand1=args.strand1_update, update_strand2=args.strand2_update,
                                                   extra_exclude=extra_exclude, extra_include=extra_include, include_missing=args.include_missing)
    if len(processed_adjacencies) > 0:
        write_adjacencies_to_destination(destination=args.rck_adj_file, adjacencies=processed_adjacencies, extra=extra, sort_adjacencies=args.sort)


if __name__ == "__main__":
    main()
174 |
--------------------------------------------------------------------------------
/rck/utils/adj/rck_adj_rck2x.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 |
5 |
# Make this script runnable directly from a source checkout (without installation):
# walk `current_file_level` directories up from this file (rck/utils/adj -> repo root)
# and append the repo root to sys.path so the absolute `rck.*` imports below resolve.
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
11 |
12 | import rck
13 | from rck.core.io import get_logging_cli_parser, get_standard_logger_from_args, read_adjacencies_from_source, write_adjacencies_to_vcf_sniffles_destination, \
14 | write_adjacencies_to_circa_destination, read_chr_sizes_from_source, write_segments_to_circa_destination, write_adjacencies_to_bedpe_destination
15 | from rck.core.structures import AdjacencyType
16 | from rck.utils.adj.process import get_circa_adj_cnt, filter_adjacencies_by_size
17 |
18 |
def main():
    """CLI entry point: export RCK adjacencies to external formats.

    Subcommands:
      vcf-sniffles -- VCF in the Sniffles dialect
      circa        -- TSV consumable by the Circa plotting tool
      circa-dens   -- per-window adjacency/breakend density TSV for Circa
      bedpe        -- BEDPE representation of the adjacencies
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-NAS-rck2x")
    parser.add_argument('--version', action='version', version=rck.version)
    cli_logging_parser = get_logging_cli_parser()
    ###
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ###
    vcf_parser = subparsers.add_parser("vcf-sniffles", parents=[cli_logging_parser], help="Convert RCK Adjacencies to the VCF (Sniffles) format")
    vcf_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    vcf_parser.add_argument("--separator", default="\t")
    vcf_parser.add_argument("--extra-separator", default=";")
    vcf_parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    vcf_parser.add_argument("--o-extra-fields", default="all")
    vcf_parser.add_argument("--o-no-include-ref", action="store_false", dest="include_ref")
    vcf_parser.add_argument("--clone-suffix", default="")
    vcf_parser.add_argument("--dummy-clone", default="dummy_clone")
    vcf_parser.add_argument("--dummy-clone-gt-extra")
    vcf_parser.add_argument("--dummy-gt", default="./.")
    vcf_parser.add_argument("--alt-extra")
    vcf_parser.add_argument("--ref-extra")
    ###
    circa_parser = subparsers.add_parser("circa", parents=[cli_logging_parser], help="Convert RCK Adjacencies to the TSV format supported by Circa")
    circa_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    circa_parser.add_argument("--separator", default="\t")
    circa_parser.add_argument("--extra-separator", default=";")
    circa_parser.add_argument("--size-extra-field")
    circa_parser.add_argument("--size-extra-field-no-abs", action="store_false", dest="size_extra_field_abs")
    circa_parser.add_argument("--size-extra-seq-field")
    circa_parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    circa_density_parser = subparsers.add_parser("circa-dens", parents=[cli_logging_parser],
                                                 help="Convert RCK Adjacencies to the TSV format with adjacencies density cnt per window supported by Circa")
    circa_density_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    circa_density_parser.add_argument("--separator", default="\t")
    circa_density_parser.add_argument("--extra-separator", default=";")
    circa_density_parser.add_argument("--window-size", type=int, default=10000000)
    circa_density_parser.add_argument("--chr-sizes", type=argparse.FileType("rt"))
    # count either individual breakends or whole adjacencies per window
    circa_density_parser.add_argument("--element", choices=["breakend", "adj"], default="breakend")
    circa_density_parser.add_argument("--element-adj-cnt-full", action="store_true", dest="circa_element_adj_cnt_full")
    circa_density_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    bedpe_parser = subparsers.add_parser("bedpe", parents=[cli_logging_parser],
                                         help="Convert RCK Adjacencies to the BEDPE format with only intra-chromosomal adjacencies considered")
    bedpe_parser.add_argument("rck_adj", type=argparse.FileType("rt"), default=sys.stdin)
    bedpe_parser.add_argument("--separator", default="\t")
    bedpe_parser.add_argument("--extra-separator", default=";")
    bedpe_parser.add_argument("--name-extra-field", default=None)
    bedpe_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-NAS-rck2x")
    # Adjacencies are read up front since every subcommand consumes the same input.
    logger.info("Reading adjacencies from {file}".format(file=args.rck_adj))
    adjacencies = read_adjacencies_from_source(source=args.rck_adj, extra_separator=args.extra_separator, separator=args.separator)
    if args.command == "vcf-sniffles":
        if not args.include_ref:
            logger.debug("Reference adjacencies were excluded from the output.")
            adjacencies = list(filter(lambda a: a.adjacency_type == AdjacencyType.NOVEL, adjacencies))
        # "--o-extra-fields": empty/","-valued -> no extras; "all" -> passed through verbatim;
        # otherwise a comma-separated list of field names
        if args.o_extra_fields is None or len(args.o_extra_fields) == 0 or args.o_extra_fields == ",":
            extra = None
        elif args.o_extra_fields != "all":
            extra = args.o_extra_fields.split(",")
        else:
            extra = args.o_extra_fields
        logger.debug("Output extra fields are identified as {o_extra}".format(o_extra=",".join(extra) if extra is not None else ""))
        logger.info("Converting RCK formatted adjacencies to the VCF (Sniffles) format")
        logger.info("Writing adjacencies to {file}".format(file=args.output))
        write_adjacencies_to_vcf_sniffles_destination(destination=args.output, adjacencies=adjacencies, extra=extra,
                                                      dummy_clone=args.dummy_clone, clone_suffix=args.clone_suffix,
                                                      alt_extra=args.alt_extra, ref_extra=args.ref_extra,
                                                      dummy_clone_gt_extra=args.dummy_clone_gt_extra, dummy_gt=args.dummy_gt)
    elif args.command == "circa":
        logger.info("Converting input RCK formatted adjacencies into a Circa suitable format (extra column get transformed into a size column)")
        logger.info("Writing adjacencies info suitable for Circa to {file}".format(file=args.output))
        write_adjacencies_to_circa_destination(destination=args.output, adjacencies=adjacencies, size_extra_field=args.size_extra_field,
                                               size_extra_seq_field=args.size_extra_seq_field, size_abs=args.size_extra_field_abs)
    elif args.command == "circa-dens":
        logger.info("Computing cnt of input RCK formatted adjacencies per window into a CIRCA suitable format")
        chr_sizes = args.chr_sizes
        if args.chr_sizes is not None:
            chr_sizes = read_chr_sizes_from_source(source=args.chr_sizes)
        circa_adj_cnts = get_circa_adj_cnt(adjacencies=adjacencies, window_size=args.window_size, chr_sizes=chr_sizes, element=args.element,
                                           adj_full_cnt=args.circa_element_adj_cnt_full)
        segments = []
        for segment, cnt in circa_adj_cnts.items():
            # NOTE(review): looks like this scales the raw count by the fraction of a full
            # window the segment covers -- confirm against get_circa_adj_cnt's semantics
            segment.extra[args.element + "_cnt"] = cnt * segment.length / args.window_size
            segments.append(segment)
        write_segments_to_circa_destination(destination=args.output, segments=segments, extra=[args.element + "_cnt"])
    elif args.command == "bedpe":
        logger.info(f"Converting and writing input RCK formatted adjacencies into BEDPE format to {args.output}")
        adjacencies = filter_adjacencies_by_size(adjacencies=adjacencies, allow_inter_chr=True)
        write_adjacencies_to_bedpe_destination(destination=args.output, adjacencies=adjacencies, name_extra_field=args.name_extra_field)
    logger.info("Success")


if __name__ == "__main__":
    main()
116 |
--------------------------------------------------------------------------------
/rck/utils/adj/stats.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 |
def get_size_bins(bins_strs):
    """Convert string bin boundaries into ints, bracketed by +/-3,000,000,000 sentinels.

    :param bins_strs: iterable of numeric strings (floats accepted, truncated to int)
    :return: list of int boundaries starting at -3000000000 and ending at 3000000000
    """
    sentinel = 3000000000
    inner_boundaries = [int(float(value)) for value in bins_strs]
    return [-sentinel] + inner_boundaries + [sentinel]
10 |
11 |
def get_adj_size(adjacency, size_extra_field="svlen", size_extra_field_abs=True, size_extra_seq_field=None):
    """Best-effort size of an adjacency.

    Tries, in order: the numeric extra field ``size_extra_field`` (optionally made
    absolute), the length of the sequence stored in ``size_extra_seq_field``, and
    finally the adjacency's own non-haplotype distance.

    :param adjacency: object with an ``extra`` dict and a ``distance_non_hap`` attribute
    :param size_extra_field: extra-dict key holding a numeric size (e.g. VCF SVLEN)
    :param size_extra_field_abs: when True, return the absolute value of the numeric size
    :param size_extra_seq_field: optional extra-dict key holding an inserted/alt sequence
    :return: int size of the adjacency
    """
    try:
        size = int(float(adjacency.extra[size_extra_field]))
        return abs(size) if size_extra_field_abs else size
    except (KeyError, ValueError):
        pass
    try:
        return len(adjacency.extra[size_extra_seq_field])
    except (KeyError, ValueError):
        pass
    # Last resort: the genomic distance between the adjacency's two positions.
    return adjacency.distance_non_hap
28 |
29 |
def merged_source_tally(adjacencies, bins=None, extra_sources_field="supporting_sources", size_extra_field="svlen", size_extra_field_abs=True, size_extra_seq_field=None):
    """Tally adjacencies by (sorted supporting-source tuple, size-bin upper boundary).

    :param adjacencies: iterable of adjacency objects (see :func:`get_adj_size`)
    :param bins: sorted int boundaries; an adjacency lands in the first boundary strictly
        greater than its size (None if no boundary is greater). Defaults to one all-covering bin.
    :param extra_sources_field: extra-dict key with a comma-separated source list;
        adjacencies without it are tallied under ("None",)
    :return: defaultdict mapping source tuple -> defaultdict mapping bin boundary -> count
    """
    if bins is None:
        bins = [-3000000000, 3000000000]
    tally = defaultdict(lambda: defaultdict(int))
    for adjacency in adjacencies:
        size = get_adj_size(adjacency=adjacency, size_extra_field=size_extra_field,
                            size_extra_field_abs=size_extra_field_abs, size_extra_seq_field=size_extra_seq_field)
        # First boundary strictly greater than the size; None when the size exceeds all bins.
        target_bin = next((boundary for boundary in bins if size < boundary), None)
        sources_string = adjacency.extra.get(extra_sources_field, None)
        if sources_string is None:
            source_key = ("None",)
        else:
            source_key = tuple(sorted(sources_string.split(",")))
        tally[source_key][target_bin] += 1
    return tally
45 |
--------------------------------------------------------------------------------
/rck/utils/karyotype/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/utils/karyotype/__init__.py
--------------------------------------------------------------------------------
/rck/utils/karyotype/analysis.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | from rck.core.io import FALSE_POSITIVE, AG_LABELING
4 | from rck.core.structures import Haplotype, CNBoundaries, AdjacencyGroupType, Phasing
5 |
6 |
def scnb_violations(scnt, scnb, segments_syncs, segments=None, clone_ids=None, short_circuit=False):
    """Find segments whose clone- and haplotype-specific copy numbers fall outside their boundaries.

    :param scnt: segment copy number tensor, mapping clone id -> segment copy number profile
    :param scnb: segment copy number boundaries tensor, mapping clone id -> boundaries profile
    :param segments_syncs: mapping segment (non-haplotype) id -> sync indicator; 1 means the
        profile's A/B haplotypes align with the boundaries' A/B, otherwise they are swapped
    :param segments: optional segments to check; defaults to all segment ids in each clone's profile
    :param clone_ids: optional clone ids to check; defaults to clones present in both scnt and scnb
    :param short_circuit: when True, return as soon as the first violating segment is found
    :return: defaultdict mapping clone id -> list of violating segment ids
    """
    if clone_ids is None:
        clone_ids = set(scnt.keys()) & set(scnb.keys())
    result = defaultdict(list)
    for clone_id in clone_ids:
        scnp = scnt[clone_id]
        scnbp = scnb[clone_id]
        sids = [s.stable_id_non_hap for s in segments] if segments is not None else scnp.records.keys()
        for sid in sids:
            sync_indicator = segments_syncs[sid]
            cna, cnb = scnp.get_cn(sid=sid, haplotype=Haplotype.A), scnp.get_cn(sid=sid, haplotype=Haplotype.B)
            lower_a = scnbp.get_cnb(sid=sid, hap=Haplotype.A, boundary_type=CNBoundaries.LOWER)
            lower_b = scnbp.get_cnb(sid=sid, hap=Haplotype.B, boundary_type=CNBoundaries.LOWER)
            upper_a = scnbp.get_cnb(sid=sid, hap=Haplotype.A, boundary_type=CNBoundaries.UPPER)
            upper_b = scnbp.get_cnb(sid=sid, hap=Haplotype.B, boundary_type=CNBoundaries.UPPER)
            if sync_indicator == 1:
                # Haplotypes aligned: compare A against A's bounds and B against B's.
                if (not lower_a <= cna <= upper_a) or (not lower_b <= cnb <= upper_b):
                    result[clone_id].append(sid)
            else:
                # Haplotypes swapped: compare A against B's bounds and vice versa.
                if (not lower_b <= cna <= upper_b) or (not lower_a <= cnb <= upper_a):
                    result[clone_id].append(sid)
            if short_circuit and len(result[clone_id]) > 0:
                return result
    return result
31 |
32 |
def unique_realization_violations(adjacencies, acnt):
    """Placeholder -- not implemented yet; currently always returns None."""
    pass
35 |
36 |
def adjacency_groups_molecule_violations(groups, acnt, clone_ids=None, skip_missing_fp=True, short_circuit=False):
    """Collect MOLECULE-type adjacency groups whose false-positive constraint no clone satisfies.

    A group is satisfied by a clone when the fraction of its adjacencies absent from that
    clone's profile does not exceed the group's allowed false-positive rate.

    :param groups: iterable of adjacency groups (only MOLECULE-type ones are inspected)
    :param acnt: adjacency copy number tensor, mapping clone id -> adjacency copy number profile
    :param clone_ids: optional clone ids to check; defaults to all clones in acnt
    :param skip_missing_fp: when False, groups lacking a false-positive entry count as violations
    :param short_circuit: when True, return as soon as the first violation is recorded
        (or, for a group with a missing false-positive entry, immediately)
    :return: list of violating groups
    """
    if clone_ids is None:
        clone_ids = sorted(acnt.keys())
    violations = []
    molecule_groups = (ag for ag in groups if ag.group_type == AdjacencyGroupType.MOLECULE)
    for group in molecule_groups:
        allowed_fp = group.extra.get(FALSE_POSITIVE, None)
        if allowed_fp is None:
            if not skip_missing_fp:
                violations.append(group)
            if short_circuit:
                return violations
            continue
        satisfied = False
        for clone_id in clone_ids:
            profile = acnt[clone_id]
            present = profile.haploid_adjacencies_present(adjacencies=group.adjacencies)
            observed_fp = 1 - (len(present) * 1.0 / len(group.adjacencies))
            satisfied = satisfied or (observed_fp <= allowed_fp)
        if not satisfied:
            violations.append(group)
            if short_circuit:
                return violations
    return violations
60 |
61 |
def adjacency_groups_general_violations(groups, acnt, clone_ids=None, skip_missing_fp=True, short_circuit=False):
    """Collect GENERAL-type adjacency groups whose false-positive constraint is violated.

    Unlike the molecule check, presence is pooled across all clones: the fraction of the
    group's adjacency ids absent from every clone must not exceed the allowed rate.

    :param groups: iterable of adjacency groups (only GENERAL-type ones are inspected)
    :param acnt: adjacency copy number tensor, mapping clone id -> adjacency copy number profile
    :param clone_ids: optional clone ids to check; defaults to all clones in acnt
    :param skip_missing_fp: when False, groups lacking a false-positive entry count as violations
    :param short_circuit: when True, return as soon as the first violation is recorded
        (or, for a group with a missing false-positive entry, immediately)
    :return: list of violating groups
    """
    if clone_ids is None:
        clone_ids = sorted(acnt.keys())
    violations = []
    general_groups = (ag for ag in groups if ag.group_type == AdjacencyGroupType.GENERAL)
    for group in general_groups:
        allowed_fp = group.extra.get(FALSE_POSITIVE, None)
        if allowed_fp is None:
            if not skip_missing_fp:
                violations.append(group)
            if short_circuit:
                return violations
            continue
        present_ids = set()
        for clone_id in clone_ids:
            profile = acnt[clone_id]
            for adjacency in profile.haploid_adjacencies_present(adjacencies=group.adjacencies):
                present_ids.add(adjacency.stable_id_non_phased)
        observed_fp = 1 - (len(present_ids) * 1.0 / len(group.adjacencies_ids))
        if observed_fp > allowed_fp:
            violations.append(group)
            if short_circuit:
                return violations
    return violations
86 |
87 |
def adjacency_groups_labeling_violations(groups, acnt, clone_ids=None, short_circuit=False):
    """Collect LABELING-type adjacency groups whose adjacencies are realized on both haplotypes.

    For each group and each haplotype, phased copy numbers are summed (over all clones) for
    the homozygous phasing of that haplotype plus the heterozygous phasing selected by the
    group's per-adjacency labeling index. A group violates the labeling constraint when
    non-zero copy number is observed for both haplotypes.

    :param groups: iterable of adjacency groups (only LABELING-type ones are inspected)
    :param acnt: adjacency copy number tensor, mapping clone id -> adjacency copy number profile
    :param clone_ids: optional clone ids to check; defaults to all clones in acnt
    :param short_circuit: when True, return as soon as the first violation is found
    :return: list of violating groups
    """
    if clone_ids is None:
        clone_ids = sorted(acnt.keys())
    result = []
    for group in filter(lambda ag: ag.group_type == AdjacencyGroupType.LABELING, groups):
        adjacencies = group.adjacencies
        # Per-adjacency labeling indexes (0/1), aligned with `adjacencies` by position.
        indexes = group.extra[AG_LABELING]
        haplotype_specific_presence = defaultdict(list)
        for hap in [Haplotype.A, Haplotype.B]:
            for adjacency, index in zip(adjacencies, indexes):
                # Homozygous phasing for the haplotype under consideration...
                phasings = [Phasing.AA] if hap == Haplotype.A else [Phasing.BB]
                # ...plus one heterozygous phasing; note the order is mirrored for B, so the
                # same index selects "opposite" orientations on the two haplotypes.
                local_phasings = [Phasing.AB, Phasing.BA] if hap == Haplotype.A else [Phasing.BA, Phasing.AB]
                phasings.append(local_phasings[index])
                aid = adjacency.stable_id_non_phased
                cn = 0
                for clone_id in clone_ids:
                    acnp = acnt[clone_id]
                    for ph in phasings:
                        cn += acnp.get_cn(aid=aid, phasing=ph)
                haplotype_specific_presence[hap].append(cn != 0)
        haplotype_specific_presence = dict(haplotype_specific_presence)
        for hap in [Haplotype.A, Haplotype.B]:
            haplotype_specific_presence[hap] = any(haplotype_specific_presence[hap])
        # Present on more than one haplotype -> labeling constraint violated.
        if sum(haplotype_specific_presence.values()) > 1:
            result.append(group)
            if short_circuit:
                return result
    return result
116 |
117 |
def nas_fp_violations(acnt, fp, adjacencies=None):
    """Placeholder -- not implemented yet; currently always returns None."""
    pass
120 |
--------------------------------------------------------------------------------
/rck/utils/karyotype/rck_kar_graph.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 |
4 | from rck.core.graph import construct_hiag_inflate_from_haploid_data
5 | from rck.core.io import get_logging_cli_parser, read_scnt_from_source, read_acnt_from_source, write_graph_to_destination
6 |
7 |
def main():
    """CLI tool: build a haplotype-specific interval-adjacency graph from RCK SCNT/ACNT
    tensors for one clone and write it out (currently only as an edge list)."""
    parser = argparse.ArgumentParser(prog="RCK-UTILS-KAR-graph")
    cli_logging_parser = get_logging_cli_parser()
    parser.add_argument("--acnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--acnt-separator", default="\t")
    parser.add_argument("--acnt-extra-separator", default=";")
    parser.add_argument("--scnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--scnt-extra-separator", default=";")
    parser.add_argument("--clone")
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    writer_parser = subparsers.add_parser("write", parents=[cli_logging_parser])
    writer_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    writer_parser.add_argument("--style", choices=["edge-list"], default="edge-list")
    writer_parser.add_argument("--separator", default="\t")
    writer_parser.add_argument("--include-absent", action="store_true", dest="include_cn_0")
    args = parser.parse_args()
    segments, scnt = read_scnt_from_source(source=args.scnt, separator=args.scnt_separator, extra_separator=args.scnt_extra_separator, remove_cn_data_from_segs=True)
    adjacencies, acnt = read_acnt_from_source(source=args.acnt, separator=args.acnt_separator, extra_separator=args.acnt_extra_separator, remove_cn_data_from_adj=True)
    if args.command == "write":
        hiag = construct_hiag_inflate_from_haploid_data(hapl_segments=segments, hapl_adjacencies=adjacencies)
        if args.clone is None:
            # No clone requested explicitly: default to the lexicographically first clone
            # present in both tensors.
            common_clones = set(acnt.keys()) & set(scnt.keys())
            if len(common_clones) == 0:
                raise ValueError("No common clones in Adjacency and Segment Copy Number tensors")
            args.clone = sorted(common_clones)[0]
        acnp, scnp = acnt[args.clone], scnt[args.clone]
        hiag.assign_copy_numbers_from_scn_profile(scn_profile=scnp)
        hiag.assign_copy_numbers_from_acn_profile(acn_profile=acnp)
        if not args.include_cn_0:
            # Drop zero-copy-number edges unless --include-absent was given.
            hiag.remove_edges_with_zero_cn()
        write_graph_to_destination(graph=hiag, destination=args.output, style=args.style)
41 |
42 |
43 | if __name__ == "__main__":
44 | main()
45 |
--------------------------------------------------------------------------------
/rck/utils/karyotype/rck_kar_stats.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from collections import defaultdict
3 |
4 | from rck.core.graph import construct_hiag_inflate_from_haploid_data
5 | from rck.core.io import read_scnt_from_source, read_acnt_from_source, read_scnb_from_source, read_adjacency_groups_from_source, read_positions_from_source, get_logging_cli_parser, \
6 | get_standard_logger_from_args, EXTERNAL_NA_ID
7 | from rck.core.structures import get_ref_telomeres_from_segments, AdjacencyType, AdjacencyGroupType, Segment
8 | from rck.utils.karyotype.analysis import adjacency_groups_molecule_violations, adjacency_groups_labeling_violations, adjacency_groups_general_violations
9 |
10 |
def main():
    """CLI tool: sanity-check an RCK karyotype.

    Reads the segment (SCNT) and adjacency (ACNT) copy number tensors, plus optional
    segment copy number boundaries, adjacency groups, and telomere positions; then reports
    per-chromosome segment counts, adjacency counts, adjacency-group compliance, and, per
    clone, copy-number-excess / telomere consistency, total genome length, and the implied
    chromosome count.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-KAR-stats", parents=[get_logging_cli_parser()])
    parser.add_argument("--verbose", choices=[0, 1, 2, 3, 4, 5], type=int, default=5)
    parser.add_argument("--acnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--acnt-separator", default="\t")
    parser.add_argument("--acnt-extra-separator", default=";")
    parser.add_argument("--scnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--scnt-extra-separator", default=";")
    parser.add_argument("--scnb", type=argparse.FileType("rt"))
    parser.add_argument("--scnb-separator", default="\t")
    parser.add_argument("--scnb-extra-separator", default=";")
    parser.add_argument("--nas-fp", type=float, default=-1.0)
    parser.add_argument("--adjacency-groups", type=argparse.FileType("rt"))
    parser.add_argument("--adg-separator", default="\t")
    parser.add_argument("--adg-aids-separator", default=",")
    parser.add_argument("--adg-extra-separator", default=";")
    parser.add_argument("--telomere-positions", type=argparse.FileType("rt"))
    parser.add_argument("--telomere-positions-separator", default="\t")
    parser.add_argument("--telomere-positions-extra-separator", default=";")
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-KAR-stats")
    logger.info("Reading segment copy number tensor from {file}".format(file=args.scnt))
    segments, scnt = read_scnt_from_source(source=args.scnt, separator=args.scnt_separator, extra_separator=args.scnt_extra_separator, remove_cn_data_from_segs=True)
    logger.info("Reading adjacency copy number tensor from {file}".format(file=args.acnt))
    adjacencies, acnt = read_acnt_from_source(source=args.acnt, separator=args.acnt_separator, extra_separator=args.acnt_extra_separator, remove_cn_data_from_adj=True)
    # NOTE(review): scnb is read here but not consumed by any check below yet.
    if args.scnb is not None:
        logger.info("Reading segment copy number boundaries tensor from {file}".format(file=args.scnb))
        _, scnb = read_scnb_from_source(source=args.scnb, separator=args.scnb_separator, extra_separator=args.scnb_extra_separator, remove_cnb_data_from_segs=True)
    else:
        logger.info("No segment copy number boundaries tensor is provided via --scnb flag")
        scnb = None
    if args.adjacency_groups is not None:
        logger.info("Reading adjacency groups information from {file}".format(file=args.adjacency_groups))
        groups = read_adjacency_groups_from_source(source=args.adjacency_groups, separator=args.adg_separator,
                                                   extra_separator=args.adg_extra_separator, aids_separator=args.adg_aids_separator)
    else:
        logger.info("No adjacency groups information is provided via --adjacency-groups flag")
        groups = []
    if args.telomere_positions is not None:
        logger.info("Reading telomere positions from {file}".format(file=args.telomere_positions))
        # Bug fix: this previously read `args.telomeres_positions_separator`, an attribute
        # argparse never defines (the flag is --telomere-positions-separator), raising an
        # AttributeError whenever --telomere-positions was supplied.
        telomeres = read_positions_from_source(source=args.telomere_positions, separator=args.telomere_positions_separator,
                                               extra_separator=args.telomere_positions_extra_separator)
    else:
        logger.info("No telomere positions are provided via --telomere-positions flag. Defaulting to reference telomere positions")
        telomeres = get_ref_telomeres_from_segments(segments=segments)
    # Per-chromosome segment tallies.
    segments_by_chrs = defaultdict(list)
    for segment in segments:
        segments_by_chrs[segment.chromosome].append(segment)
    print("A total of {cnt} chromosomes are observed".format(cnt=len(segments_by_chrs)))
    total_segments_cnt = 0
    for chr_name, chr_segments in segments_by_chrs.items():
        total_segments_cnt += len(chr_segments)
        if args.verbose >= 3:
            print("Chromosome {chr_name} has {cnt} segments".format(chr_name=chr_name, cnt=len(chr_segments)))
    print("A total of {cnt} segments are observed".format(cnt=total_segments_cnt))
    novel_adjacencies = [adj for adj in adjacencies if adj.adjacency_type == AdjacencyType.NOVEL]
    reference_adjacencies = [adj for adj in adjacencies if adj.adjacency_type == AdjacencyType.REFERENCE]
    print("A total of {cnt} adjacencies ({n_cnt} novel; {r_cnt} reference)".format(cnt=len(novel_adjacencies) + len(reference_adjacencies),
                                                                                   n_cnt=len(novel_adjacencies), r_cnt=len(reference_adjacencies)))

    adjacencies_by_external_ids = {adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased): adj for adj in adjacencies}
    if groups is not None:
        for ag in groups:
            ag.populate_adjacencies_via_ids(source=adjacencies, source_by_ids=adjacencies_by_external_ids)
        molecule_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.MOLECULE]
        labeling_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.LABELING]
        general_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.GENERAL]
        if len(molecule_groups) > 0:
            logger.info("Checking compliance with {cnt} molecule groups".format(cnt=len(molecule_groups)))
            molecule_groups_violations = adjacency_groups_molecule_violations(groups=molecule_groups, acnt=acnt)
            if len(molecule_groups_violations):
                # Bug fix: the count previously reported len(molecule_groups) (all groups)
                # instead of the number of violating groups.
                logger.error("A total of {cnt} molecule groups DO NOT agree with input karyotype. See molecule groups ids below".format(cnt=len(molecule_groups_violations)))
                logger.error(", ".join([ag.gid for ag in molecule_groups_violations]))
            else:
                logger.info("All molecule groups agree with input karyotype")
        else:
            logger.info("No molecule groups were provided. Nothing to check.")
        if len(labeling_groups) > 0:
            logger.info("Checking compliance with {cnt} labeling groups".format(cnt=len(labeling_groups)))
            labeling_groups_violations = adjacency_groups_labeling_violations(groups=labeling_groups, acnt=acnt)
            if len(labeling_groups_violations):
                logger.error("A total of {cnt} labeling groups DO NOT agree with input karyotype. See labeling groups ids below".format(cnt=len(labeling_groups_violations)))
                logger.error(", ".join([ag.gid for ag in labeling_groups_violations]))
            else:
                logger.info("All labeling groups agree with input karyotype")
        else:
            logger.info("No labeling groups were provided. Nothing to check.")
        if len(general_groups) > 0:
            logger.info("Checking compliance with {cnt} general groups".format(cnt=len(general_groups)))
            general_groups_violations = adjacency_groups_general_violations(groups=general_groups, acnt=acnt)
            if len(general_groups_violations):
                logger.error("A total of {cnt} general groups DO NOT agree with input karyotype. See general groups ids below".format(cnt=len(general_groups_violations)))
                logger.error(", ".join([ag.gid for ag in general_groups_violations]))
            else:
                logger.info("All general groups agree with input karyotype")
        else:
            logger.info("No information about adjacency groups were provided. Nothing to check.")

    clone_ids = sorted(set(scnt.keys()) & set(acnt.keys()))
    for clone_id in clone_ids:
        logger.info("Checking balancing and telomeres for clone {clone_id}".format(clone_id=clone_id))
        hiag = construct_hiag_inflate_from_haploid_data(hapl_segments=segments, hapl_adjacencies=adjacencies)
        scnp = scnt[clone_id]
        acnp = acnt[clone_id]
        hiag.assign_copy_numbers_from_scn_profile(scn_profile=scnp)
        hiag.assign_copy_numbers_from_acn_profile(acn_profile=acnp)
        hiag.remove_edges_with_zero_cn()
        logger.info("Checking that every vertex has a copy number excess >= 0.")
        for node in hiag.nodes(data=False):
            if hiag.node_imbalance(node=node) < 0:
                logger.warning("Something went WRONG! On segment extremity {node} there is a negative copy number excess...".format(node=str(node)))
        logger.info("Getting inferred telomeres.")
        diploid_telomeres = hiag.get_telomeres()
        inferred_hapl_telomeres_ids = {p.stable_id_non_hap for p in diploid_telomeres}
        input_hapl_telomers_ids = {p.stable_id_non_hap for p in telomeres}
        # Bug fix: the original strict-superset test (`>`) missed violations whenever the
        # input telomere set contained sites not among the inferred ones; what matters is
        # whether any inferred telomere lies outside the allowed input set.
        if not inferred_hapl_telomeres_ids <= input_hapl_telomers_ids:
            logger.error("Something went WRONG! Following segments extremities, while not specified as possible telomere sites were inferred as such.")
            logger.error(",".join(map(str, sorted(inferred_hapl_telomeres_ids - input_hapl_telomers_ids))))
        else:
            logger.info("Everything is OK! in clone {clone_id} all extremities have non-negative copy number excess, and inferred telomere sites concur with the input"
                        "".format(clone_id=clone_id))
        length = 0
        for u, v, data in hiag.segment_edges():
            s: Segment = data["object"]
            length += s.length * data["copy_number"]
        logger.info(f"Total length for clone {clone_id} = {length}")
        # Each linear chromosome contributes two telomeric extremities, i.e. an imbalance of 2.
        chromosome_cnt = sum(hiag.node_imbalance(node) for node in hiag.nodes(data=False)) / 2
        logger.info(f"Total number of chromosomes in clone {clone_id} = {chromosome_cnt}")
140 |
141 |
142 | if __name__ == "__main__":
143 | main()
144 |
--------------------------------------------------------------------------------
/rck/utils/rck_input_refine.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import sys
5 | from copy import deepcopy
6 |
# Make the `rck` package importable when this script is run directly from a source
# checkout: climb two directory levels (rck/utils -> repository root) and append the
# repository root to sys.path before the `import rck` below.
current_file_level = 2
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
12 |
13 | import rck
14 | from rck.core.io import read_adjacencies_from_file, \
15 | get_logging_cli_parser, get_standard_logger_from_args, get_full_path, read_scnt_from_file, read_positions_from_source, \
16 | write_segments_to_file, write_scnt_to_file
17 | from rck.core.structures import refined_scnt, refined_scnt_with_adjacencies_and_telomeres
18 |
19 |
def main():
    """CLI tool: refine an input segment copy number tensor.

    Merges/gap-fills fragments, then refines segment boundaries with respect to novel
    adjacencies and telomere positions, writing the pre-refinement fragments and the
    refined SCNT to the requested output files.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-input-refine", parents=[get_logging_cli_parser()])
    parser.add_argument("--version", action="version", version=rck.version)
    parser.add_argument("--scnt", required=True)
    parser.add_argument("--adjacencies", required=True)
    parser.add_argument("--clone-ids", default=None)
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--adjacencies-separator", default="\t")
    parser.add_argument("--no-merge-fragments", action="store_false", dest="merge_fragments")
    parser.add_argument("--fragments-max-merge-gap", type=int, default=1000000000)
    parser.add_argument("--no-fill-gaps-fragments", action="store_false", dest="fill_gaps_fragments")
    parser.add_argument("--fragments-max-fill-gap", type=int, default=1000000000)
    parser.add_argument("--no-allow-unit-segments", action="store_false", dest="allow_unit_segments")
    parser.add_argument("--telomere-positions", type=argparse.FileType("rt"))
    parser.add_argument("--telomere-positions-separator", default="\t")
    parser.add_argument("--output-scnt", required=True)
    parser.add_argument("--output-fragments", required=True)
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-input-refine")
    clone_ids = args.clone_ids.split(",") if args.clone_ids is not None else None
    # Bug fix: argparse stores --scnt / --adjacencies as args.scnt / args.adjacencies; the
    # previous code read the non-existent attributes args.scnt_file and args.adj, raising
    # an AttributeError on every run.
    scnt_file = get_full_path(args.scnt)
    adj_file = get_full_path(args.adjacencies)
    segments, scnt = read_scnt_from_file(file_name=scnt_file, clone_ids=clone_ids, separator=args.scnt_separator)
    clone_ids = sorted(set(scnt.keys()))
    # NOTE(review): args.allow_unit_segments is parsed but never forwarded to refined_scnt;
    # confirm whether it should be.
    segments, scnt, segments_ids_mapping = refined_scnt(segments=segments, scnt=scnt,
                                                        merge_fragments=args.merge_fragments, max_merge_gap=args.fragments_max_merge_gap,
                                                        fill_gaps=args.fill_gaps_fragments, max_fill_gap=args.fragments_max_fill_gap)

    adjacencies = read_adjacencies_from_file(file_name=adj_file, separator=args.adjacencies_separator)
    if args.telomere_positions is not None:
        telomere_positions = read_positions_from_source(source=args.telomere_positions, separator=args.telomere_positions_separator)
    else:
        telomere_positions = []
    # Snapshot the merged/gap-filled segments ("fragments") before the adjacency/telomere
    # driven refinement fragments them further.
    fragments = deepcopy(segments)
    segments, scnt = refined_scnt_with_adjacencies_and_telomeres(segments=segments, scnt=scnt, adjacencies=adjacencies, telomere_positions=telomere_positions)
    # Bug fix: --output-scnt is stored as args.output_scnt; the previous code read the
    # non-existent attribute args.refined_scnt_file.
    refined_scnt_file = get_full_path(path=args.output_scnt)
    fragments_file = get_full_path(path=args.output_fragments)

    write_segments_to_file(file_name=fragments_file, segments=fragments)
    write_scnt_to_file(file_name=refined_scnt_file, scnt=scnt, segments=segments)
61 |
62 |
63 | if __name__ == "__main__":
64 | main()
65 |
--------------------------------------------------------------------------------
/rck/utils/scn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/rck/utils/scn/__init__.py
--------------------------------------------------------------------------------
/rck/utils/scn/convert.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import math
3 |
4 | import gffutils
5 |
6 | from rck.core.structures import SegmentCopyNumberProfile, Segment, Haplotype
7 | from rck.utils.adj.convert import strip_chr
8 |
# Column names in Battenberg "subclones" output tables.
BATTENBERG_SAMPLE_NAME = "sample"
BATTENBERG_CHROMOSOME = "chr"
BATTENBERG_START_POSITION = "startpos"
BATTENBERG_END_POSITION = "endpos"
# Clonal (clone 1) major/minor allele copy number columns.
BATTENBERG_CLONE1_CN_A = "nMaj1_A"
BATTENBERG_CLONE1_CN_B = "nMin1_A"
# Subclonal (clone 2) major/minor allele copy number columns; values may be "NA".
BATTENBERG_CLONE2_CN_A = "nMaj2_A"
BATTENBERG_CLONE2_CN_B = "nMin2_A"
17 |
18 |
def battenberg_force_non_negativity(cn):
    """Clamp a copy number value to be non-negative (negative values become 0)."""
    return max(cn, 0)
21 |
22 |
def battenberg_get_subclonal_cn(subclonal_cn_string, clonal_cn_int):
    """Parse a subclonal copy number string, falling back to the clonal value for "NA".

    Non-"NA" values are parsed as ints and clamped to be non-negative.
    """
    if subclonal_cn_string == "NA":
        return clonal_cn_int
    return max(int(subclonal_cn_string), 0)
27 |
28 |
def battenberg_get_scnt_from_battenberg_file(file_name, sample_name, separator="\t", chr_strip=True):
    """Open a Battenberg subclones file and delegate parsing to get_scnt_from_battenberg_source."""
    with open(file_name, "rt") as battenberg_source:
        return get_scnt_from_battenberg_source(source=battenberg_source, sample_name=sample_name,
                                               separator=separator, chr_strip=chr_strip)
32 |
33 |
def get_scnt_from_battenberg_source(source, sample_name, separator="\t", chr_strip=True):
    """Parse a Battenberg subclones table into segments and a two-clone segment copy number tensor.

    Clone "1" receives the clonal (nMaj1_A/nMin1_A) copy numbers; clone "2" receives the
    subclonal (nMaj2_A/nMin2_A) ones, falling back to the clonal values when they are "NA".
    Negative copy numbers are clamped to 0.

    :param source: text-line source with a header row (consumed via csv.DictReader)
    :param sample_name: only rows whose "sample" column equals this value are used;
        rows without a "sample" column are always used
    :param separator: column separator of the input table
    :param chr_strip: when True, strip the "chr" prefix from chromosome names
    :return: (segments, scnt) where scnt maps clone id ("1"/"2") -> SegmentCopyNumberProfile
    """
    clone1_name = "1"
    clone2_name = "2"
    scnt = {clone1_name: SegmentCopyNumberProfile(), clone2_name: SegmentCopyNumberProfile()}
    segments = []
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        # Skip rows belonging to other samples (only when a sample column is present).
        if BATTENBERG_SAMPLE_NAME in row and row[BATTENBERG_SAMPLE_NAME] != sample_name:
            continue
        start_coordinate = int(row[BATTENBERG_START_POSITION])
        end_coordinate = int(row[BATTENBERG_END_POSITION])
        chromosome = row[BATTENBERG_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start_coordinate, end=end_coordinate)
        clone1_scnp = scnt[clone1_name]
        clone2_scnp = scnt[clone2_name]
        cn1a = battenberg_force_non_negativity(int(row[BATTENBERG_CLONE1_CN_A]))
        cn1b = battenberg_force_non_negativity(int(row[BATTENBERG_CLONE1_CN_B]))
        clone1_scnp.set_cn_record_for_segment(segment=segment, cn=cn1a, haplotype=Haplotype.A)
        clone1_scnp.set_cn_record_for_segment(segment=segment, cn=cn1b, haplotype=Haplotype.B)
        # Subclonal values default to the clonal ones when reported as "NA".
        cn2a = battenberg_get_subclonal_cn(subclonal_cn_string=row[BATTENBERG_CLONE2_CN_A], clonal_cn_int=cn1a)
        cn2b = battenberg_get_subclonal_cn(subclonal_cn_string=row[BATTENBERG_CLONE2_CN_B], clonal_cn_int=cn1b)
        clone2_scnp.set_cn_record_for_segment(segment=segment, cn=cn2a, haplotype=Haplotype.A)
        clone2_scnp.set_cn_record_for_segment(segment=segment, cn=cn2b, haplotype=Haplotype.B)
        segments.append(segment)
    return segments, scnt
61 |
62 |
def hatchet_get_clone_ids_from_file(file_name, sample_name, separator="\t", min_usage=0.01):
    """Determine which clone ids are "used" by a sample in a HATCHet segment file.

    The first line (assumed to describe the column layout; columns 6+ alternate per-clone
    copy number and usage values -- TODO confirm against HATCHet docs) fixes the candidate
    clone ids "1".."n". A clone id is collected when its usage value for ``sample_name``
    is at least ``min_usage`` on some row. Returns early once every candidate is seen.

    :return: sorted list of clone id strings
    """
    selected = set()
    candidates = []
    with open(file_name, "rt") as source:
        for line_number, raw_line in enumerate(source):
            stripped = raw_line.strip()
            fields = stripped.split(separator)
            per_clone_fields = fields[6:]
            if line_number == 0:
                # Header row fixes the number of clones (two columns per clone).
                clone_count = len(per_clone_fields) // 2
                candidates = [str(i) for i in range(1, clone_count + 1)]
            if stripped.startswith("#"):
                continue
            if fields[3] != sample_name:
                continue
            # Usage values sit at the odd positions of the per-clone columns.
            for clone_id, usage_str in zip(candidates, per_clone_fields[1::2]):
                if float(usage_str) >= min_usage:
                    selected.add(clone_id)
            if sorted(selected) == candidates:
                return sorted(selected)
    return sorted(selected)
87 |
88 |
def get_scnt_from_hatchet_file(file_name, sample_name, separator="\t", clone_ids=None, min_usage=0.01, chr_strip=True):
    """Open a HATCHet segment file and parse it into (segments, scnt) for ``sample_name``.

    When ``clone_ids`` is None, the clones actually used by the sample (usage >= min_usage)
    are auto-detected first.

    Bug fix: ``sample_name`` was previously not forwarded to get_scnt_from_hatchet_source,
    whose signature requires it, so every call raised a TypeError.
    """
    if clone_ids is None:
        clone_ids = hatchet_get_clone_ids_from_file(file_name=file_name, sample_name=sample_name, separator=separator, min_usage=min_usage)
    with open(file_name, "rt") as source:
        return get_scnt_from_hatchet_source(source=source, sample_name=sample_name, separator=separator,
                                            clone_ids=clone_ids, chr_strip=chr_strip)
94 |
95 |
def get_scnt_from_hatchet_source(source, sample_name, clone_ids, separator="\t", chr_strip=True):
    """Parse a HATCHet segment source into segments and a segment copy number tensor.

    Assumes the HATCHet layout: columns 0-2 are chromosome/start/end, column 3 the sample
    name, and columns 6+ alternate per-clone "cnA|cnB" strings and clone-usage values
    -- TODO confirm against HATCHet output documentation.

    :param source: iterable of text lines; the first line is treated as the header
    :param sample_name: only rows whose sample column (index 3) equals this value are used
    :param clone_ids: clone id strings ("1".."n") to extract copy numbers for
    :param separator: column separator
    :param chr_strip: when True, strip the "chr" prefix from chromosome names
    :return: (segments, scnt) with scnt mapping clone id -> SegmentCopyNumberProfile
    """
    scnt = {clone_id: SegmentCopyNumberProfile() for clone_id in clone_ids}
    segments = []
    # Maps a requested clone id to its positional index among the per-clone columns.
    clone_id_mappings = {}
    for line_cnt, line in enumerate(source):
        line = line.strip()
        data = line.split(separator)
        clone_data = data[6:]
        if line_cnt == 0:
            # The header row fixes the clone count (two columns per clone) and thereby
            # the candidate ids "1".."n" and their positions.
            total_clone_cnt = int(len(clone_data) / 2)
            candidates = [str(cnt) for cnt in range(1, total_clone_cnt + 1)]
            for position_cnt, candidate in enumerate(candidates):
                if candidate in clone_ids:
                    clone_id_mappings[candidate] = position_cnt
        # Copy-number strings sit at the even positions of the per-clone columns.
        clone_cn_strs = clone_data[::2]
        if line.startswith("#") or len(line) == 0:
            continue
        data_sample_name = data[3]
        if data_sample_name != sample_name:
            continue
        chromosome = data[0]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        start_coord = int(data[1])
        # -1: presumably converts a half-open end coordinate to an inclusive one -- confirm.
        end_coord = int(data[2]) - 1
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start_coord, end=end_coord)
        segments.append(segment)
        fid = segment.stable_id_non_hap
        for clone_id in clone_ids:
            # Per-clone copy numbers are encoded as "cnA|cnB".
            cns_str = clone_cn_strs[clone_id_mappings[clone_id]]
            data = cns_str.split("|")
            cna = int(data[0])
            cnb = int(data[1])
            scnt[clone_id].set_cn_record(sid=fid, hap=Haplotype.A, cn=cna)
            scnt[clone_id].set_cn_record(sid=fid, hap=Haplotype.B, cn=cnb)
    return segments, scnt
132 |
133 |
# Column names in ReMixT copy number output tables (two tumour clones).
REMIXT_CHROMOSOME = "chromosome"
REMIXT_START_POSITION = "start"
REMIXT_END_POSITION = "end"
# Clone 1 major/minor allele copy number columns.
REMIXT_CLONE1_CN_A = "major_1"
REMIXT_CLONE1_CN_B = "minor_1"
# Clone 2 major/minor allele copy number columns.
REMIXT_CLONE2_CN_A = "major_2"
REMIXT_CLONE2_CN_B = "minor_2"
141 |
142 |
def get_scnt_from_remixt_file(file_name, separator="\t", chr_strip=True):
    """Open a ReMixT copy number file and delegate parsing to get_scnt_from_remixt_source."""
    with open(file_name, "rt") as remixt_source:
        return get_scnt_from_remixt_source(source=remixt_source, separator=separator,
                                           chr_strip=chr_strip)
146 |
147 |
def get_scnt_from_remixt_source(source, separator="\t", chr_strip=True):
    """Parse a ReMixT copy number table into segments and a two-clone segment copy number tensor.

    :param source: text-line source with a header row (consumed via csv.DictReader)
    :param separator: column separator of the input table
    :param chr_strip: when True, strip the "chr" prefix from chromosome names
    :return: (segments, scnt) with scnt mapping clone ids "1"/"2" -> SegmentCopyNumberProfile
    """
    segments = []
    clone1_id = "1"
    clone2_id = "2"
    scnt = {clone1_id: SegmentCopyNumberProfile(), clone2_id: SegmentCopyNumberProfile()}
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        chromosome = row[REMIXT_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        start_coordinate = int(row[REMIXT_START_POSITION])
        # -1: presumably converts a half-open end coordinate to an inclusive one -- confirm.
        end_coordinate = int(row[REMIXT_END_POSITION]) - 1
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start_coordinate, end=end_coordinate)
        segments.append(segment)
        sid = segment.stable_id_non_hap
        clone_1_cn_a = int(row[REMIXT_CLONE1_CN_A])
        clone_1_cn_b = int(row[REMIXT_CLONE1_CN_B])
        clone_2_cn_a = int(row[REMIXT_CLONE2_CN_A])
        # Bug fix: clone 2's B-allele copy number was previously read from the "major_2"
        # (REMIXT_CLONE2_CN_A) column instead of "minor_2" (REMIXT_CLONE2_CN_B).
        clone_2_cn_b = int(row[REMIXT_CLONE2_CN_B])
        clone1_scnp = scnt[clone1_id]
        clone2_scnp = scnt[clone2_id]
        clone1_scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=clone_1_cn_a)
        clone1_scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=clone_1_cn_b)
        clone2_scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=clone_2_cn_a)
        clone2_scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=clone_2_cn_b)
    return segments, scnt
174 |
175 |
###
# Column headers expected in a TITAN/ichorCNA seg file.
###
TITAN_CHROMOSOME = "Chromosome"
TITAN_START_POSITION = "Start"
TITAN_END_POSITION = "End"
TITAN_MAJOR_CN = "MajorCN"
TITAN_MINOR_CN = "MinorCN"
TITAN_CLONE_ID = "Clonal_Cluster"
TITAN_CORRECTED_CN = "Corrected_Copy_Number"
TITAN_SAMPLE_NAME = "Sample"


def titan_get_clone_ids_from_file(file_name, sample_name, separator="\t"):
    """Return the sorted clonal-cluster ids recorded for ``sample_name`` in a TITAN seg file.

    Rows belonging to other samples, and rows whose cluster is "NA", are ignored.
    """
    with open(file_name, "rt") as source:
        rows = csv.DictReader(source, delimiter=separator)
        clone_ids = {row[TITAN_CLONE_ID]
                     for row in rows
                     if row[TITAN_SAMPLE_NAME] == sample_name and row[TITAN_CLONE_ID] != "NA"}
    return sorted(clone_ids)
197 |
198 |
def get_scnt_from_titan_file(file_name, sample_name, clone_ids=None, separator="\t", corrected_cn_fix="None", chr_strip=True):
    """Read a TITAN seg file into RCK segments and a clone-specific copy number tensor.

    When ``clone_ids`` is None the clone ids are first discovered from the file itself.
    """
    resolved_clone_ids = clone_ids if clone_ids is not None else titan_get_clone_ids_from_file(
        file_name=file_name, sample_name=sample_name, separator=separator)
    with open(file_name, "rt") as titan_stream:
        return get_scnt_from_titan_source(source=titan_stream, sample_name=sample_name, clone_ids=resolved_clone_ids,
                                          separator=separator, corrected_cn_fix=corrected_cn_fix, chr_strip=chr_strip)
204 |
205 |
def get_scnt_from_titan_source(source, sample_name, clone_ids, separator="\t", corrected_cn_fix="None", chr_strip=True):
    """Parse a TITAN seg table into RCK segments and a clone-specific segment copy number tensor.

    For each segment, the clone TITAN assigned the row to receives the (major, minor) copy
    numbers -- optionally adjusted so their sum matches the Corrected_Copy_Number column --
    while every other clone receives a diploid 1/1 record.

    :param corrected_cn_fix: "None" (no adjustment), "equal" (split the difference evenly
        between haplotypes), or "relative-dist" (split proportionally to the minor/major ratio);
        any other value leaves the copy numbers untouched
    :return: tuple ``(segments, scnt)``
    """
    scnt = {clone_id: SegmentCopyNumberProfile() for clone_id in clone_ids}
    segments = []
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        if row[TITAN_SAMPLE_NAME] != sample_name:
            continue
        chromosome = row[TITAN_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        # NOTE(review): unlike the ReMixT/Ginkgo readers, the end coordinate is used as-is
        #  (no -1) -- presumably TITAN reports inclusive end positions; confirm.
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=int(row[TITAN_START_POSITION]), end=int(row[TITAN_END_POSITION]))
        sid = segment.stable_id_non_hap
        segments.append(segment)
        major_cn, minor_cn = int(row[TITAN_MAJOR_CN]), int(row[TITAN_MINOR_CN])
        # normalize so that major_cn >= minor_cn
        if minor_cn > major_cn:
            minor_cn, major_cn = major_cn, minor_cn
        titan_clone_id = row[TITAN_CLONE_ID]
        for clone_id in clone_ids:
            scnp = scnt[clone_id]
            if titan_clone_id == clone_id:
                # Parse the corrected CN lazily: it is only needed when a fix strategy is
                # active, and eager parsing crashed on non-numeric entries (e.g., "NA").
                if corrected_cn_fix != "None":
                    corrected_cn = int(row[TITAN_CORRECTED_CN])
                    if major_cn + minor_cn != corrected_cn:
                        diff = corrected_cn - major_cn - minor_cn
                        ###
                        # initialize as 0 when corrected_cn_fix strategy does not match any known, yet is not "None"
                        ###
                        major_cn_addition = 0
                        minor_cn_addition = 0
                        if corrected_cn_fix == "equal":
                            major_cn_addition = int(math.ceil(diff / 2))
                            minor_cn_addition = diff - major_cn_addition
                        elif corrected_cn_fix == "relative-dist":
                            if major_cn == 0:
                                # BUGFIX: major_cn == 0 implies minor_cn == 0 here (major >= minor);
                                # previously this divided by zero. Fall back to an even split.
                                major_cn_addition = int(math.ceil(diff / 2))
                            else:
                                relative_relation = minor_cn * 1.0 / major_cn
                                major_cn_addition = int(math.ceil(diff / (1 + relative_relation)))
                            minor_cn_addition = diff - major_cn_addition
                        major_cn += major_cn_addition
                        minor_cn += minor_cn_addition
                scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=major_cn)
                scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=minor_cn)
            else:
                # clones the row was not assigned to are assumed diploid for this segment
                scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=1)
                scnp.set_cn_record(sid=sid, hap=Haplotype.B, cn=1)
    return segments, scnt
249 |
250 |
###
# Column headers expected in a Ginkgo copy-number table; per-sample CN columns are
# keyed by the sample name itself (see the reader below).
###
GINKGO_CHROMOSOME = "CHR"
GINKGO_START_POSITION = "START"
GINKGO_END_POSITION = "END"
254 |
255 |
def get_scnt_from_ginkgo_file(file_name, sample_name, dummy_clone="1", separator="\t", chr_strip=True):
    """Open a Ginkgo copy-number file and delegate parsing to ``get_scnt_from_ginkgo_source``."""
    with open(file_name, "rt") as ginkgo_stream:
        segments, scnt = get_scnt_from_ginkgo_source(source=ginkgo_stream, sample_name=sample_name,
                                                     dummy_clone=dummy_clone, separator=separator, chr_strip=chr_strip)
    return segments, scnt
259 |
260 |
def get_scnt_from_ginkgo_source(source, sample_name, dummy_clone="1", separator="\t", chr_strip=True):
    """Parse a Ginkgo copy-number table into RCK segments and a single-clone tensor.

    The per-sample copy number (read from the column named ``sample_name``) is stored on
    Haplotype.A only -- no B record is written -- under the single ``dummy_clone`` id.

    :raises IOError: when ``sample_name`` does not match any column header in the table
    """
    scnp = SegmentCopyNumberProfile()
    segments = []
    reader = csv.DictReader(source, delimiter=separator)
    for row in reader:
        chromosome = row[GINKGO_CHROMOSOME]
        if chr_strip:
            chromosome = strip_chr(chr_string=chromosome)
        start = int(row[GINKGO_START_POSITION])
        end = int(row[GINKGO_END_POSITION])
        try:
            cn = int(row[sample_name])
        except KeyError:
            # BUGFIX: the "{sample}" placeholder was previously never substituted
            # (the .format call was missing).
            raise IOError("Could not obtain a segment copy value for sample {sample}. "
                          "Make sure that --sample-name matches (including case) to the column header in the Ginkgo file"
                          .format(sample=sample_name))
        segment = Segment.from_chromosome_coordinates(chromosome=chromosome, start=start, end=end)
        sid = segment.stable_id_non_hap
        segments.append(segment)
        scnp.set_cn_record(sid=sid, hap=Haplotype.A, cn=cn)
    scnt = {dummy_clone: scnp}
    return segments, scnt
281 |
282 |
def get_segments_from_gff_file(file_name, chr_strip=True, chr_mapping=None, chr_mapping_missing_strategy="keep"):
    """Read GFF records as RCK segments, carrying record attributes in ``extra``.

    :param chr_mapping: optional chromosome-name translation table
    :param chr_mapping_missing_strategy: "skip" drops records whose chromosome is absent
        from ``chr_mapping``; any other value keeps them under their original name
    """
    segments = []
    for record in gffutils.DataIterator(file_name):
        chr_name = record.chrom
        if chr_mapping is not None:
            if chr_name not in chr_mapping and chr_mapping_missing_strategy == "skip":
                continue
            chr_name = chr_mapping.get(chr_name, chr_name)
        if chr_strip:
            chr_name = strip_chr(chr_string=chr_name)
        # unpack single-element attribute lists into scalars
        attributes = {}
        for key, value in dict(record.attributes).items():
            attributes[key] = value[0] if isinstance(value, list) and len(value) == 1 else value
        segment = Segment.from_chromosome_coordinates(chromosome=chr_name, start=record.start, end=record.end)
        segment.extra.update(attributes)
        segments.append(segment)
    return segments
303 |
304 |
305 |
306 |
--------------------------------------------------------------------------------
/rck/utils/scn/rck_scnb.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import os
4 |
5 | import sys
6 |
7 |
# Allow running this script directly from a source checkout: climb three directory
# levels (scn -> utils -> rck -> repo root) and put the repo root on sys.path.
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
13 |
14 | import rck
15 | from rck.core.io import read_scnt_from_source, extract_scnb_from_segments, write_scnb_to_destination
16 | from rck.core.structures import SCNBoundariesStrategies, LengthSpreadRelationships, SegmentCopyNumberBoundaries
17 |
18 |
def main():
    """CLI entry point: derive per-clone segment copy number boundaries (SCNB) from an
    RCK-formatted segment copy number tensor (SCNT) and write them to the output.

    Attempts to extract boundaries already carried by the input segments; on failure
    starts from empty per-clone boundary profiles, then fills them per the chosen strategy.
    """
    parser = argparse.ArgumentParser("Creating boundaries for a RCK formatted segment copy number tensor")
    parser.add_argument('--version', action='version', version=rck.version)
    # NOTE(review): positional arguments are always required, so default=sys.stdin is
    #  never used here -- TODO confirm whether stdin input was intended.
    parser.add_argument("scnt", type=argparse.FileType("rt"), default=sys.stdin)
    # argparse runs string defaults through `type`, so the .value defaults below are
    # converted via from_string exactly like command-line input.
    parser.add_argument("--bnd-strategy", choices=[strategy.value for strategy in SCNBoundariesStrategies], type=SCNBoundariesStrategies.from_string,
                        default=SCNBoundariesStrategies.UNIFORM_MIN_MAX.value)
    parser.add_argument("--uniform-spread-size", type=int, default=1)
    parser.add_argument("--length-spread-relation", choices=[rel.value for rel in LengthSpreadRelationships], type=LengthSpreadRelationships.from_string,
                        default=LengthSpreadRelationships.DUMMY.value)
    parser.add_argument("--uniform-min", type=int, default=0)
    parser.add_argument("--uniform-max", type=int, default=10)
    parser.add_argument("--missing-only", action="store_true", dest="missing_only")
    parser.add_argument("--min-allow-zero-for-positive", type=int, default=-1)
    parser.add_argument("--max-allow-zero-for-positive", type=int, default=1000000000)
    parser.add_argument("--min-allow-positive-for-zero", type=int, default=-1)
    parser.add_argument("--max-allow-positive-for-zero", type=int, default=1000000000)
    parser.add_argument("--clone-ids", default="")
    parser.add_argument("--is-male", action="store_false", dest="is_female")
    parser.add_argument("--no-allow-unit-segments", dest="allow_unit_segments", action="store_false")
    parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    # NOTE(review): --o-with-scnt is parsed but `args.output_scnt` is never consulted
    #  below; likewise --separator is never passed to the read/write calls -- TODO confirm.
    parser.add_argument("--o-with-scnt", action="store_true", dest="output_scnt")
    parser.add_argument("--separator", default="\t")
    args = parser.parse_args()
    # empty --clone-ids means "all clones present in the input"
    clone_ids = args.clone_ids.split(",") if len(args.clone_ids) > 0 else None
    segments, scnt = read_scnt_from_source(source=args.scnt, clone_ids=clone_ids)
    if clone_ids is None:
        clone_ids = sorted(scnt.keys())
    try:
        scnb = extract_scnb_from_segments(segments=segments, clone_ids=clone_ids)
    except ValueError:
        # no usable boundary data on the input segments -- start from empty profiles
        scnb = {clone_id: SegmentCopyNumberBoundaries() for clone_id in clone_ids}
    for clone_id in clone_ids:
        scnb[clone_id].fill(segments=segments, scnp=scnt[clone_id], missing_only=args.missing_only, strategy=args.bnd_strategy,
                            min_allow_zero_for_positive=args.min_allow_zero_for_positive,
                            max_allow_zero_for_positive=args.max_allow_zero_for_positive,
                            min_allow_positive_for_zero=args.min_allow_positive_for_zero,
                            max_allow_positive_for_zero=args.max_allow_positive_for_zero,
                            uniform_spread_size=args.uniform_spread_size,
                            length_spread_relation=args.length_spread_relation,
                            uniform_min=args.uniform_min,
                            uniform_max=args.uniform_max,
                            is_female=args.is_female)
    write_scnb_to_destination(destination=args.output, segments=segments, scnb=scnb, clone_ids=clone_ids)
62 |
63 |
64 | if __name__ == "__main__":
65 | main()
66 |
--------------------------------------------------------------------------------
/rck/utils/scn/rck_scnt_process.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import itertools
3 | import sys
4 | import os
5 |
# Allow running this script directly from a source checkout: climb three directory
# levels (scn -> utils -> rck -> repo root) and put the repo root on sys.path.
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
11 |
12 | from rck.core.io import get_logging_cli_parser, get_standard_logger_from_args, get_full_path, read_scnt_from_file, write_scnt_to_file, \
13 | write_scnt_to_destination, read_scnt_from_source, stream_segments_from_source, write_segments_to_destination
14 | from rck.core.structures import aligned_scnts, refined_scnt, cn_distance_inter_scnt
15 | from rck.utils.adj.process import KEEP, REMOVE, iter_over_string_entries_from_source, get_extra_field_regexes
16 | from rck.utils.scn.process import iter_haploid_segments, filter_segments_by_chromosomal_regions, filter_segments_by_extra, filter_segments_by_size
17 | from rck.utils.adj.convert import get_chrs_regions_string_list_from_file, parse_segment_chr_region, get_chrs_regions_string_lists_from_source
18 |
19 |
def main():
    """CLI entry point for processing RCK segment copy number tensors (SCNT).

    Sub-commands:
        refine   -- merge fragmented segments / fill gaps in a single SCNT
        align    -- bring several SCNTs onto a common set of segments
        distance -- length-weighted CN distance between two (aligned) SCNTs
        filter   -- stream-filter segments by region, extra fields, and size
        haploid  -- collapse segments' CN data to a haploid representation
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-process")
    cli_logging_parser = get_logging_cli_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ###
    refine_parser = subparsers.add_parser("refine", parents=[cli_logging_parser])
    refine_parser.add_argument('scnt', type=argparse.FileType("rt"), default=sys.stdin)
    refine_parser.add_argument("--separator", default="\t")
    # NOTE(review): --no-allow-missing-clones is parsed but `args.allow_missing_clones`
    #  is never consulted below -- TODO confirm intended wiring.
    refine_parser.add_argument("--no-allow-missing-clones", action="store_false", dest="allow_missing_clones")
    refine_parser.add_argument("--clone-ids", default=None)
    refine_parser.add_argument("--no-merge-fragments", action="store_false", dest="merge_fragments")
    refine_parser.add_argument("--max-merge-gap", type=int, default=1000000)
    refine_parser.add_argument("--no-fill-gaps", action="store_false", dest="fill_gaps")
    refine_parser.add_argument("--max-fill-gap", type=int, default=1000000)
    refine_parser.add_argument('--output', type=argparse.FileType("wt"), default=sys.stdout)
    ###
    align_parser = subparsers.add_parser("align", parents=[cli_logging_parser])
    align_parser.add_argument("scnt", nargs="+")
    align_parser.add_argument("--separator", default="\t")
    align_parser.add_argument("--output-suffix", default="aligned")
    align_parser.add_argument("--no-allow-unit-segments", action="store_false", dest="allow_unit_segments")
    align_parser.add_argument("--output-dir", default="")
    ###
    distance_parser = subparsers.add_parser("distance", parents=[cli_logging_parser])
    distance_parser.add_argument("--scnt1", type=argparse.FileType("rt"), required=True)
    distance_parser.add_argument("--scnt1-separator", default="\t")
    distance_parser.add_argument("--scnt1-extra-separator", default=";")
    distance_parser.add_argument("--scnt2", type=argparse.FileType("rt"), required=True)
    distance_parser.add_argument("--scnt2-separator", default="\t")
    distance_parser.add_argument("--scnt2-extra-separator", default=";")
    distance_parser.add_argument("--clone-ids", default=None)
    distance_parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    filter_parser = subparsers.add_parser("filter", parents=[cli_logging_parser])
    filter_parser.add_argument("scnt", type=argparse.FileType("rt"), default=sys.stdin)
    filter_parser.add_argument("--separator", default="\t")
    filter_parser.add_argument("--extra-separator", default=";")
    filter_parser.add_argument("--o-extra-fields", default="all")
    filter_parser.add_argument("--chrs-include", action="append", nargs=1)
    filter_parser.add_argument("--chrs-include-file", type=argparse.FileType("rt"))
    filter_parser.add_argument("--chrs-include-no-full", action="store_false", dest="include_full")
    filter_parser.add_argument("--chrs-exclude", action="append", nargs=1)
    filter_parser.add_argument("--chrs-exclude-file", type=argparse.FileType("rt"))
    filter_parser.add_argument("--chrs-exclude-full", action="store_true", dest="exclude_full")
    filter_parser.add_argument("--keep-extra-field-regex", nargs="+", default=None)
    filter_parser.add_argument("--keep-extra-field-regex-file", type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--keep-extra-field-missing-strategy", choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--remove-extra-field-regex", nargs="+", default=None)
    filter_parser.add_argument("--remove-extra-field-regex-file", type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--remove-extra-field-missing-strategy", choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--min-size", type=int, default=0)
    filter_parser.add_argument("--max-size", type=int, default=1000000000)
    filter_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    haploid_parser = subparsers.add_parser("haploid", parents=[cli_logging_parser])
    haploid_parser.add_argument("scnt", type=argparse.FileType("rt"), default=sys.stdin)
    haploid_parser.add_argument("--separator", default="\t")
    haploid_parser.add_argument("--extra-separator", default=";")
    haploid_parser.add_argument("--output", "-o", type=argparse.FileType("wt"), default=sys.stdout)
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-SCNT-process")

    if args.command == "refine":
        clone_ids = args.clone_ids.split(",") if args.clone_ids is not None else None
        # BUGFIX: ",".join(clone_ids) used to run unconditionally and crashed with a
        # TypeError when clone_ids was None -- the advertised "process all clones" case.
        logger.debug("Clone ids identified as {clone_ids}. If None -- all clone ids will be processed.".format(
            clone_ids=",".join(clone_ids) if clone_ids is not None else "None"))
        logger.info("Reading Segment Copy Number Tensor from {file}".format(file=args.scnt))
        segments, scnt = read_scnt_from_source(source=args.scnt, clone_ids=clone_ids, separator=args.separator)
        logger.info("Refining Segment Copy Number Tensor from {file}".format(file=args.scnt))
        segments, scnt, _ = refined_scnt(segments=segments, scnt=scnt,
                                         merge_fragments=args.merge_fragments, max_merge_gap=args.max_merge_gap,
                                         fill_gaps=args.fill_gaps, max_fill_gap=args.max_fill_gap)
        logger.info("Writing refined Segment Copy Number Tensor to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, scnt=scnt, segments=segments, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "align":
        scnt_files = {}
        for path in args.scnt:
            full_path = get_full_path(path=path)
            # derive a short tensor name from the file name, stripping ".scnt" leftovers
            name = os.path.splitext(os.path.basename(full_path))[0]
            if name.endswith(".scnt"):
                name = name[:-5]
            if name.endswith("."):
                name = name[:-1]
            # NOTE(review): inputs sharing a base name silently overwrite each other here,
            #  which also makes the dedup branch below effectively dead -- TODO confirm.
            scnt_files[name] = full_path
        logger.debug("Input Segment Copy Number Tensors (SCNT) identified as {input_scnts}".format(input_scnts=" , ".join(scnt_files.values())))
        scnts_by_name = {}
        segments_by_name = {}
        clone_ids_by_scnt = {}
        logger.info("Reading input SCNTs")
        for name, path in scnt_files.items():
            logger.debug("Reading SCNT from {file}".format(file=path))
            segments, scnt = read_scnt_from_file(file_name=path, separator=args.separator)
            clone_ids_by_scnt[name] = sorted(scnt.keys())
            scnts_by_name[name] = scnt
            segments_by_name[name] = segments
        if len(scnts_by_name) == 1:
            logger.warning("Only one input SCNT identified. Doing nothing with it, outputting as is.")
            aligned_segments_by_name, aligned_scnts_by_name = segments_by_name, scnts_by_name
        else:
            logger.info("Aligning input SCNTs.")
            aligned_segments_by_name, aligned_scnts_by_name = aligned_scnts(segments_by_sample_names=segments_by_name, scnts_by_sample_names=scnts_by_name)
        # disambiguate duplicate base names and attach the output suffix
        result_base_names = {}
        cnt = 0
        for name in sorted(scnt_files.keys()):
            new_name = name
            if name in result_base_names:
                new_name = name + str(cnt)
                cnt += 1
            new_name = new_name + "." + args.output_suffix
            result_base_names[name] = new_name
        output_dir = args.output_dir if args.output_dir != "" else os.getcwd()
        output_dir = get_full_path(path=output_dir)
        logger.info("Writing aligned SCNTs")
        for name, new_name in result_base_names.items():
            scnt = aligned_scnts_by_name[name]
            segments = aligned_segments_by_name[name]
            # BUGFIX: a "." separator was missing before "rck.scnt.tsv", producing output
            # names like "sample.alignedrck.scnt.tsv".
            scnt_path = os.path.join(output_dir, new_name + ".rck.scnt.tsv")
            logger.debug("Writing aligned SCNT {scnt_name} to {file}".format(scnt_name=name, file=scnt_path))
            write_scnt_to_file(file_name=scnt_path, segments=segments, scnt=scnt, separator=args.separator)
    elif args.command == "filter":
        logger.info("Filtering input segments from following sources {sources}".format(sources=args.scnt))
        segments = stream_segments_from_source(source=args.scnt, separator=args.separator, extra_separator=args.extra_separator)
        include_chrs_regions_strings = []
        exclude_chrs_regions_strings = []
        if args.chrs_include is not None:
            for chrs_lists in args.chrs_include:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        include_chrs_regions_strings.append(chr_name)
        if args.chrs_include_file is not None:
            for chr_name in get_chrs_regions_string_lists_from_source(source=args.chrs_include_file):
                include_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude is not None:
            for chrs_lists in args.chrs_exclude:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        exclude_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude_file is not None:
            # BUGFIX: args.chrs_exclude_file is an open stream (argparse.FileType), so it must
            # be read with the source-based helper, as the include counterpart above does; it
            # was previously passed as a file *name* to the from_file variant.
            for chr_name in get_chrs_regions_string_lists_from_source(source=args.chrs_exclude_file):
                exclude_chrs_regions_strings.append(chr_name)
        include_regions = [parse_segment_chr_region(string) for string in include_chrs_regions_strings]
        exclude_regions = [parse_segment_chr_region(string) for string in exclude_chrs_regions_strings]
        segments = filter_segments_by_chromosomal_regions(segments=segments, include=include_regions, exclude=exclude_regions,
                                                          include_full=args.include_full, exclude_full=args.exclude_full)
        keep_extra_field_entries = args.keep_extra_field_regex if args.keep_extra_field_regex is not None else []
        if args.keep_extra_field_regex_file is not None:
            keep_extra_field_entries.extend(list(iter_over_string_entries_from_source(source=args.keep_extra_field_regex_file)))
        remove_extra_field_entries = args.remove_extra_field_regex if args.remove_extra_field_regex is not None else []
        if args.remove_extra_field_regex_file is not None:
            remove_extra_field_entries.extend(list(iter_over_string_entries_from_source(source=args.remove_extra_field_regex_file)))
        keep_extra_field = get_extra_field_regexes(string_entries=keep_extra_field_entries)
        remove_extra_field = get_extra_field_regexes(string_entries=remove_extra_field_entries)
        segments = filter_segments_by_extra(segments=segments, keep_extra_field=keep_extra_field, keep_extra_field_missing_strategy=args.keep_extra_field_missing_strategy,
                                            remove_extra_field=remove_extra_field, remove_extra_field_missing_strategy=args.remove_extra_field_missing_strategy)
        segments = filter_segments_by_size(segments=segments, min_size=args.min_size, max_size=args.max_size)
        write_segments_to_destination(destination=args.output, segments=segments)

    elif args.command == "haploid":
        segments = stream_segments_from_source(source=args.scnt, separator=args.separator, extra_separator=args.extra_separator)
        haploid_segments = iter_haploid_segments(segments=segments, copy=False)
        write_segments_to_destination(destination=args.output, segments=haploid_segments)
    elif args.command == "distance":
        clone_ids = args.clone_ids
        if args.clone_ids is not None:
            clone_ids = args.clone_ids.split(",")
        segments1, scnt1 = read_scnt_from_source(source=args.scnt1, clone_ids=clone_ids, separator=args.scnt1_separator,
                                                 extra_separator=args.scnt1_extra_separator, remove_cn_data_from_segs=True)
        segments2, scnt2 = read_scnt_from_source(source=args.scnt2, clone_ids=clone_ids, separator=args.scnt2_separator,
                                                 extra_separator=args.scnt2_extra_separator, remove_cn_data_from_segs=True)
        # align the two tensors onto a common set of segments before comparing
        segments_by_sample_names = {"1": segments1, "2": segments2}
        scnts_by_sample_names = {"1": scnt1, "2": scnt2}
        segments_by_sample_names, scnts_by_sample_names = aligned_scnts(segments_by_sample_names=segments_by_sample_names,
                                                                        scnts_by_sample_names=scnts_by_sample_names)
        segments = segments_by_sample_names["1"]
        scnt1, scnt2 = scnts_by_sample_names["1"], scnts_by_sample_names["2"]
        distance = cn_distance_inter_scnt(tensor1=scnt1, tensor2=scnt2, segments=segments, check_clone_ids_match=True)
        print("distance = ", distance)

    logger.info("Success!")
200 |
201 |
202 | if __name__ == "__main__":
203 | main()
204 |
--------------------------------------------------------------------------------
/rck/utils/scn/rck_scnt_rck2x.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | import os
4 |
# Allow running this script directly from a source checkout: climb three directory
# levels (scn -> utils -> rck -> repo root) and put the repo root on sys.path.
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
10 |
11 | from rck.utils.scn.process import get_haploid_scnt, get_circa_segments_cna_fractions
12 | from rck.core.io import get_logging_cli_parser, get_standard_logger_from_args, read_scnt_from_source, write_scnt_to_shatterseek_destination, read_chr_sizes_from_source, \
13 | write_segments_to_circa_destination
14 | from rck.utils.adj.process import get_chromosome_strip_parser
15 |
16 |
def main():
    """CLI entry point converting RCK SCNT data to external tool formats.

    Sub-commands:
        shatterseek -- haploid-collapsed per-segment copy numbers for ShatterSeek
        circa-dens  -- windowed amplification/deletion CN-altered fractions for circa plotting
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-rck2x")
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ####
    shatterseek_parser = subparsers.add_parser("shatterseek", parents=[cli_logging_parser, chr_strip_parser])
    shatterseek_parser.add_argument("rck_scnt", type=argparse.FileType("rt"), default=sys.stdin)
    shatterseek_parser.add_argument("--clone-id", required=True)
    shatterseek_parser.add_argument("--separator", default="\t")
    shatterseek_parser.add_argument("--extra-separator", default=";")
    shatterseek_parser.add_argument("--default-cn", type=int, default=0)
    shatterseek_parser.add_argument("--output-header", action="store_true", dest="output_header")
    shatterseek_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    circa_dens_parser = subparsers.add_parser("circa-dens", parents=[cli_logging_parser, chr_strip_parser])
    circa_dens_parser.add_argument("rck_scnt", type=argparse.FileType("rt"), default=sys.stdin)
    circa_dens_parser.add_argument("--clone-id", required=True)
    circa_dens_parser.add_argument("--separator", default="\t")
    circa_dens_parser.add_argument("--extra-separator", default=";")
    circa_dens_parser.add_argument("--cna-type", choices=["ampl", "del"], default="ampl")
    circa_dens_parser.add_argument("--haploid", action="store_true", dest="haploid")
    circa_dens_parser.add_argument("--inverse", action="store_true", dest="inverse")
    circa_dens_parser.add_argument("--window-size", type=int, default=10000000)
    circa_dens_parser.add_argument("--chr-sizes", type=argparse.FileType("rt"))
    circa_dens_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    args = parser.parse_args()
    # NOTE(review): logger name differs from prog ("RCK-UTILS-SCNT" vs "RCK-UTILS-SCNT-rck2x");
    #  kept as is to preserve existing log configuration.
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-SCNT")

    if args.command == "shatterseek":
        logger.info("Starting converting RCK Segment Copy Number Tensor data to ShatterSeek")
        logger.debug("Specified clone is {clone_id}".format(clone_id=args.clone_id))
        logger.info("Reading RCK formatted data from {file}".format(file=args.rck_scnt))
        segments, scnt = read_scnt_from_source(source=args.rck_scnt, separator=args.separator, extra_separator=args.extra_separator)
        logger.info("Read CN data is translated into a haploid (!!!) version of itself.")
        haploid_scnt = get_haploid_scnt(segments=segments, scnt=scnt)
        logger.info("Writing data for clone {clone_id} in a ShatterSeek suitable format to {file}".format(clone_id=args.clone_id, file=args.output))
        write_scnt_to_shatterseek_destination(destination=args.output, segments=segments, scnt=haploid_scnt, clone_id=args.clone_id,
                                              default=args.default_cn, output_header=args.output_header)
    elif args.command == "circa-dens":
        logger.info("Starting computing ampl/del statistics from RKC Segment Copy Number Tensor Format")
        logger.debug("Specified clone is {clone_id}".format(clone_id=args.clone_id))
        logger.info("Reading RCK formatted data from {file}".format(file=args.rck_scnt))
        segments, scnt = read_scnt_from_source(source=args.rck_scnt, separator=args.separator, extra_separator=args.extra_separator)
        chr_sizes = args.chr_sizes
        if args.chr_sizes is not None:
            chr_sizes = read_chr_sizes_from_source(source=args.chr_sizes)
        circa_segments_cna_fractions = get_circa_segments_cna_fractions(segments=segments, scnt=scnt, clone_id=args.clone_id,
                                                                        window_size=args.window_size, chr_sizes=chr_sizes, cna_type=args.cna_type,
                                                                        haploid=args.haploid)
        segments = []
        total_average = 0
        total_length = 0
        for segment, cna_fraction in circa_segments_cna_fractions.items():
            # scale the per-window fraction by the window-relative segment length
            value = cna_fraction * segment.length / args.window_size
            if args.inverse:
                value = 1 - value
            segment.extra[args.cna_type + "_fraction"] = value
            total_length += segment.length
            total_average += cna_fraction * segment.length
            segments.append(segment)
        # BUGFIX: guard against an empty result set (previously a ZeroDivisionError)
        if total_length > 0:
            logger.info("Total average cna fraction is " + str(total_average / total_length))
        else:
            logger.warning("No segments produced; can not compute an average cna fraction")
        write_segments_to_circa_destination(destination=args.output, segments=segments, extra=[args.cna_type + "_fraction"])
    logger.info("Success!")
83 |
84 |
85 | if __name__ == "__main__":
86 | main()
87 |
--------------------------------------------------------------------------------
/rck/utils/scn/rck_scnt_stats.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 |
4 | from rck.core.io import get_logging_cli_parser, read_scnt_from_source
5 | from rck.utils.scn.stats import cn_distance
6 |
7 |
def main():
    """CLI entry point for SCNT statistics; currently only the "distance" command,
    which ranks the top-N segment copy number distance cases between two tensors."""
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-stats")
    cli_logging_parser = get_logging_cli_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    #####
    distance_parser = subparsers.add_parser("distance", parents=[cli_logging_parser])
    # both tensors take the same quartet of options, differing only in prefix
    for tensor in ("scnt1", "scnt2"):
        distance_parser.add_argument(f"--{tensor}", type=argparse.FileType("rt"), required=True)
        distance_parser.add_argument(f"--{tensor}-separator", default="\t")
        distance_parser.add_argument(f"--{tensor}-extra-separator", default=";")
        distance_parser.add_argument(f"--{tensor}-clone-ids", default=None)
    distance_parser.add_argument("--topn", type=int, default=3)
    distance_parser.add_argument("--verbose", action="store_true", dest="verbose")
    distance_parser.add_argument("--both-haplotype-specific", action="store_true", dest="both_haplotype_specific")
    distance_parser.add_argument('-o', '--output', type=argparse.FileType("wt"), default=sys.stdout)
    #####
    args = parser.parse_args()
    if args.command != "distance":
        return

    def read_tensor(stream, sep, extra_sep, clone_ids_str):
        # comma-separated clone ids; None means "all clones in the source"
        ids = clone_ids_str.split(",") if clone_ids_str is not None else None
        return read_scnt_from_source(source=stream, separator=sep, extra_separator=extra_sep, clone_ids=ids)

    segments1, scnt1 = read_tensor(args.scnt1, args.scnt1_separator, args.scnt1_extra_separator, args.scnt1_clone_ids)
    segments2, scnt2 = read_tensor(args.scnt2, args.scnt2_separator, args.scnt2_extra_separator, args.scnt2_clone_ids)
    distances = cn_distance(segments1=segments1, scnt1=scnt1, segments2=segments2, scnt2=scnt2, both_haplotype_specific=args.both_haplotype_specific)
    # rank cases by their total (summed over clones) distance, keep the best topn
    ranked = sorted(distances.items(), key=lambda entry: sum(entry[1].values()))[:args.topn]
    if args.verbose:
        print(f'Length-weighted segment copy number distance for tensors in {args.scnt1.name} and {args.scnt2.name}', file=args.output)
    for cnt, (case, clone_specific_distance) in enumerate(ranked, start=1):
        print(f'{cnt}. Best distance (total) of {sum(clone_specific_distance.values()):,} with clone-specific ones {clone_specific_distance}, for case {case}', file=args.output)
41 |
42 |
if __name__ == "__main__":  # allow running this module directly as a script
    main()
45 |
--------------------------------------------------------------------------------
/rck/utils/scn/rck_scnt_x2rck.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | import os
4 |
# Make the `rck` package importable when this script is run directly from a
# source checkout (i.e. without RCK being pip-installed): walk 3 directory
# levels up from this file (rck/utils/scn -> repository root) and append the
# root to sys.path.
current_file_level = 3
current_dir = os.path.dirname(os.path.realpath(__file__))
for _ in range(current_file_level):
    current_dir = os.path.dirname(current_dir)
sys.path.append(current_dir)
10 |
11 | from rck.core.io import get_logging_cli_parser, get_standard_logger_from_args, write_scnt_to_destination, get_full_path, write_segments_to_destination
12 | from rck.utils.scn.convert import get_scnt_from_battenberg_source, get_scnt_from_hatchet_source, hatchet_get_clone_ids_from_file, \
13 | get_scnt_from_remixt_source, titan_get_clone_ids_from_file, get_scnt_from_titan_source, get_scnt_from_ginkgo_source, get_segments_from_gff_file
14 | from rck.utils.adj.process import get_chromosome_strip_parser
15 |
16 |
def main():
    """CLI entry point: convert third-party segment copy-number calls into RCK format.

    Subcommands handle allele-specific (TitanCNA "titan", Battenberg "battenberg",
    HATCHet "hatchet", ReMixT "remixt") and haploid (Ginkgo "ginkgo") copy-number
    outputs, plus plain segment extraction from GFF files ("gff").  Converted data
    is written via the RCK io helpers to the selected output stream (stdout by
    default).
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-x2rck")
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ####
    titan_parser = subparsers.add_parser("titan", parents=[cli_logging_parser, chr_strip_parser])
    titan_parser.add_argument("titan_ichor_seg")
    titan_parser.add_argument("--sample-name", required=True)
    titan_parser.add_argument("--clone-ids", default=None)
    titan_parser.add_argument("--separator", default="\t")
    titan_parser.add_argument("--corrected-cn-fix", choices=["None", "equal", "relative-dist"], default="None")
    titan_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    battenberg_parser = subparsers.add_parser("battenberg", parents=[cli_logging_parser, chr_strip_parser])
    battenberg_parser.add_argument("battenberg", type=argparse.FileType("rt"), default=sys.stdin)
    battenberg_parser.add_argument("--separator", default="\t")
    battenberg_parser.add_argument("--sample-name", required=True)
    battenberg_parser.add_argument("--clone-ids", choices=["1", "2", "1,2"], default="1,2")
    battenberg_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    hatchet_parser = subparsers.add_parser("hatchet", parents=[cli_logging_parser, chr_strip_parser])
    hatchet_parser.add_argument("hatchet", type=str)
    hatchet_parser.add_argument("--separator", default="\t")
    hatchet_parser.add_argument("--min-usage", type=float, default=0.01)
    hatchet_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    # Either a sample name (clone ids inferred from the file) or explicit clone ids.
    group = hatchet_parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--sample-name", default=None)
    group.add_argument("--clone-ids", default=None)
    ####
    remixt_parser = subparsers.add_parser("remixt", parents=[cli_logging_parser, chr_strip_parser])
    remixt_parser.add_argument("remixt", type=argparse.FileType("rt"), default=sys.stdin)
    remixt_parser.add_argument("--separator", default="\t")
    remixt_parser.add_argument("--clone-ids", choices=["1", "2", "1,2"], default="1,2")
    remixt_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    ginkgo_parser = subparsers.add_parser("ginkgo", parents=[cli_logging_parser, chr_strip_parser])
    ginkgo_parser.add_argument("ginkgo", type=argparse.FileType("rt"), default=sys.stdin)
    ginkgo_parser.add_argument("--separator", default="\t")
    ginkgo_parser.add_argument("--sample-name", required=True)
    ginkgo_parser.add_argument("--dummy-clone-name", default="1")
    ginkgo_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    gff_parser = subparsers.add_parser("gff", parents=[cli_logging_parser, chr_strip_parser])
    gff_parser.add_argument("gff", type=str)
    gff_parser.add_argument("--chr-mapping-file", type=argparse.FileType("rt"))
    gff_parser.add_argument("--chr-mapping-missing-strategy", choices=["keep", "skip"], default="keep")
    gff_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-SCNT")

    if args.command == "titan":
        logger.info("Converting allele-specific segment copy values from TitanCNA format to RCK")
        titan_full_path = get_full_path(path=args.titan_ichor_seg)
        if args.clone_ids is None:
            logger.debug("Clone ids were not provided, extracting all clone ids from {file}".format(file=titan_full_path))
            clone_ids = titan_get_clone_ids_from_file(file_name=titan_full_path, sample_name=args.sample_name, separator=args.separator)
        else:
            clone_ids = sorted(set(args.clone_ids.split(",")))
        logger.debug("Clone ids are identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        # Fix: open the resolved full path (consistent with the hatchet branch),
        # rather than the raw, possibly relative, CLI argument.
        with open(titan_full_path, "rt") as source:
            logger.info("Reading allele-specific segment copy number values from {file}".format(file=titan_full_path))
            segments, scnt = get_scnt_from_titan_source(source=source, sample_name=args.sample_name, clone_ids=clone_ids, separator=args.separator,
                                                        corrected_cn_fix=args.corrected_cn_fix, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "battenberg":
        logger.info("Converting allele-specific segment copy values from Battenberg format to RCK")
        clone_ids = args.clone_ids.split(",")
        logger.debug("Clone ids are identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        logger.info("Reading allele-specific segment copy number values from {file}".format(file=args.battenberg))
        segments, scnt = get_scnt_from_battenberg_source(source=args.battenberg, sample_name=args.sample_name, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, separator=args.separator, clone_ids=clone_ids)
    elif args.command == "hatchet":
        hatchet_full_path = get_full_path(path=args.hatchet)
        logger.info("Converting allele-specific segment copy values from HATCHet format to RCK")
        if args.clone_ids is None:
            # Fix: log the input file path; the original logged the `hatchet_parser`
            # ArgumentParser object here.
            logger.debug("Clone ids were not provided, extracting all clone ids from {file}".format(file=hatchet_full_path))
            clone_ids = hatchet_get_clone_ids_from_file(file_name=hatchet_full_path, sample_name=args.sample_name, separator=args.separator, min_usage=args.min_usage)
        else:
            clone_ids = sorted(set(args.clone_ids.split(",")))
        logger.debug("Clone ids were identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        with open(hatchet_full_path) as source:
            logger.info("Reading allele-specific segment copy number values from {file}".format(file=hatchet_full_path))
            segments, scnt = get_scnt_from_hatchet_source(source=source, sample_name=args.sample_name, clone_ids=clone_ids, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "remixt":
        logger.info("Converting allele-specific segment copy values from ReMixT format to RCK")
        clone_ids = args.clone_ids.split(",")
        logger.debug("Clone ids were identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        logger.info("Reading allele-specific segment copy number values from {file}".format(file=args.remixt))
        segments, scnt = get_scnt_from_remixt_source(source=args.remixt, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, separator=args.separator, clone_ids=clone_ids)
    elif args.command == "ginkgo":
        logger.info("Converting *haploid* segments copy values from Ginkgo format to RCK")
        logger.info("Reading *haploid* segments copy values from {file}".format(file=args.ginkgo))
        segments, scnt = get_scnt_from_ginkgo_source(source=args.ginkgo, sample_name=args.sample_name, dummy_clone=args.dummy_clone_name,
                                                     separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing *haploid* segments copy number values in RCK format to {file}".format(file=args.output))
        # Fix: `set(args.dummy_clone_name)` split a multi-character clone name into
        # its individual characters; wrap the name in a list instead.
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=[args.dummy_clone_name], separator=args.separator)
    elif args.command == "gff":
        logger.info("Converting segments data from GFF format to RCK")
        logger.info("Reading segments from {file}".format(file=args.gff))
        chr_mappings = None
        if args.chr_mapping_file is not None:
            # Mapping file: one "<source-chr>\t<target-chr>" pair per line.
            chr_mappings = {}
            logger.info("Reading chromosome mapping data from {file}".format(file=args.chr_mapping_file))
            for line in args.chr_mapping_file:
                line = line.strip()
                data = line.split("\t")
                chr_mappings[data[0]] = data[1]
        segments = get_segments_from_gff_file(file_name=args.gff, chr_strip=args.strip_chr,
                                              chr_mapping=chr_mappings, chr_mapping_missing_strategy=args.chr_mapping_missing_strategy)
        logger.info("Writing segments in RCK format to {file}".format(file=args.output))
        write_segments_to_destination(destination=args.output, segments=segments)
    logger.info("Success!")
137 |
138 |
if __name__ == "__main__":  # allow running this module directly as a script
    main()
141 |
--------------------------------------------------------------------------------
/rck/utils/scn/stats.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from collections import defaultdict
3 |
4 | from rck.core.structures import refined_scnt_with_adjacencies_and_telomeres, refined_scnt, cn_distance_inter_scnt
5 |
6 |
class CloneCollectionCNDistanceInstance(object):
    """A pairing of two clone-id collections for which a CN distance was computed.

    `instances1`/`instances2` are the clone ids selected from each tensor;
    `mapping1`/`mapping2` map the common (stringified index) keys back to the
    original clone ids.
    """

    def __init__(self, instances1, instances2, mapping1, mapping2):
        self.instances1 = instances1
        self.instances2 = instances2
        self.mapping1 = mapping1
        self.mapping2 = mapping2

    def __str__(self):
        left = ",".join(self.instances1)
        right = ",".join(self.instances2)
        return f'instance 1: ({left}); instance 2: ({right})'
16 |
17 |
def cn_distance(segments1, scnt1, segments2, scnt2, both_haplotype_specific=False):
    """Compute clone-specific length-weighted CN distances between two SCN tensors.

    Both tensors are first refined onto a shared set of segment boundaries, then
    every subset of clones from tensor 1 is matched against every ordered
    arrangement of the same size of clones from tensor 2, and a distance is
    computed for each such pairing.

    Returns a dict mapping CloneCollectionCNDistanceInstance (one per pairing)
    to the clone-specific distances produced by `cn_distance_inter_scnt`.

    NOTE(review): `both_haplotype_specific` is accepted (and forwarded by the CLI)
    but never used in this body -- presumably it should be passed through to
    `cn_distance_inter_scnt`; confirm against that function's signature.
    """
    # Collect all segment boundary positions from both inputs, per chromosome.
    positions_by_chr = defaultdict(set)
    for segments in [segments1, segments2]:
        for segment in segments:
            positions_by_chr[segment.chromosome].add(segment.start_position)
            positions_by_chr[segment.chromosome].add(segment.end_position)
    # Outermost (min/max coordinate) positions per chromosome, used so both
    # tensors are extended to cover the same chromosome spans.
    outermost_positions_per_chromosomes = {}
    for chr_name, positions in positions_by_chr.items():
        outermost_positions_per_chromosomes[chr_name] = {
            "start": min(positions, key=lambda p: p.coordinate),
            "end": max(positions, key=lambda p: p.coordinate)
        }
    segments1, scnt1, _ = refined_scnt(segments=segments1, scnt=scnt1, merge_fragments=False, fill_gaps=True, extend_outermost=True,
                                       outermost_positions=outermost_positions_per_chromosomes, outermost_positions_margin=0)
    segments2, scnt2, _ = refined_scnt(segments=segments2, scnt=scnt2, merge_fragments=False, fill_gaps=True, extend_outermost=True,
                                       outermost_positions=outermost_positions_per_chromosomes, outermost_positions_margin=0)
    # Second refinement pass: split both tensors on the union of all boundary
    # positions so segment lists line up one-to-one between the two tensors.
    all_positions = set()
    for segments in [segments1, segments2]:
        for segment in segments:
            all_positions.add(segment.start_position)
            all_positions.add(segment.end_position)
    segments1, scnt1, _ = refined_scnt_with_adjacencies_and_telomeres(segments=segments1, scnt=scnt1, telomere_positions=all_positions)
    segments2, scnt2, _ = refined_scnt_with_adjacencies_and_telomeres(segments=segments2, scnt=scnt2, telomere_positions=all_positions)
    clone_ids1, clone_ids2 = list(set(scnt1.keys())), list(set(scnt2.keys()))
    # Pairings always use as many clones as the smaller tensor has.
    matching_clone_ids_cnt = min((len(clone_ids1), len(clone_ids2)))
    result = {}
    # Combinations (unordered) on side 1 x permutations (ordered) on side 2
    # enumerate every distinct clone-to-clone assignment exactly once.
    for clone_ids1_instances in itertools.combinations(clone_ids1, matching_clone_ids_cnt):
        for clone_ids2_instances in itertools.permutations(clone_ids2, matching_clone_ids_cnt):
            # Re-key both tensors by position index so they share clone keys.
            clone_ids1_mapping = {str(cnt): clone_id for cnt, clone_id in enumerate(clone_ids1_instances)}
            clone_ids2_mapping = {str(cnt): clone_id for cnt, clone_id in enumerate(clone_ids2_instances)}
            tmp_scnt1 = {key: scnt1[value] for key, value in clone_ids1_mapping.items()}
            tmp_scnt2 = {key: scnt2[value] for key, value in clone_ids2_mapping.items()}
            # Segments are shared after refinement, so segments1 stands for both.
            clone_specific_distances = cn_distance_inter_scnt(tensor1=tmp_scnt1, tensor2=tmp_scnt2, segments=segments1)
            case = CloneCollectionCNDistanceInstance(instances1=clone_ids1_instances, instances2=clone_ids2_instances,
                                                     mapping1=clone_ids1_mapping, mapping2=clone_ids2_mapping)
            result[case] = clone_specific_distances
    return result
55 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

import sys
import os

# Make the local `rck` package importable at build time so its version can be
# read directly from the source tree.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import rck

setup(
    name="RCK",
    version=rck.version,
    author="Sergey Aganezov",
    author_email="aganezov@cs.jhu.edu",
    description="A tool for (R)econstruction of (C)ancer (K)aryotypes (both clone- and haplotype-specific)",
    license="MIT",
    keywords="RCK, rck, cancer, cancer genomics, cancer karyotypes, clonality, subclonality, copy number aberrations, breakpoints, structural variations, novel adjacencies",
    url="https://github.com/aganezov/rck",
    # NOTE(review): the empty-string entry treats the repository root as a
    # package; it looks accidental but is preserved for backward compatibility.
    # Fix: "rck.utils.karyotype" was missing, yet the rck-kar-graph / rck-kar-stats
    # console scripts below import modules from it -- without it the installed
    # scripts fail at import time.
    packages=["", "rck", "rck.core", "rck.utils", "rck.utils.scn", "rck.utils.adj", "rck.utils.karyotype"],
    include_package_data=True,
    entry_points={
        "console_scripts": [
            "rck = rck.rck_run:main",
            "rck-scnt-x2rck = rck.utils.scn.rck_scnt_x2rck:main",
            "rck-scnt-process = rck.utils.scn.rck_scnt_process:main",
            "rck-scnt-rck2x = rck.utils.scn.rck_scnt_rck2x:main",
            "rck-scnt-stats = rck.utils.scn.rck_scnt_stats:main",
            "rck-scnb = rck.utils.scn.rck_scnb:main",
            "rck-adj-x2rck = rck.utils.adj.rck_adj_x2rck:main",
            "rck-adj-rck2x = rck.utils.adj.rck_adj_rck2x:main",
            "rck-adj-process = rck.utils.adj.rck_adj_process:main",
            "rck-adj-stats = rck.utils.adj.rck_adj_stats:main",
            "rck-adg-infer = rck.utils.adj.rck_adg_infer:main",
            "rck-adg-process = rck.utils.adj.rck_adg_process:main",
            "rck-adg-stats = rck.utils.adj.rck_adg_stats:main",
            "rck-input-refine = rck.utils.rck_input_refine:main",
            "rck-kar-graph = rck.utils.karyotype.rck_kar_graph:main",
            "rck-kar-stats = rck.utils.karyotype.rck_kar_stats:main",
        ]
    },
    install_requires=[
        "networkx>=2",
        "scipy",
        "pyvcf",
        "pysam",
        "sortedcontainers",
        "pandas",
        "gffutils",
    ]
)
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aganezov/RCK/4c6c288bfe1e20905069d842bc58609b10568c7e/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_graph.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from rck.core.graph import IntervalAdjacencyGraph
4 | from rck.core.structures import Position, Strand, Segment, Adjacency, AdjacencyType
5 |
6 |
class TestIntervalAdjacencyGraph(unittest.TestCase):
    """Construction and consistency-checking behavior of IntervalAdjacencyGraph."""

    def setUp(self):
        # Six positions per chromosome ("1" and "2") at coordinates 1..6 with
        # alternating -/+ strands, paired consecutively into segments s1..s6
        # (three segments per chromosome).
        cnt = 0
        for chromosome in ("1", "2"):
            for coordinate in range(1, 7):
                cnt += 1
                strand = Strand.REVERSE if coordinate % 2 else Strand.FORWARD
                setattr(self, "p{cnt}".format(cnt=cnt),
                        Position(chromosome=chromosome, coordinate=coordinate, strand=strand))
        for idx, start in enumerate(range(1, 12, 2), start=1):
            segment = Segment(start_position=getattr(self, "p{i}".format(i=start)),
                              end_position=getattr(self, "p{i}".format(i=start + 1)))
            setattr(self, "s{idx}".format(idx=idx), segment)

    def test_construction_no_adjacencies(self):
        segments = [self.s1, self.s2, self.s3]
        iag = IntervalAdjacencyGraph(segments=segments, adjacencies=[])
        nodes = list(iag.nodes(data=True))
        self.assertEqual(6, len(nodes))
        self.assertEqual(3, len(list(iag.edges())))
        segment_edges = list(iag.segment_edges(data=True))
        self.assertEqual(3, len(segment_edges))
        self.assertEqual(0, len(list(iag.adjacency_edges(data=True))))
        node_ids = {node for node, _ in nodes}
        edge_objects = [attrs["object"] for _, _, attrs in segment_edges]
        # Every segment contributes both of its endpoint nodes and one edge.
        for segment in segments:
            self.assertIn(segment.start_position.idx, node_ids)
            self.assertIn(segment.end_position.idx, node_ids)
            self.assertIn(segment, edge_objects)

    def test_construction_only_ref_adjacencies(self):
        segments = [self.s1, self.s2, self.s3]
        # Reference adjacencies chain consecutive segments end-to-start.
        adjacencies = [
            Adjacency(position1=left.end_position, position2=right.start_position, adjacency_type=AdjacencyType.REFERENCE)
            for left, right in zip(segments, segments[1:])
        ]
        iag = IntervalAdjacencyGraph(segments=segments, adjacencies=adjacencies)
        nodes = list(iag.nodes(data=True))
        self.assertEqual(6, len(nodes))
        edges = list(iag.edges(data=True))
        segment_edges = list(iag.segment_edges(data=True))
        adjacency_edges = list(iag.adjacency_edges(data=True))
        self.assertEqual(5, len(edges))
        self.assertEqual(3, len(segment_edges))
        self.assertEqual(2, len(adjacency_edges))
        node_ids = {node for node, _ in nodes}
        segment_objects = {attrs["object"] for _, _, attrs in segment_edges}
        adjacency_objects = {attrs["object"] for _, _, attrs in adjacency_edges}
        for segment in segments:
            self.assertIn(segment.start_position.idx, node_ids)
            self.assertIn(segment.end_position.idx, node_ids)
            self.assertIn(segment, segment_objects)
        for adjacency in adjacencies:
            self.assertIn(adjacency, adjacency_objects)

    def test_construction_ref_and_nov_adjacencies(self):
        segments = [self.s1, self.s2, self.s3]
        ra1 = Adjacency(position1=self.s1.end_position, position2=self.s2.start_position, adjacency_type=AdjacencyType.REFERENCE)
        ra2 = Adjacency(position1=self.s2.end_position, position2=self.s3.start_position, adjacency_type=AdjacencyType.REFERENCE)
        na1 = Adjacency(position1=self.s1.end_position, position2=self.s3.start_position, adjacency_type=AdjacencyType.NOVEL)
        iag = IntervalAdjacencyGraph(segments=segments, adjacencies=[ra1, ra2, na1])
        nodes = list(iag.nodes(data=True))
        edges = list(iag.edges(data=True))
        segment_edges = list(iag.segment_edges(data=True))
        adjacency_edges = list(iag.adjacency_edges(data=True))
        ref_edges = list(iag.ref_adjacency_edges(data=True))
        nov_edges = list(iag.nov_adjacency_edges(data=True))

        def endpoints(entries):
            return {(u, v) for u, v, _ in entries}

        # Adjacency edges must be exactly the union of reference and novel ones.
        self.assertSetEqual(endpoints(adjacency_edges), endpoints(ref_edges).union(endpoints(nov_edges)))
        self.assertSetEqual({attrs["object"] for _, _, attrs in segment_edges}, set(segments))
        self.assertEqual(6, len(nodes))
        self.assertEqual(6, len(edges))
        self.assertEqual(3, len(segment_edges))
        self.assertEqual(3, len(adjacency_edges))
        self.assertEqual(2, len(ref_edges))
        self.assertEqual(1, len(nov_edges))

    def test_construction_invalid_consistency_check_ref_from_different_chromosomes(self):
        # A REFERENCE adjacency may not join positions from different chromosomes.
        invalid_ra = Adjacency(position1=self.s2.start_position, position2=self.s4.end_position, adjacency_type=AdjacencyType.REFERENCE)
        segments = [self.s1, self.s2, self.s3, self.s4]
        adjacencies = [invalid_ra]
        with self.assertRaises(ValueError):
            IntervalAdjacencyGraph.check_consistency(segments=segments, adjacencies=adjacencies)
        with self.assertRaises(ValueError):
            IntervalAdjacencyGraph(segments=segments, adjacencies=adjacencies)

    def test_construction_invalid_consistency_check_adjacency_with_position_not_from_segments(self):
        # NOTE(review): position2 here is a Segment (self.s1), not a Position --
        # looks like a typo in the original test, but it still exercises
        # "adjacency endpoint not among the provided segments"; preserved as-is.
        adjacency = Adjacency(position1=self.s4.end_position, position2=self.s1, adjacency_type=AdjacencyType.NOVEL)
        segments = [self.s1, self.s2, self.s3]
        adjacencies = [adjacency]
        with self.assertRaises(ValueError):
            IntervalAdjacencyGraph.check_consistency(segments=segments, adjacencies=adjacencies)
        with self.assertRaises(ValueError):
            IntervalAdjacencyGraph(segments=segments, adjacencies=adjacencies)
107 |
--------------------------------------------------------------------------------
/tests/test_structures.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from rck.core.structures import PositionCluster
4 | from rck.core.structures import Strand, Position, Segment, Adjacency
5 |
6 |
class StrandTestCase(unittest.TestCase):
    """String conversion round-trips for the Strand enum."""

    def test_strand_str(self):
        for strand, expected in ((Strand.REVERSE, "-"), (Strand.FORWARD, "+")):
            self.assertEqual(expected, str(strand))

    def test_from_pm_string(self):
        for pm_string, expected in (("-", Strand.REVERSE), ("+", Strand.FORWARD)):
            self.assertEqual(expected, Strand.from_pm_string(string=pm_string))
        # Anything other than "+"/"-" is rejected.
        with self.assertRaises(ValueError):
            Strand.from_pm_string(string="?")
17 |
18 |
class PositionTestCase(unittest.TestCase):
    """Equality, ordering, and default-construction behavior of Position."""

    def setUp(self):
        self.position1 = Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD)
        self.position2 = Position(chromosome="chr1", coordinate=1, strand=Strand.REVERSE)
        self.position3 = Position(chromosome="chr1", coordinate=2, strand=Strand.FORWARD)
        self.position4 = Position(chromosome="chr2", coordinate=1, strand=Strand.FORWARD)

    def test_empty_extra_creation(self):
        # A freshly built position carries an empty `extra` payload by default.
        fresh = Position(chromosome="chrom1", coordinate=1, strand=Strand.FORWARD)
        self.assertDictEqual(fresh.extra, {})

    def test_eq(self):
        equal_twin = Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD)
        # Differ by strand, coordinate, chromosome, or type -> not equal.
        for other in (self.position2, self.position3, self.position4, "?"):
            self.assertNotEqual(self.position1, other)
        self.assertEqual(self.position1, equal_twin)

    def test_lt(self):
        # Reverse strand sorts before forward at the same coordinate.
        self.assertLess(self.position2, self.position1)
        self.assertGreater(self.position1, self.position2)
        self.assertLess(self.position1, self.position3)
        # NOTE(review): chr5 < chr10 implies natural (numeric-aware) chromosome
        # ordering rather than lexicographic -- confirm in Position.__lt__.
        chr5_position = Position(chromosome="chr5", coordinate=5, strand=Strand.FORWARD)
        chr10_position = Position(chromosome="chr10", coordinate=1, strand=Strand.REVERSE)
        self.assertLess(self.position1, self.position4)
        self.assertLess(chr5_position, chr10_position)
46 |
47 |
class SegmentTestCase(unittest.TestCase):
    """Construction validation, idx handling, and string form of Segment."""

    def setUp(self):
        self.position1 = Position(chromosome="chr1", coordinate=1, strand=Strand.REVERSE)
        self.position2 = Position(chromosome="chr1", coordinate=2, strand=Strand.FORWARD)

    def test_creation(self):
        invalid_ends = [
            Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD),  # same coordinate as start
            Position(chromosome="chr2", coordinate=2, strand=Strand.FORWARD),  # different chromosome
            Position(chromosome="chr1", coordinate=0, strand=Strand.FORWARD),  # end before start
        ]
        for end in invalid_ends:
            with self.assertRaises(ValueError):
                Segment(start_position=self.position1, end_position=end)
        # A well-formed start/end pair constructs without raising.
        Segment(start_position=self.position1, end_position=self.position2)

    def test_idx(self):
        segment = Segment(start_position=self.position1, end_position=self.position2)
        # Without an explicit idx, one is derived lazily from the coordinates.
        self.assertIsNone(segment._idx)
        self.assertEqual("chr1:1-2", segment.idx)
        segment = Segment(start_position=self.position1, end_position=self.position2, idx="idx")
        self.assertEqual("idx", segment.idx)
        segment.idx = "idx2"
        self.assertEqual("idx2", segment.idx)

    def test_str(self):
        segment = Segment(start_position=self.position1, end_position=self.position2)
        self.assertEqual("chr1:1-2", str(segment))
        segment.idx = "idx"
        self.assertEqual("idx", str(segment))

    def test_chromosome(self):
        segment = Segment(start_position=self.position1, end_position=self.position2)
        # Both endpoints live on one chromosome, which the segment exposes.
        for position in (self.position1, self.position2):
            self.assertEqual(position.chromosome, segment.chromosome)
81 |
82 |
class AdjacencyTestCase(unittest.TestCase):
    """Endpoint normalization and idx behavior of Adjacency."""

    def setUp(self):
        self.position1 = Position(chromosome="chr1", coordinate=1, strand=Strand.REVERSE)
        self.position2 = Position(chromosome="chr1", coordinate=2, strand=Strand.FORWARD)

    def test_creation(self):
        # Endpoints are normalized so position1 is the smaller one, regardless
        # of the order in which they were supplied.
        for first, second in ((self.position2, self.position1), (self.position1, self.position2)):
            adjacency = Adjacency(position1=first, position2=second)
            self.assertEqual(adjacency.position1, self.position1)
            self.assertEqual(adjacency.position2, self.position2)

    def test_idx(self):
        adjacency = Adjacency(position1=self.position1, position2=self.position2, idx="idx")
        self.assertEqual("idx", adjacency.idx)
        # Clearing idx falls back to a "[pos1]-[pos2]" derived identifier.
        adjacency.idx = None
        expected = "[" + str(self.position1) + "]-[" + str(self.position2) + "]"
        self.assertEqual(expected, adjacency.idx)
101 |
102 |
class PositionClusterTestCase(unittest.TestCase):
    """Ordering guarantees of PositionCluster."""

    def setUp(self):
        self.position1 = Position(chromosome="chr1", coordinate=1, strand=Strand.FORWARD)
        self.position2 = Position(chromosome="chr1", coordinate=1, strand=Strand.REVERSE)
        self.position3 = Position(chromosome="chr1", coordinate=2, strand=Strand.FORWARD)
        self.position4 = Position(chromosome="chr2", coordinate=1, strand=Strand.FORWARD)

    def test_interning_sorting_on_creation(self):
        # Positions handed in out of order are stored sorted (reverse strand
        # before forward at equal coordinates).
        cluster = PositionCluster(positions=[self.position3, self.position1, self.position2])
        expected_order = [self.position2, self.position1, self.position3]
        self.assertListEqual(cluster.positions, expected_order)
113 |
114 |
if __name__ == '__main__':  # allow running this test module directly
    unittest.main()
117 |
--------------------------------------------------------------------------------