├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── demos ├── callingE │ └── demo-callingE.sh ├── cloningE │ └── demo-cloningE.sh ├── complete-nonormal │ └── demo-complete-nonormal.sh ├── complete │ └── demo-complete.sh ├── old_demos │ └── completeE.sh ├── plottingE │ └── demo-plottingE.sh └── pseudonormal │ └── demo-pseudonormal.sh ├── doc ├── chisel-calling.md ├── chisel-cartoon.png ├── chisel-cloning.md ├── chisel-plotting.md ├── chisel-pseudonormal.md └── chisel.md ├── guides ├── clones.md ├── clustering.md └── ploidy.md ├── install_full.sh ├── man ├── chisel-bedding.md ├── chisel-calling.md ├── chisel-cloning.md ├── chisel-plotting.md ├── chisel-prep.md ├── chisel-pseudonormal.md └── chisel.md ├── setup.py ├── src └── chisel │ ├── BAFEstimator.py │ ├── Caller.py │ ├── Cloner.py │ ├── Clusterizer.py │ ├── Combiner.py │ ├── Mutator.py │ ├── Plotter.py │ ├── RDREstimator.py │ ├── Utils.py │ ├── __init__.py │ ├── bin.awk │ ├── bin │ ├── __init__.py │ ├── chisel_bedding.py │ ├── chisel_calling.py │ ├── chisel_cloning.py │ ├── chisel_combocall.py │ ├── chisel_main.py │ ├── chisel_nonormal.py │ ├── chisel_nonormal_combocall.py │ ├── chisel_nonormal_preprocess.py │ ├── chisel_plotting.py │ ├── chisel_prep.py │ ├── chisel_preprocess.py │ ├── chisel_pseudonormal.py │ ├── chisel_rdr.py │ └── count.awk │ └── count.awk ├── tests ├── allchecks.sh ├── callingE.chk ├── cloningE.chk ├── complete.chk ├── plottingE.chk ├── pseudonormal.chk └── pytests │ ├── conftest.py │ ├── data │ ├── input │ │ ├── .gitignore │ │ └── README.md │ └── output │ │ ├── baf.tsv │ │ ├── calls.tsv │ │ ├── combo.tsv │ │ ├── rdr.tsv │ │ └── total.tsv │ ├── test_baf.py │ ├── test_call.py │ ├── test_clone.py │ ├── test_combine.py │ └── test_rdr.py └── tox.ini /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master, develop, py3, ci ] 6 | pull_request: 7 | 
branches: [ master, develop, py3, ci ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python: [2.7.18] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | with: 20 | lfs: false 21 | 22 | - name: Cache Testing Data 23 | id: cache-test-data 24 | uses: actions/cache@v2 25 | with: 26 | path: testdata 27 | key: testdata 28 | 29 | - name: Download Testing Data 30 | if: steps.cache-test-data.outputs.cache-hit != 'true' 31 | run: | 32 | pip3 install wheel 33 | pip3 install setuptools 34 | pip3 install zenodo-get 35 | python3 -m zenodo_get 10.5281/zenodo.3950299 --output-dir=testdata 36 | 37 | - name: Set Testing Data Envvar 38 | run: | 39 | echo "TEST_DIRECTORY=$(realpath testdata)" >> $GITHUB_ENV 40 | 41 | - name: Install SAMtools 42 | run: | 43 | wget https://sourceforge.net/projects/samtools/files/samtools/1.7/samtools-1.7.tar.bz2/download -O samtools-1.7.tar.bz2 44 | tar xvjf samtools-1.7.tar.bz2 45 | (cd samtools-1.7 && ./configure && make) 46 | echo $(realpath samtools-1.7) >> $GITHUB_PATH 47 | 48 | - name: Install BCFTools 49 | run: | 50 | wget https://sourceforge.net/projects/samtools/files/samtools/1.7/bcftools-1.7.tar.bz2/download -O bcftools-1.7.tar.bz2 51 | tar xvjf bcftools-1.7.tar.bz2 52 | (cd bcftools-1.7 && ./configure && make) 53 | echo $(realpath bcftools-1.7) >> $GITHUB_PATH 54 | 55 | - name: Download and index hg19.fa 56 | run: | 57 | wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz --directory-prefix=${TEST_DIRECTORY} 58 | (cd ${TEST_DIRECTORY} && gunzip -df hg19.fa.gz && samtools faidx hg19.fa && samtools dict hg19.fa > hg19.dict) 59 | 60 | - name: Setup Python 61 | uses: actions/setup-python@v2 62 | with: 63 | python-version: ${{ matrix.python }} 64 | 65 | - name: Install Tox and any other packages 66 | run: | 67 | python -m pip install coverage tox tox-gh-actions 68 | 69 | - name: Test with tox 70 | run: | 71 | tox 72 | env: 73 | PLATFORM: ${{ matrix.python }} 74 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Princeton University 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /demos/callingE/demo-callingE.sh: -------------------------------------------------------------------------------- 1 | # Demo for WGS data from a cancer patient 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the CHISEL pipeline starting from the computed RDRs and BAFs (typically the file `combo.tsv` in the folder `combo`) for tumor section E of breast cancer patient S0. Simply run this file through BASH as a standard script to run the complete demo. The demo represent a guided example for the command `chisel-calling` which allows to re-run the inference of copy numbers and can be used to try different parameters, especially related to the inference of tumor ploidy. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 
11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required RDRs and BAFs already computed by the complete CHISEL pipeline in `data` folder. 29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading RDRs and BAFs computed by CHISEL for tumor section E 35 | curl -L https://github.com/raphael-group/chisel-data/raw/master/demos/callingE/combo.tsv.gz > data/combo.tsv.gz 36 | gzip -df data/combo.tsv.gz 37 | export INPUT="data/combo.tsv" 38 | :<<'```shell' # Ignore this line 39 | ``` 40 | 41 | ## Run CHISEL 42 | 43 | We now run the command CHISEL command that starts from the inference of copy numbers from RDRs and BAFs. 44 | 45 | ```shell 46 | chisel_calling ${INPUT} --seed 25 47 | exit $? 48 | ``` 49 | -------------------------------------------------------------------------------- /demos/cloningE/demo-cloningE.sh: -------------------------------------------------------------------------------- 1 | # Demo for WGS data from a cancer patient 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the CHISEL pipeline starting from the inferred copy numbers (typically the file `calls.tsv` in the folder `calls`) for tumor section E of breast cancer patient S0, and thus identifies the clones and produces the corresponding plots. Simply run this file through BASH as a standard script to run the complete demo. 
The demo represent a guided example for the command `chisel-cloning` which allows to re-run the inference of clones and can be used to try different parameters to explore different solutions and clustering of cells.. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required inferred copy numbers already computed by the complete CHISEL pipeline in `data` folder. 29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading copy numbers inferred by CHISEL for tumor section E 35 | curl -L https://github.com/raphael-group/chisel-data/raw/master/demos/cloningE/calls.tsv.gz > data/calls.tsv.gz 36 | gzip -df data/calls.tsv.gz 37 | export INPUT="data/calls.tsv" 38 | :<<'```shell' # Ignore this line 39 | ``` 40 | 41 | ## Run CHISEL 42 | 43 | We now run the command CHISEL command that starts from the inference of copy numbers from RDRs and BAFs. 44 | 45 | ```shell 46 | chisel_cloning ${INPUT} --seed 25 47 | exit $? 
48 | ``` 49 | -------------------------------------------------------------------------------- /demos/complete-nonormal/demo-complete-nonormal.sh: -------------------------------------------------------------------------------- 1 | # Complete demo of CHISEL in nonormal mode 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the complete CHISEL pipeline in nonormal mode starting from an exemplary barcoded [BAM file](https://doi.org/10.5281/zenodo.3950299) publicly available. From this directory, simply run this file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. If the custom installation was used, please make sure that you can succesfully run the command `chisel` as well as the required `samtools`, `bcftools`, and `awk`. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required barcoded single-cell and matched-normal BAM files in `data` folder. 
29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading tumor barcoded BAM file 35 | echo "Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary." 36 | curl -L 'https://zenodo.org/record/3950299/files/cells.bam?download=1' > data/cells.bam 37 | curl -L 'https://zenodo.org/record/3950299/files/cells.bam.bai?download=1' > data/cells.bam.bai 38 | export TUM="data/cells.bam" 39 | :<<'```shell' # Ignore this line 40 | ``` 41 | 42 | Next the corresponding reference genome is downloaded and unpacked. Also, the required indexes are generated. 43 | 44 | ```shell 45 | echo "Downloading human reference genome, please be patient as downloading time may vary." 46 | curl -L https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz | gzip -d > data/hg19.fa 47 | samtools faidx data/hg19.fa 48 | samtools dict data/hg19.fa > data/hg19.dict 49 | bwa index data/hg19.fa 50 | export REF="data/hg19.fa" 51 | export DIC="data/hg19.dict" 52 | :<<'```shell' # Ignore this line 53 | ``` 54 | 55 | Last, we download the pre-computed list of phased germline SNPs. Note that differently from the [one](https://github.com/raphael-group/chisel-data/raw/master/demos/completeE/phased.HRC.vcf.gz) obtained through the reccommended instructions (i.e. using BCFtools to call germline SNPs and Eagle2 throught the Michigan Imputation Serverve with HRC panel to phase the SNPs) this file only contains the lables `0|1` or `1|0` for every SNP, which is the minimum requirement for CHISEL. 56 | 57 | ```shell 58 | curl -L 'https://zenodo.org/record/3950299/files/phases.tsv?download=1' > data/phases.tsv 59 | export PHA="data/phases.tsv" 60 | :<<'```shell' # Ignore this line 61 | ``` 62 | 63 | ## Run CHISEL 64 | 65 | We now run the complete pipeline of CHISEL with the corresponding command `chisel_nonormal`. 66 | 67 | ```shell 68 | chisel_nonormal -t ${TUM} -r ${REF} -l ${PHA} --simcov 0.2 --seed 12 69 | exit $? 
70 | ``` 71 | -------------------------------------------------------------------------------- /demos/complete/demo-complete.sh: -------------------------------------------------------------------------------- 1 | # Complete demo of CHISEL 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the complete CHISEL pipeline starting from an exemplary barcoded [BAM file](https://doi.org/10.5281/zenodo.3950299) publicly available. From this directory, simply run this file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. If the custom installation was used, please make sure that you can succesfully run the command `chisel` as well as the required `samtools`, `bcftools`, and `awk`. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required barcoded single-cell and matched-normal BAM files in `data` folder. 
29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading tumor barcoded BAM file 35 | echo "Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary." 36 | curl -L 'https://zenodo.org/record/3950299/files/cells.bam?download=1' > data/cells.bam 37 | curl -L 'https://zenodo.org/record/3950299/files/cells.bam.bai?download=1' > data/cells.bam.bai 38 | export TUM="data/cells.bam" 39 | 40 | # Downloading matched-normal BAM file 41 | echo "Downloading matched-normal BAM file from Zenodo, please be patient as downloading time may vary." 42 | curl -L 'https://zenodo.org/record/3950299/files/normal.bam?download=1' > data/normal.bam 43 | curl -L 'https://zenodo.org/record/3950299/files/normal.bam.bai?download=1' > data/normal.bam.bai 44 | export NOR="data/normal.bam" 45 | :<<'```shell' # Ignore this line 46 | ``` 47 | 48 | Next the corresponding reference genome is downloaded and unpacked 49 | 50 | ```shell 51 | echo "Downloading human reference genome, please be patient as downloading time may vary." 52 | curl -L https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz | gzip -d > data/hg19.fa 53 | samtools faidx data/hg19.fa 54 | samtools dict data/hg19.fa > data/hg19.dict 55 | export REF="data/hg19.fa" 56 | export DIC="data/hg19.dict" 57 | :<<'```shell' # Ignore this line 58 | ``` 59 | 60 | Last, we download the pre-computed list of phased germline SNPs. Note that differently from the [one](https://github.com/raphael-group/chisel-data/raw/master/demos/completeE/phased.HRC.vcf.gz) obtained through the reccommended instructions (i.e. using BCFtools to call germline SNPs and Eagle2 throught the Michigan Imputation Serverve with HRC panel to phase the SNPs) this file only contains the lables `0|1` or `1|0` for every SNP, which is the minimum requirement for CHISEL. 
61 | 62 | ```shell 63 | curl -L 'https://zenodo.org/record/3950299/files/phases.tsv?download=1' > data/phases.tsv 64 | export PHA="data/phases.tsv" 65 | :<<'```shell' # Ignore this line 66 | ``` 67 | 68 | ## Run CHISEL 69 | 70 | We now run the complete pipeline of CHISEL with the corresponding command `chisel`. 71 | 72 | ```shell 73 | chisel -t ${TUM} -n ${NOR} -r ${REF} -l ${PHA} --seed 12 74 | exit $? 75 | ``` 76 | -------------------------------------------------------------------------------- /demos/old_demos/completeE.sh: -------------------------------------------------------------------------------- 1 | # Demo for WGS data from a cancer patient 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the complete CHISEL pipeline starting from the barcoded [BAM file](https://support.10xgenomics.com/single-cell-dna/datasets/1.0.0/breast_tissue_E_2k) publicly available from 10X Genomics archive and obtained through 10X Chromium Single Cell CNV Solution for section E of a breast tumor. Simply run this file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed, such that the python environment called by the command `python2.7` has the required packages, and both `samtools` and `awk` are available in `${PATH}`. 
9 | 10 | ```shell 11 | export CHISEL_HOME="../../" # This is CHISEL home by default, update if needed 12 | :<<'```shell' # Ignore this line 13 | ``` 14 | 15 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 16 | ```shell 17 | set -e 18 | set -o xtrace 19 | PS4='[\t]' 20 | :<<'```shell' # Ignore this line 21 | ``` 22 | 23 | ## Downloading of data 24 | 25 | The demo auomatically downloads the required barcoded single-cell and matched-normal BAM files from 10X Genomics archive through the following commands in `data` folder. 26 | 27 | ```shell 28 | # Creating data folder 29 | mkdir -p data 30 | 31 | # Downloading barcoded single-cell BAM of breast tumor section E 32 | wget -N -c http://s3-us-west-2.amazonaws.com/10x.files/samples/cell-dna/1.0.0/breast_tissue_E_2k/breast_tissue_E_2k_possorted_bam.bam -P data/ 33 | wget -N -c http://cf.10xgenomics.com/samples/cell-dna/1.0.0/breast_tissue_E_2k/breast_tissue_E_2k_possorted_bam.bam.bai -P data/ 34 | export TUM="data/breast_tissue_E_2k_possorted_bam.bam" 35 | 36 | # Downloading matched-normal BAM file as section A 37 | wget -N -c http://s3-us-west-2.amazonaws.com/10x.files/samples/cell-dna/1.0.0/breast_tissue_A_2k/breast_tissue_A_2k_possorted_bam.bam -P data/ 38 | wget -N -c http://cf.10xgenomics.com/samples/cell-dna/1.0.0/breast_tissue_A_2k/breast_tissue_A_2k_possorted_bam.bam.bai -P data/ 39 | export NOR="data/breast_tissue_A_2k_possorted_bam.bam" 40 | :<<'```shell' # Ignore this line 41 | ``` 42 | 43 | Next the corresponding reference genome is downloaded and unpacked 44 | 45 | ```shell 46 | export REF="data/refdata-GRCh38-2.1.0/fasta/genome.fa" 47 | export DIC="data/refdata-GRCh38-2.1.0/fasta/genome.dict" 48 | if [[ ! -f "${REF}" || ! 
-f "${DIC}" ]]; then 49 | wget -N -c http://cf.10xgenomics.com/supp/genome/refdata-GRCh38-2.1.0.tar.gz -P data/ 50 | tar -xzvf data/refdata-GRCh38-2.1.0.tar.gz -C data/ 51 | rm -f data/refdata-GRCh38-2.1.0.tar.gz 52 | fi 53 | :<<'```shell' # Ignore this line 54 | ``` 55 | 56 | Last, we download the pre-computed VCF with phased SNPs; the VCF has been computed following the reccommended instructions, using BCFtools to call germline SNPs and Eagle2 throught the Michigan Imputation Serverve with HRC panel to phase the SNPs. 57 | 58 | ```shell 59 | wget -N -c https://github.com/raphael-group/chisel-data/raw/master/demos/completeE/phased.HRC.vcf.gz -P data/ 60 | gzip -f -d data/phased.HRC.vcf.gz 61 | export PHA="data/phased.HRC.vcf" 62 | :<<'```shell' # Ignore this line 63 | ``` 64 | 65 | ## Run CHISEL 66 | 67 | We now run the complete pipeline of CHISEL with the corresponding command `chisel`. 68 | 69 | ```shell 70 | chisel -t ${TUM} -n ${NOR} -r ${REF} -l ${PHA} --seed 25 71 | exit $? 72 | ``` 73 | 74 | 75 | -------------------------------------------------------------------------------- /demos/plottingE/demo-plottingE.sh: -------------------------------------------------------------------------------- 1 | # Demo for WGS data from a cancer patient 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the CHISEL pipeline starting from the inferred copy numbers (typically the file `calls.tsv` in the folder `calls`) and identified clones (typically the file `mapping.tsv` in the folder `clones`) for tumor section E of breast cancer patient S0, and thus produces the corresponding plots. The demo represent a guided example for the command `chisel-plotting` which allows to re-run the plot generation and can be used to try different parameters to obtain the best format for the results. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. 
The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required inferred copy numbers already computed by the complete CHISEL pipeline in `data` folder. 29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading copy numbers and clones inferred by CHISEL for tumor section E 35 | curl -L https://github.com/raphael-group/chisel-data/raw/master/demos/cloningE/calls.tsv.gz > data/calls.tsv.gz 36 | gzip -df data/calls.tsv.gz 37 | export INPUT="data/calls.tsv" 38 | 39 | curl -L https://github.com/raphael-group/chisel-data/raw/master/demos/plottingE/mapping.tsv.gz > data/mapping.tsv.gz 40 | gzip -df data/mapping.tsv.gz 41 | export MAPP="data/mapping.tsv" 42 | :<<'```shell' # Ignore this line 43 | ``` 44 | 45 | ## Run CHISEL 46 | 47 | We now run the command CHISEL command that starts from the inference of copy numbers from RDRs and BAFs. 48 | 49 | ```shell 50 | chisel_plotting ${INPUT} -m ${MAPP} 51 | exit $? 
52 | ``` 53 | -------------------------------------------------------------------------------- /demos/pseudonormal/demo-pseudonormal.sh: -------------------------------------------------------------------------------- 1 | # Demo for generating pseudo matched-normal sample 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the CHISEL command for generating a pseudo matched-normal sample starting from an exemplary barcoded [BAM file](https://doi.org/10.5281/zenodo.3952985) publicly available. Simply run this file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. If the custom installation was used, please make sure that you can succesfully run the command `chisel_pseudonormal` as well as the required `samtools`, `bcftools`, and `awk`. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required barcoded single-cell BAM file from 10X Genomics archive through the following commands in `data` folder. 
29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading tumor barcoded BAM file 35 | echo "Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary." 36 | curl -L https://zenodo.org/record/3952985/files/cells.bam?download=1 > data/cells.bam 37 | curl -L https://zenodo.org/record/3952985/files/cells.bam.bai?download=1 > data/cells.bam.bai 38 | export BAM="data/cells.bam" 39 | :<<'```shell' # Ignore this line 40 | ``` 41 | 42 | Last, the corresponding reference genome is downloaded and unpacked 43 | 44 | ```shell 45 | echo "Downloading human reference genome, please be patient as downloading time may vary." 46 | curl -L https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz | gzip -d > data/hg19.fa 47 | samtools faidx data/hg19.fa 48 | samtools dict data/hg19.fa > data/hg19.dict 49 | export REF="data/hg19.fa" 50 | export DIC="data/hg19.dict" 51 | :<<'```shell' # Ignore this line 52 | ``` 53 | 54 | ## Run CHISEL 55 | 56 | We now run the command `chisel_pseudonormal` of CHISEL for generating a pseudo mathched-normal sample by extracting the sequencing reads from diploid cells in the provided barcoded BAM file `${BAM}`. 57 | Specifically, we are required to specify the reference genome `${REF}` and we use the default values of all parameters. 58 | By default, temporary files and the sorted and indexed output BAM `pseudonormal.bam` will be generated in the current directory. 59 | 60 | ```shell 61 | chisel_pseudonormal ${BAM} -r ${REF} 62 | exit $? 63 | ``` 64 | -------------------------------------------------------------------------------- /doc/chisel-calling.md: -------------------------------------------------------------------------------- 1 | # Command `chisel-calling.py` 2 | 3 | The CHISEL command `chisel-calling.py` runs the CHISEL pipeline starting from the already estimated RDRs and BAFs. 
4 | To do this, this command requires to have the folder `combo` with the files and formats described [here](chisel.md). 5 | This command is particularly useful if the user would like to re-run the CHISEL pipeline without the extensive re-computation of RDRs and BAFs but using different values of some of the main parameters, including: 6 | 7 | 1. `-A`: varying sensitivity of the model selection criterion for cell ploidy: in case of particularly noisy datasets or with particularly high variance, the estimation of cell ploidy may be more challenging and it may be needed to increase the sensitivity of the selection (e.g. 2, 3, 4, ...); 8 | 2. `-K`: varying the maximum number of clusters allowed in the global clustering of RDRs and BAFs: choosing values lower than the default (i.e. 100) generally allows to reduce presence of noisy CNAs at the cost of lower resolution; 9 | 3. `-P`: varying the maximum value allowed for cell ploidy, since the default is 4 which generally corresponds to at most one WGD. 10 | 11 | -------------------------------------------------------------------------------- /doc/chisel-cartoon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/chisel/24e227df0aba769c4be241bcb8548575b93a9556/doc/chisel-cartoon.png -------------------------------------------------------------------------------- /doc/chisel-cloning.md: -------------------------------------------------------------------------------- 1 | # Command `chisel-cloning.py` 2 | 3 | The CHISEL command `chisel-cloning.py` runs the CHISEL pipeline starting from the already estimated allele- and haplotype-specific copy numbers. 4 | To do this, this command requires to have the folder `calls` with the files and formats described [here](chisel.md). 
5 | This command is particularly useful if the user would like to re-run CHISEL's inference of tumor clones to adapt to datasets with particularly high levels of noise and variance. 6 | Examples of usage of this command for QC are described [here](../guides/clones.md). 7 | -------------------------------------------------------------------------------- /doc/chisel-plotting.md: -------------------------------------------------------------------------------- 1 | # Command `chisel-plotting.py` 2 | 3 | The CHISEL command `chisel-plotting.py` generates several useful plots, which can be used to inspect the inferred results or for quality control. 4 | More specifically, this command generates 15 plots. 5 | 6 | ## Main plots 7 | 8 | ### Allele-specific copy numbers 9 | 10 | This plot (`allelecn.png`) depicts the allele-specific copy numbers inferred by CHISEL for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 11 | The colors of the heatmap represent the different pairs of allele-specific copy numbers and a full description of the color map used is available in the [CHISEL manuscript](https://doi.org/10.1101/837195). 12 | 13 | ### Corrected allele-specific copy numbers 14 | 15 | This plot (`allelecn-corrected.png`) depicts the allele-specific copy numbers inferred by CHISEL and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 16 | The colors of the heatmap represent the different pairs of allele-specific copy numbers and a full description of the color map used is available in the [CHISEL manuscript](https://doi.org/10.1101/837195). 
17 | 18 | ### Haplotype-specific copy numbers 19 | 20 | This plot (`haplotypecn.png`) depicts the haplotype-specific copy numbers inferred by CHISEL for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 21 | The colors of the heatmap represent the haplotype of the allele with fewer copies, such that green and magenta represent haplotype A and B, respectively. 22 | Note that balanced regions (allele with the same number of copies) are represented in white. 23 | Further descriptions of the color map used are available in the [CHISEL manuscript](https://doi.org/10.1101/837195). 24 | 25 | ### Corrected haplotype-specific copy numbers 26 | 27 | This plot (`haplotypecn-corrected.png`) depicts the haplotype-specific copy numbers inferred by CHISEL and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 28 | The colors of the heatmap represent the haplotype of the allele with fewer copies, such that green and magenta represent haplotype A and B, respectively. 29 | Note that balanced regions (allele with the same number of copies) are represented in white. 30 | Further descriptions of the color map used are available in the [CHISEL manuscript](https://doi.org/10.1101/837195). 31 | 32 | ## Useful additional plots 33 | 34 | ### BAF and RDR plots 35 | 36 | This plot (`rbplot_mirrored.png`) shows the global clusters of RDRs and BAFs inferred for a random sample of a certain number of cells (by default 20 cells). 
37 | Each plot corresponds to a different cell, with each plot depicting the bins (each point) which are represented by the corresponding values of `|0.5 - mirrored BAF|` (x-axis) and RDR (y-axis), and are colored according to the corresponding cluster; note that colors are consistent across cells. 38 | 39 | ### Clustered RDR 40 | 41 | This plot (`crdr.png`) shows the estimated RDR and their cluster for a random sample of a certain number of cells (by default 20 cells). 42 | Each plot corresponds to a different cell, with each plot depicting the bins (each point) which are represented by the corresponding values of RDR (y-axis) along the entire genome (x-axis) and are colored according to the corresponding cluster; note that colors are consistent across cells. 43 | 44 | ### Clustered mirrored BAF 45 | 46 | This plot (`cbaf.png`) shows the estimated BAF and their cluster for a random sample of a certain number of cells (by default 20 cells). 47 | Each plot corresponds to a different cell, with each plot depicting the bins (each point) which are represented by the corresponding values of `|0.5 - mirrored BAF|` (y-axis) along the entire genome (x-axis) and are colored according to the corresponding cluster; note that colors are consistent across cells. 48 | 49 | ### Total copy numbers 50 | 51 | This plot (`totalcn.png`) is an heatmap that shows the total copy numbers inferred by CHISEL for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 52 | Each point of the heatmap this represents the total copy numbers inferred by CHISEL, such that grey represents 2 copies, blue colors represent <2 copies with darker colors corresponding to smaller values, and red colors represent >2 copies with darker colors corresponding to higher values. 
53 | 54 | ### Corrected total copy numbers 55 | 56 | This plot (`totalcn-corrected.png`) is an heatmap that shows the total copy numbers inferred by CHISEL and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 57 | Each point of the heatmap this represents the total copy numbers inferred by CHISEL, such that grey represents 2 copies, blue colors represent <2 copies with darker colors corresponding to smaller values, and red colors represent >2 copies with darker colors corresponding to higher values. 58 | 59 | ### LOH 60 | 61 | This plot (`loh.png`) is an heatmap that shows the LOH inferred by CHISEL for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 62 | Each point of the heatmap is colored according to the absence (white) or presence (black) of a LOH in the corresponding cell and bin. 63 | 64 | ### Corrected LOH 65 | 66 | This plot (`loh-corrected.png`) is an heatmap that shows the LOH inferred by CHISEL and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 67 | Each point of the heatmap is colored according to the absence (white) or presence (black) of a LOH in the corresponding cell and bin. 68 | 69 | ### A-specific copy numbers 70 | 71 | This plot (`Aspecificcn.png`) is an heatmap that shows the copy numbers inferred by CHISEL for haplotype A for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 72 | The colors of the heatmap are the same for the total copy-numbers (see above). 
73 | 74 | ### Corrected A-specific copy numbers 75 | 76 | This plot (`Aspecificcn-corrected.png`) is an heatmap that shows the copy numbers inferred by CHISEL for haplotype A and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 77 | The colors of the heatmap are the same for the total copy-numbers (see above). 78 | 79 | ### B-specific copy numbers 80 | 81 | This plot (`Bspecificcn.png`) is an heatmap that shows the copy numbers inferred by CHISEL for haplotype B for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 82 | The colors of the heatmap are the same for the total copy-numbers (see above). 83 | 84 | ### Corrected B-specific copy numbers 85 | 86 | This plot (`Bspecificcn-corrected.png`) is an heatmap that shows the copy numbers inferred by CHISEL for haplotype B and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 87 | The colors of the heatmap are the same for the total copy-numbers (see above). 88 | 89 | 90 | -------------------------------------------------------------------------------- /doc/chisel-pseudonormal.md: -------------------------------------------------------------------------------- 1 | # Command `chisel-pseudonormal.py` 2 | 3 | The CHISEL command `chisel-pseudonormal.py` implements the method integrated in CHISEL for generating a pseudo matched-normal sample by extracting diploid cells from a barcoded BAM file. 4 | The command simply required as input a barcoded BAM file and the corresponding reference genome; detailed descriptions of the required input are available [here](../man/chisel-pseudonormal.md). 
5 | After the execution, the command generates a new BAM file by only merging sequencing reads from diploid cells; thus the resulting BAM file can be used as a pseudo matched-normal sample to run the entire [CHISEL's pipeline](chisel.md). -------------------------------------------------------------------------------- /doc/chisel.md: -------------------------------------------------------------------------------- 1 | # Command `chisel` 2 | 3 | The CHISEL command `chisel` as well as the command `chisel_nonormal` runs the entire CHISEL pipeline starting from the required inputs (e.g. BAM files). 4 | During the execution, the command creates six folders which contain the temporary and final results produced by the 5 distinct steps of CHISEL. 5 | 6 | ## Estimating RDRs 7 | 8 | This step aims to estimate the RDR for every genomic bin in each cell. 9 | Moreover, it selects the barcodes that correspond to cells using a specified threshold on the minimum number of reads. 10 | This step creates a folder `rdr` with three files: 11 | 12 | 1. `total.tsv`: a TSV dataframe containing the number of sequencing reads observed for every selected cell. More specifically, the fields are: 13 | 1. `CELL`: the name of a cell or the name `normal` indicating the matched-normal sample; 14 | 2. `TOTAL`: the total number of sequencing reads observed for the cell. 15 | 2. `rdr.tsv`: a TSV dataframe containing the estimated RDRs with the following fields: 16 | 1. `CHROMOSOME`: the name of a chromosome; 17 | 2. `START`: the starting coordinate of a genomic bin; 18 | 3. `END`: the ending coordinate of the genomic bin; 19 | 4. `CELL`: the name of a cell; 20 | 5. `NORMAL`: the number of sequencing reads from the matched-normal sample for the bin; 21 | 5. `COUNT`: the number of sequencing reads from the cell `CELL` in the bin; 22 | 6. `RDR`: the estimated RDR. 23 | 3. `log`: a logging file of the execution of this step (optional). 
24 | 25 | ## Estimating BAF 26 | 27 | This step aims to estimate the BAF for phased germline heterozygous SNPs in the selected cells. 28 | This step creates a folder `baf` with two files: 29 | 30 | 1. `baf.tsv`: a TSV dataframe with the following fields: 31 | 1. `CHROMOSOME`: the name of a chromosome; 32 | 2. `POS`: a genomic position in the chromosome `CHROMOSOME` for a germline heterozygous SNP; 33 | 3. `CELL`: the name of a cell; 34 | 4. `A-COUNT`: the number of observed sequencing reads from the haplotype A of the SNP; 35 | 4. `B-COUNT`: the number of observed sequencing reads from the haplotype B of the SNP. 36 | 2. `log`: a logging file of the execution of this step (optional). 37 | 38 | ## Combining RDRs and BAFs 39 | 40 | This step aims to combine the RDRs and BAFs for the selected bins in the selected cells. 41 | This step creates a folder `combo` with two files: 42 | 43 | 1. `combo.tsv`: a TSV dataframe with the following fields: 44 | 1. `CHROMOSOME`: the name of a chromosome; 45 | 2. `START`: the starting coordinate of a genomic bin; 46 | 3. `END`: the ending coordinate of the genomic bin; 47 | 4. `CELL`: the name of a cell; 48 | 5. `NORMAL`: the number of sequencing reads from the matched-normal sample for the bin; 49 | 6. `COUNT`: the number of sequencing reads from the cell `CELL` in the bin; 50 | 7. `RDR`: the estimated RDR for the bin in the cell `CELL`; 51 | 8. `A-COUNT`: the number of observed sequencing reads from the haplotype A of the SNP; 52 | 9. `B-COUNT`: the number of observed sequencing reads from the haplotype B of the SNP; 53 | 10. `BAF`: the B-allele frequency estimated for the bin in the cell `CELL`. 54 | 2. `log`: a logging file of the execution of this step (optional). 55 | 56 | ## Calling 57 | 58 | This step aims to infer the ploidy of each cell and, after global clustering of RDRs and BAFs, to infer the allele- and haplotype-specific copy numbers for every bin in every cell. 
59 | This step creates a folder `calls` with two files: 60 | 61 | 1. `calls.tsv`: a TSV dataframe with the following fields: 62 | 1. `CHROMOSOME`: the name of a chromosome; 63 | 2. `START`: the starting coordinate of a genomic bin; 64 | 3. `END`: the ending coordinate of the genomic bin; 65 | 4. `CELL`: the name of a cell; 66 | 5. `NORMAL`: the number of sequencing reads from the matched-normal sample for the bin; 67 | 6. `COUNT`: the number of sequencing reads from the cell `CELL` in the bin; 68 | 7. `RDR`: the estimated RDR for the bin in the cell `CELL`; 69 | 8. `A-COUNT`: the number of observed sequencing reads from the haplotype A of the SNP; 70 | 9. `B-COUNT`: the number of observed sequencing reads from the haplotype B of the SNP; 71 | 10. `BAF`: the B-allele frequency estimated for the bin in the cell `CELL`; 72 | 11. `ALLELECN`: dash-separated ordered pair of the inferred haplotype-specific copy numbers for the bin in the cell `CELL`. 73 | 2. `log`: a logging file of the execution of this step (optional). 74 | 75 | ## Cloning 76 | 77 | This step aims to infer the clones by clustering cells based on the inferred haplotype-specific copy numbers and selecting the clusters that correspond to actual clones. 78 | This step creates a folder `clones` with two files: 79 | 80 | 1. `mapping.tsv`: a TSV dataframe with the following fields: 81 | 1. `CELL`: the name of a selected cell; 82 | 2. `CLUSTER`: the cluster where the cell `CELL` has been assigned; 83 | 3. `CLONE`: the clone of the cell `CELL`, however it corresponds to `None` if the cell is classified as noisy. 84 | 2. `log`: a logging file of the execution of this step (optional). 85 | 86 | Moreover, this step introduces a new field (right-most field) in the file `calls.tsv` which is `CORRECTED_CNS` and corresponds to the final haplotype-specific copy numbers estimated after consensus of cells in the same clone. 
87 | 88 | ## Plotting 89 | 90 | This step generates several useful plots about the results, which are fully described [here](chisel-plotting.md). 91 | -------------------------------------------------------------------------------- /guides/clones.md: -------------------------------------------------------------------------------- 1 | # Identification of clones 2 | 3 | CHISEL infers clones by clustering cells based on the inferred haplotype-specific copy numbers and selecting the clusters that correspond to actual clones. 4 | This selection is indeed required because the data are noisy and minor differences between cells may indicate errors as well as small clusters may indicate noisy cells with bad sequencing. 5 | This identification is controlled by two parameters: 6 | - `f`: the maximum fraction of the genome with different haplotype-specific copy numbers for cells in the same clone (default: 0.06); 7 | - `s`: the minimum number of cells in a clone (default: 14). 8 | 9 | The values of these two parameters have been calibrated for the expected number of cells and sequencing coverage of 10X Genomics datasets. 10 | However, when analyzing datasets with different number of cells, different sequencing coverage, or particularly noisy datasets, the default values of these parameters may not be appropriate. 11 | Therefore, when the user observes an outlying high number of noisy cells or too few inferred clones (even 0), it is important to vary these values to explore different solutions. 12 | 13 | Given the inferred clones with the previous parameters, there is one additional parameter that can be used to adjust the classification of noisy cells: `-r`, which controls the refinement of the identified clones and allows the user to include more "noisy" cells into the identified clones. Specifically, every cell that has a fraction of the genome with different haplotype-specific copy numbers lower than the value of `r` will be included into the clones. 
Therefore, the user can use increasingly higher values to force the inclusion of more noisy cells into the inferred clones, for example `-r 0.2`, `-r 0.3`, `-r 0.4`, etc. Note that `-r 1` will force every cell to be assigned to a clone. 14 | 15 | These tasks can be performed very efficiently and easily by using the [CHISEL command `chisel-cloning.py`](../doc/chisel-cloning.md), which allows the user to only re-execute the inference of clones and the generation of plots very efficiently from the already inferred haplotype-specific copy numbers. 16 | As such, using this command, the user can attempt to use different combinations of the parameters, varying the maximum difference `f` (e.g. `-f 0.1`, `-f 0.12`, `-f 0.15`, ...) and the minimum number `-s` of cells to select the clones (either increasing like `-s 20`, `-s 30`, ... or decreasing like `-s 3`, `-s 2`, according to the total number of cells). 17 | More details on adjusting and selecting reasonable values of these parameters are available in the [CHISEL's manuscript](https://doi.org/10.1101/837195). 18 | -------------------------------------------------------------------------------- /guides/clustering.md: -------------------------------------------------------------------------------- 1 | # Clustering 2 | 3 | CHISEL globally clusters the estimated RDRs and BAFs by using a k-means algorithm and model-selection criterion based on the elbow method to select the best number of clusters (further details are reported in the [CHISEL's manuscript](https://doi.org/10.1101/837195)). 4 | In order to do this, CHISEL fixes the maximum number of clusters (default value is 100). 5 | However, the value can be too high when analyzing very noisy datasets since the high levels of noise in the data can be misinterpreted and may lead to overfitting. 
6 | Therefore, the user can assess the levels of variance and noise by using the [BAF and RDR plots](../doc/chisel-plotting.md): in particular, high levels of noise can be immediately noted when a clear clustering structure is missing from such plots. 7 | As such, when analyzing datasets with very high levels of variance, the user can lower the maximum number of clusters to avoid overfitting. 8 | Another possible signal that may indicate overfitting is for example the inference of many outlying and noisy CNAs, i.e. observing many cells with isolated and small CNAs. 9 | For such QC purposes, the user can use the CHISEL [command `chisel-calling.py`](../doc/chisel-calling.md) to vary the maximum number of clusters with the argument `-K` to re-run the CHISEL inference without the need of re-estimating RDRs and BAFs (which generally is the most time-consuming step). 10 | The user can thus quantify the presence of noisy CNAs when varying the value of this parameter, for example `-K 80`, `-K 60`, `-K 40`... 11 | -------------------------------------------------------------------------------- /guides/ploidy.md: -------------------------------------------------------------------------------- 1 | # Ploidy selection 2 | 3 | CHISEL infers the ploidy of each cell from the estimated RDRs and BAFs by using a model-selection criterion which has been calibrated for observing a sufficient number of reads on average across all bins and cells (further details are reported in the [CHISEL's manuscript](https://doi.org/10.1101/837195)). 4 | In the case of particularly noisy cells or datasets with a particularly low sequencing coverage or high variance, the inference can thus be more challenging. 5 | A sign which generally indicates potential issues in the inference of cell ploidies is the observation of a substantial number of cells with different ploidies (and thus with completely different copy numbers). 
6 | However, CHISEL provides a parameter to adjust the sensitivity of the model-selection criterion for dealing with these cases. 7 | 8 | For QC purposes, the recommendation is to analyze the allele-specific copy numbers inferred by CHISEL, for example using the corresponding [plots](../doc/chisel-plotting.md). 9 | If a substantial number of cells with different ploidies has been inferred, the recommendation is to analyze how the results change by re-running the inference of copy numbers varying the sensitivity of the model-selection criterion. 10 | Specifically, the CHISEL [command `chisel-calling.py`](../doc/chisel-calling.md) can be used to do this very efficiently by varying the sensitivity with the argument `-A` to re-run the CHISEL inference without the need of re-estimating RDRs and BAFs (which generally is the most time-consuming step). 11 | The user can thus analyse the inferred tumor ploidies by increasing the sensitivity with values of `-A 2`, `-A 3`, `-A 4`... 12 | The inference of different ploidies is well supported by the data if the results do not substantially change when increasing the sensitivity, otherwise the results obtained with higher sensitivity are more likely. 13 | -------------------------------------------------------------------------------- /install_full.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | # Finding whether os is Linux or MacOSX 7 | OS=$(uname -s) 8 | case ${OS} in 9 | Linux*) OS=Linux;; 10 | Darwin*) OS=MacOSX;; 11 | *) echo "Unknown OS ${OS}; please use manual installation." 
&& exit 1;; 12 | esac 13 | 14 | # Finding whether machine is 32bit or 64bit 15 | case ${OS} in 16 | Linux) 17 | VER=$(uname -i) 18 | case ${VER} in 19 | x86_64) MINICONDA="https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh";; 20 | *) MINICONDA="https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86.sh";; 21 | esac;; 22 | MacOSX) 23 | VER=$(uname -m) 24 | case ${VER} in 25 | *) MINICONDA="https://repo.anaconda.com/miniconda/Miniconda2-latest-MacOSX-x86_64.sh";; 26 | esac;; 27 | *) 28 | echo "Unknown OS ${OS}; please use manual installation." && exit 1;; 29 | esac 30 | 31 | # Installing Miniconda 32 | CHISEL_HOME=$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 33 | cd ${CHISEL_HOME} 34 | curl -L ${MINICONDA} > miniconda.sh 35 | rm -rf ./conda/ 36 | bash miniconda.sh -b -f -p ./conda/ 37 | export CONDA_HOME=${CHISEL_HOME}/conda/bin 38 | 39 | # Installing chisel 40 | ${CONDA_HOME}/conda config --add channels defaults 41 | ${CONDA_HOME}/conda config --add channels bioconda 42 | ${CONDA_HOME}/conda config --add channels conda-forge 43 | ${CONDA_HOME}/conda create -n chisel chisel -y 44 | 45 | # Activating CHISEL 46 | source ${CONDA_HOME}/activate chisel 47 | echo -e "\nInstallation was succesfull and CHISEL is ready!\nPlease remember to run the following command now and during every new session before using CHISEL:\n\n\t\tsource ${CONDA_HOME}/activate chisel\n\n" 48 | -------------------------------------------------------------------------------- /man/chisel-bedding.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel_bedding [-h] [-x RUNDIR] [--rawcalls] [--noextending] [-j JOBS] 3 | [INPUT] 4 | 5 | CHISEL command to generate a BED file for each cell with the corresponding 6 | CHISEL's results. 
7 | 8 | positional arguments: 9 | INPUT Input file with inferred copy numbers (default: 10 | calls/calls.tsv) 11 | 12 | optional arguments: 13 | -h, --help show this help message and exit 14 | -x RUNDIR, --rundir RUNDIR 15 | Running directory (default: current directory) 16 | --rawcalls Use raw copy numbers instead of consensus corrected 17 | ones (default: False) 18 | --noextending Merge consecutive bins only if they are neighboring 19 | (default: False, segments are extended to fill gaps) 20 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 21 | number of available processors) 22 | ``` 23 | -------------------------------------------------------------------------------- /man/chisel-calling.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel-calling.py [-h] [-x RUNDIR] [-A SENSITIVITY] [-P MAXPLOIDY] 3 | [-K UPPERK] [--seed SEED] [-j JOBS] 4 | [INPUT] 5 | 6 | CHISEL command to re-run the inference of allele- and haplotype-specific copy 7 | numbers, cell clustering, and plotting. This steps starts from estimated RDRs 8 | and BAFs. 9 | 10 | positional arguments: 11 | INPUT Input file with combined RDR and BAF per bin and per 12 | cell (default: combo/combo.tsv) 13 | 14 | optional arguments: 15 | -h, --help show this help message and exit 16 | -x RUNDIR, --rundir RUNDIR 17 | Running directory (default: current directory) 18 | -A SENSITIVITY, --sensitivity SENSITIVITY 19 | Sensitivity of model selection for ploidy (default: 1, 20 | increase this parameter to lower sensitivity to noisy 21 | data, adjust this value (e.g. 2, 4, ..., 10, ...) to 22 | better deal with high-variance data (e.g. low 23 | coverage, small number of cells, low number of phased 24 | SNPs, etc...) 
25 | -P MAXPLOIDY, --maxploidy MAXPLOIDY 26 | Maximum total copy number to consider for balanced 27 | cluster (default: 4, corresponding to a WGD) 28 | -K UPPERK, --upperk UPPERK 29 | Maximum number of bin clusters (default: 100, use 0 to 30 | consider maximum number of clusters) 31 | --seed SEED Random seed for replication (default: None) 32 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 33 | number of available processors) 34 | ``` -------------------------------------------------------------------------------- /man/chisel-cloning.md: -------------------------------------------------------------------------------- 1 | ```chisel 2 | usage: chisel-cloning.py [-h] [-x RUNDIR] [-f MAXDIFF] [-s MINSIZE] 3 | [-r REFINEMENT] [--seed SEED] 4 | [INPUT] 5 | 6 | CHISEL command to run the pipeline starting from inferred copy numbers. 7 | 8 | positional arguments: 9 | INPUT Input file with combined RDR and BAF per bin and per 10 | cell 11 | 12 | optional arguments: 13 | -h, --help show this help message and exit 14 | -x RUNDIR, --rundir RUNDIR 15 | Running directory (default: current directory) 16 | -f MAXDIFF, --maxdiff MAXDIFF 17 | Maximum haplotype-specific distance between the genome 18 | of cells in the same clone (default: 0.06, when -1 is 19 | chosen the maximum cluster method of SciPy is used) 20 | -s MINSIZE, --minsize MINSIZE 21 | Minimum number of cells in a subpopulation to define a 22 | clone (default: 14) 23 | -r REFINEMENT, --refinement REFINEMENT 24 | Maximum difference to assign noisy cells to the 25 | closest clone (default: 0.0, note that 1.0 can be used 26 | to force the assigment of all cells) 27 | --seed SEED Random seed for replication (default: None) 28 | ``` -------------------------------------------------------------------------------- /man/chisel-plotting.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel-plotting.py [-h] [-m CLONEMAP] [-f FIGFORMAT] [-s 
SAMPLE] 3 | [--excludenoisy] [--gridsize GRIDSIZE] 4 | [--plotsize PLOTSIZE] [--clussize CLUSSIZE] 5 | [--xmax XMAX] [--xmin XMIN] [--ymax YMAX] 6 | [--ymin YMIN] 7 | [INPUT] 8 | 9 | CHISEL command to re-create the plots. 10 | 11 | positional arguments: 12 | INPUT Input file with inferred copy numbers (default: 13 | calls/calls.tsv) 14 | 15 | optional arguments: 16 | -h, --help show this help message and exit 17 | -m CLONEMAP, --clonemap CLONEMAP 18 | Clone map (default: not used, the cells will be 19 | clustered for plotting purposes) 20 | -f FIGFORMAT, --figformat FIGFORMAT 21 | Format of output figures (default: png, the only other 22 | option is pdf) 23 | -s SAMPLE, --sample SAMPLE 24 | Number of cells to sample (default: 20) 25 | --excludenoisy Exclude noisy cells from plots (default: False) 26 | --gridsize GRIDSIZE Grid dimenstions specified as comma-separated numbers 27 | (default: 12,6) 28 | --plotsize PLOTSIZE Plot dimenstions for RDR-BAF plots, specified as 29 | comma-separated numbers (default: 5,1.5) 30 | --clussize CLUSSIZE Grid dimenstions for clustered plots, specified as 31 | comma-separated numbers (default: 5,3) 32 | --xmax XMAX Maximum x-axis value (default: None) 33 | --xmin XMIN Minimum x-axis value (default: None) 34 | --ymax YMAX Maximum x-axis value (default: None) 35 | --ymin YMIN Minimum x-axis value (default: None) 36 | ``` -------------------------------------------------------------------------------- /man/chisel-prep.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel_prep [-h] [-r REFERENCE] [-x RUNDIR] [-o OUTPUT] 3 | [--rexpname REXPNAME] [--rexpread REXPREAD] 4 | [--noduplicates] [--keeptmpdir] 5 | [--barcodelength BARCODELENGTH] [--bcftools BCFTOOLS] 6 | [--samtools SAMTOOLS] [--bwa BWA] [-j JOBS] [--seed SEED] 7 | INPUT [INPUT ...] 
8 | 9 | CHISEL command to create a barcoded BAM file from single-cell FASTQs (or gz- 10 | compressed FASTQs), single-cell BAMs, or a `RG:Z:`-barcoded BAM files without 11 | `CB:Z:` tags. When single-cell FASTQs or BAMs are provided a CELL name is 12 | assigned to each file (through either filename or table) and the same cell 13 | barcode will be assigned to all corresponding reads, but a different RG tag as 14 | they are considered as different repetitions of sequencing of the same cell. 15 | Specifically, when a table of inputs is not provied, for FASTQs each CELL name 16 | is extracted from the filename through the provided regular expression 17 | (default matches Illumina standard format), for BAMs basename is used as CELL 18 | name. When single-cell FASTQs are provided a READ value is also assigned to 19 | each file (through either filename or table) and files with the same filename 20 | when removing READ values are considered as pairs of sequencing read mates. 21 | Input files, CELL names, and possible READ values can be provided through a 22 | table of inputs. 23 | 24 | positional arguments: 25 | INPUT Input FASTQs, BAMs, or TSV file with different 26 | behaviors: ......................................... 27 | (1) FASTQs -- specified in a directory DIR as 28 | `DIR/*.fastq` or `DIR/*.fastq.gz` -- will be barcoded 29 | and aligned with (optionally) marked duplicates into a 30 | barcoded BAM file; ................................. 31 | (2) BAMs -- specified in a directory DIR as 32 | `DIR/*.bam` -- will be barcoded and aligned with 33 | (optionally) marked duplicates into a barcoded BAM 34 | file; .............................................. 35 | (3) a single BAM file with unique cells names in the 36 | field `RG:Z:` will be converted into a barcoded BAM 37 | file with the additional `CB:Z:` tag; .............. 
38 | (4) a tab-separated table of inputs (TSV with optional 39 | header starting with `#`) with two columns: the first 40 | column is an input file (FASTQ or BAM) and the second 41 | column is the corresponding cell name. When FASTQs are 42 | provided, a third column can be optionally specified 43 | to indicate the read name in paired-end sequencing, 44 | e.g., indicating either R1 or R2 for the first or 45 | second mate of paired-end reads, respectively. If a 46 | third column is not present, FASTQs are assumed to be 47 | from single-end sequencing. 48 | 49 | optional arguments: 50 | -h, --help show this help message and exit 51 | -r REFERENCE, --reference REFERENCE 52 | Reference genome, which is mandatory in FASTQ mode 53 | (default: None) 54 | -x RUNDIR, --rundir RUNDIR 55 | Running directory (default: current directory) 56 | -o OUTPUT, --output OUTPUT 57 | Output name in running directory (default: 58 | barcodedcells.bam) 59 | --rexpname REXPNAME Regulare expression to extract cell name from input 60 | FASTQ filenames (default: 61 | `(.*)_S.*_L.*_R[1|2]_001.fastq.*`) 62 | --rexpread REXPREAD Regulare expression to extract cell name from input 63 | FASTQ filenames (default: 64 | `.*_S.*_L.*_(R[1|2])_001.fastq.*`) 65 | --barcodeonly Only compute barcodes but do not run aligning pipeline 66 | (default: False) 67 | --noduplicates Do not perform marking duplicates and recalibration 68 | with Picard tools (default: False) 69 | --keeptmpdir Do not erase temporary directory (default: False) 70 | --barcodelength BARCODELENGTH 71 | Length of barcodes (default: 12) 72 | --bcftools BCFTOOLS Path to the directory to "bcftools" executable 73 | (default: in $PATH) 74 | --samtools SAMTOOLS Path to the directory to "samtools" executable 75 | (default: in $PATH) 76 | --bwa BWA Path to the directory to "bwa" executable (default: in 77 | $PATH) 78 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 79 | number of available processors) 80 | --seed SEED Random 
seed for replication (default: None) 81 | ``` 82 | -------------------------------------------------------------------------------- /man/chisel-pseudonormal.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel-pseudonormal.py [-h] -r REFERENCE [-x RUNDIR] [-e THRESHOLD] 3 | [-b SIZE] [-c CHROMOSOMES] [-m MINREADS] 4 | [--samtools SAMTOOLS] [-j JOBS] 5 | [--tmpdir TMPDIR] [-n NORMAL] 6 | INPUT 7 | 8 | CHISEL command to generate a pseudo-matched normal sample by extracting 9 | diploid cells from a barcoded single-cell BAM file. 10 | 11 | positional arguments: 12 | INPUT Barcoded single-cell BAM file 13 | 14 | optional arguments: 15 | -h, --help show this help message and exit 16 | -r REFERENCE, --reference REFERENCE 17 | Reference genome 18 | -x RUNDIR, --rundir RUNDIR 19 | Running directory (default: current directory) 20 | -e THRESHOLD, --threshold THRESHOLD 21 | Minimum fraction of diploid genome to select diploid 22 | cells (default: 0.9) 23 | -b SIZE, --size SIZE Bin size, with or without "kb" or "Mb" 24 | -c CHROMOSOMES, --chromosomes CHROMOSOMES 25 | Space-separeted list of chromosomes between apices 26 | (default: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 27 | chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 28 | chr18 chr19 chr20 chr21 chr22") 29 | -m MINREADS, --minreads MINREADS 30 | Minimum number total reads to select cells (default: 31 | 100000) 32 | --samtools SAMTOOLS Path to the directory to "samtools" executable, 33 | required in default mode (default: samtools is 34 | directly called as it is in user $PATH) 35 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 36 | number of available processors) 37 | --tmpdir TMPDIR Temporary directory in running directory (default: 38 | _TMP) 39 | -n NORMAL, --normal NORMAL 40 | Name of the generated pseudo matched-normal BAM file 41 | (default: pseudonormal.bam) 42 | ``` 
-------------------------------------------------------------------------------- /man/chisel.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel.py [-h] [-x RUNDIR] -t TUMOR -n NORMAL -r REFERENCE -l 3 | LISTPHASED [-b SIZE] [-k BLOCKSIZE] [-c CHROMOSOMES] 4 | [-m MINREADS] [-p MAXPLOIDY] [-K UPPERK] 5 | [--bcftools BCFTOOLS] [--samtools SAMTOOLS] 6 | [--cellprefix CELLPREFIX] [--cellsuffix CELLSUFFIX] 7 | [--seed SEED] [-j JOBS] 8 | 9 | CHISEL command to run the complete pipeline starting from the 4 required data: 10 | (1) Barcoded single-cell BAM; (2) Matched-normal BAM; (3) Reference genome; 11 | (4) Phased VCF. 12 | 13 | optional arguments: 14 | -h, --help show this help message and exit 15 | -x RUNDIR, --rundir RUNDIR 16 | Running directory (default: current directory) 17 | -t TUMOR, --tumor TUMOR 18 | Barcoded single-cell BAM file 19 | -n NORMAL, --normal NORMAL 20 | Matched-normal BAM file 21 | -r REFERENCE, --reference REFERENCE 22 | Reference genome 23 | -l LISTPHASED, --listphased LISTPHASED 24 | Phased SNPs file (lines of heterozygous germline SNPs 25 | must contain either 0|1 or 1|0) 26 | -b SIZE, --size SIZE Bin size, with or without "kb" or "Mb" 27 | -k BLOCKSIZE, --blocksize BLOCKSIZE 28 | Size of the haplotype blocks (default: 50kb, use 0 to 29 | disable) 30 | -c CHROMOSOMES, --chromosomes CHROMOSOMES 31 | Space-separated list of chromosomes enclosed in quotes 32 | (default: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 33 | chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 34 | chr18 chr19 chr20 chr21 chr22") 35 | -m MINREADS, --minreads MINREADS 36 | Minimum total number of reads to select cells (default: 37 | 100000) 38 | -p MAXPLOIDY, --maxploidy MAXPLOIDY 39 | Maximum total copy number to consider for balanced 40 | cluster (default: 4, corresponding to a WGD) 41 | -K UPPERK, --upperk UPPERK 42 | Maximum number of bin clusters (default: 100, use 0 to 43 | consider maximum number of clusters)
44 | --bcftools BCFTOOLS Path to the directory to "bcftools" executable, 45 | required in default mode (default: bcftools is 46 | directly called as it is in user $PATH) 47 | --samtools SAMTOOLS Path to the directory to "samtools" executable, 48 | required in default mode (default: samtools is 49 | directly called as it is in user $PATH) 50 | --cellprefix CELLPREFIX 51 | Prefix of cell barcode field in SAM format (default: 52 | CB:Z:) 53 | --cellsuffix CELLSUFFIX 54 | Suffix of cell barcode field in SAM format (default: 55 | none) 56 | --seed SEED Random seed for replication (default: None) 57 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 58 | number of available processors) 59 | ``` 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from setuptools import setup 3 | 4 | 5 | setuptools.setup( 6 | name='chisel', 7 | version='1.2', 8 | python_requires='==2.7.*', 9 | packages=['chisel', 'chisel.bin'], 10 | package_dir={'': 'src'}, 11 | author='Simone Zaccaria', 12 | author_email='s.zaccaria@ucl.ac.uk', 13 | description='Copy-number Haplotype Inference in Single-cell by Evolutionary Links', 14 | long_description='https://github.com/raphael-group/chisel', 15 | url='https://github.com/raphael-group/chisel', 16 | install_requires=[ 17 | 'numpy>=1.16.1', 18 | 'scipy>=1.2.1', 19 | 'pandas', 20 | 'seaborn>=0.7.1', 21 | 'statsmodels<=0.10.1' 22 | ], 23 | extras_require={ 24 | 'dev': ['pytest', 'mock'] 25 | }, 26 | license='BSD', 27 | platforms=["Linux", "MacOs", "Windows"], 28 | classifiers=[ 29 | 'Programming Language :: Python :: 2.7', 30 | "Intended Audience :: Science/Research", 31 | "Natural Language :: English", 32 | "Operating System :: MacOS :: MacOS X", 33 | "Operating System :: Microsoft :: Windows", 34 | "Operating System :: POSIX :: Linux", 35 | "Topic :: Scientific/Engineering :: 
Bio-Informatics", 36 | ], 37 | keywords=[ 38 | 'scientific', 39 | 'sequence analysis', 40 | 'cancer', 41 | 'single-cell', 42 | 'DNA', 43 | 'copy-number'], 44 | entry_points={'console_scripts': ['chisel=chisel.bin.chisel_main:main', 45 | 'chisel_nonormal=chisel.bin.chisel_nonormal:main', 46 | 'chisel_preprocess=chisel.bin.chisel_preprocess:main', 47 | 'chisel_nonormal_preprocess=chisel.bin.chisel_nonormal_preprocess:main', 48 | 'chisel_combocall=chisel.bin.chisel_combocall:main', 49 | 'chisel_nonormal_combocall=chisel.bin.chisel_nonormal_combocall:main', 50 | 'chisel_calling=chisel.bin.chisel_calling:main', 51 | 'chisel_cloning=chisel.bin.chisel_cloning:main', 52 | 'chisel_plotting=chisel.bin.chisel_plotting:main', 53 | 'chisel_pseudonormal=chisel.bin.chisel_pseudonormal:main', 54 | 'chisel_prep=chisel.bin.chisel_prep:main', 55 | 'chisel_bedding=chisel.bin.chisel_bedding:main', 56 | 'chisel_rdr=chisel.bin.chisel_rdr:main']} 57 | ) 58 | -------------------------------------------------------------------------------- /src/chisel/Cloner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import sys, os 4 | import argparse 5 | import shutil 6 | import warnings 7 | 8 | from itertools import cycle 9 | from collections import defaultdict 10 | from collections import Counter 11 | 12 | import numpy as np 13 | import scipy 14 | import scipy.cluster 15 | import scipy.cluster.hierarchy as hier 16 | 17 | from Utils import * 18 | 19 | 20 | def parse_args(args): 21 | description = "Infer clones as subpopulations of cells with the same complement of CNAs and outputs a file with the mapping of every cell to the corresponding clone." 
22 | parser = argparse.ArgumentParser(description=description) 23 | parser.add_argument("INPUT", type=str, help="Input file with RDR, BAF, and inferred copy numbers.") 24 | parser.add_argument("-f", "--maxdiff", required=False, type=float, default=0.07, help="Maximum fraction of the genome with different copy-number states allowed in a clone (default: 0.07, when -1 is chosen the maximum cluster method of SciPy is used)") 25 | parser.add_argument("-r", "--refinement", required=False, type=float, default=0.15, help="Maximum difference to assign noisy cells to a clone (default: 0.15)") 26 | parser.add_argument("-s", "--minsize", required=False, type=int, default=14, help="Minimum size of subpopultation to define a clone (default: 14)") 27 | parser.add_argument("-l", "--linkage", required=False, type=str, default='weighted', help="Linkage method to use for the hierarchical clustering (default: weighted, it must be a valid linkage method available in SciPy when using a non-euclidean distance, i.e. 
'single', 'complete', 'average', 'weighted')") 28 | parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: none)") 29 | args = parser.parse_args(args) 30 | 31 | if not os.path.isfile(args.INPUT): 32 | raise ValueError('ERROR: input file does not exist!') 33 | if (not 0.0 <= args.maxdiff <= 1.0) and args.maxdiff != -1: 34 | raise ValueError('ERROR: the maximum different fraction of the genome must be either within [0, 1] or equal to -1!') 35 | if args.refinement is None: 36 | args.refinement = args.maxdiff 37 | if not 0.0 <= args.refinement <= 1.0: 38 | raise ValueError('ERROR: the refinement must be either within [0, 1]!') 39 | if args.minsize <= 0: 40 | raise ValueError('ERROR: the minimum size of subpopulations must be positive!') 41 | if not args.linkage in {'single', 'complete', 'average', 'weighted'}: 42 | raise ValueError('ERROR: the linkage method is invalid or not available for non-euclidean distances!') 43 | if args.seed and args.seed < 0: 44 | raise ValueError("Random seed must be positive or zero!") 45 | else: 46 | np.random.seed(args.seed) 47 | 48 | return { 49 | 'input' : args.INPUT, 50 | 'maxdiff' : args.maxdiff, 51 | 'refinement' : args.refinement, 52 | 'minsize' : args.minsize, 53 | 'linkage' : args.linkage, 54 | 'seed' : args.seed 55 | } 56 | 57 | 58 | def main(args=None, stdout_file=None): 59 | log('Parsing and checking arguments') 60 | args = parse_args(args) 61 | log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]), level='INFO') 62 | 63 | log('Reading input') 64 | cns, pos, cells = reading(args['input']) 65 | 66 | log('Clustering cells in clones') 67 | clus = clustering(cns, pos, cells, args['maxdiff'], args['linkage']) 68 | 69 | log('Selecting clones') 70 | clones = selecting(clus, args['minsize']) 71 | log('Number of identified clones: {}'.format(len(set(clones.values()))), level='INFO') 72 | 73 | if len(clones) > 0 and args['refinement'] >= 0.0: 74 | 
log('Refining clustering') 75 | clones, clus = refining(cns, clus, clones, args['refinement']) 76 | log('Number of discarded cells: {} over {} in total'.format(len(set(cells) - set(clones.keys())), len(set(cells))), level='INFO') 77 | 78 | log('Profiling clones') 79 | profiles = profiling(cns, clus) 80 | 81 | log('Writing clone map') 82 | header = '\t'.join(['#CELL', 'CLUSTER', 'CLONE']) 83 | if stdout_file is not None: 84 | with open(stdout_file, 'w') as f: 85 | f.write(header + '\n') 86 | for c in cells: 87 | f.write('\t'.join(map(str, [c, clus[c], 'Clone{}'.format(clones[c]) if c in clones else 'None'])) + '\n') 88 | else: 89 | print header 90 | for c in cells: 91 | print '\t'.join(map(str, [c, clus[c], 'Clone{}'.format(clones[c]) if c in clones else 'None'])) 92 | 93 | log('Writing clone-corrected copy numbers in provided input') 94 | ftmp = args['input'] + '_TMP' 95 | assert not os.path.isfile(ftmp), "Temporary file {} does already exist!".format(ftmp) 96 | form = (lambda p : ((p[0], int(p[1]), int(p[2])), p[3], p[0:12])) 97 | with open(args['input'], 'r') as i: 98 | with open(ftmp, 'w') as o: 99 | for l in i: 100 | if '#' != l[0]: 101 | b, e, val = form(l.strip().split()) 102 | o.write('\t'.join(val + ['{}|{}'.format(*profiles[b][clus[e]])]) + '\n') 103 | else: 104 | o.write('\t'.join(['#CHR', 'START', 'END', 'CELL', 'NORM_COUNT', 'COUNT', 'RDR', 'A_COUNT', 'B_COUNT', 'BAF', 'CLUSTER', 'HAP_CN', 'CORRECTED_HAP_CN']) + '\n') 105 | shutil.move(ftmp, args['input']) 106 | 107 | 108 | def reading(f): 109 | cns = defaultdict(lambda : dict()) 110 | form = (lambda p : ((p[0], int(p[1]), int(p[2])), p[3], tuple(map(int, p[11].split('|'))))) 111 | with open(f, 'r') as i: 112 | for l in i: 113 | if l[0] != '#' and len(l) > 1: 114 | b, c, cn = form(l.strip().split()) 115 | assert c not in cns[b] # and c not in stuff[b] 116 | cns[b][c] = cn 117 | cns = dict(cns) 118 | orderchrs = (lambda x : int(''.join([l for l in x if l.isdigit()]))) 119 | order = (lambda b : 
(orderchrs(b[0]), int(b[1]), int(b[2]))) 120 | pos = sorted(cns.keys(), key=order) 121 | cells = sorted(set(c for b in cns for c in cns[b])) 122 | return cns, pos, cells 123 | 124 | 125 | def clustering(cns, pos, cells, maxdiff, linkage): 126 | states = {s : x for x, s in enumerate(set(cns[b][c] for b in pos for c in cells))} 127 | data = [[states[cns[b][c]] for b in pos] for c in cells] 128 | linkage = hier.linkage(data, method=linkage, metric='hamming', optimal_ordering=True) 129 | if maxdiff != -1: 130 | clus = hier.fcluster(linkage, t=maxdiff, criterion='distance') 131 | else: 132 | clus = hier.fcluster(linkage, t=len(cells), criterion='maxclust') 133 | return {e : clus[i] for i, e in enumerate(cells)} 134 | 135 | 136 | def selecting(clus, minsize): 137 | size = {i : sum(clus[c] == i for c in clus) for i in set(clus.values())} 138 | return {c : clus[c] for c in clus if size[clus[c]] >= minsize} 139 | 140 | 141 | def refining(cns, clus, chosen, maxdiff): 142 | clones = set(chosen.values()) 143 | safeargmax = (lambda C : argmax(C) if len(C) > 0 else (1, 1)) 144 | getcn = (lambda g, i : safeargmax(Counter([cns[g][c] for c in chosen if chosen[c] == i]))) 145 | profile = {g : {i : getcn(g, i) for i in clones} for g in cns} 146 | diff = (lambda i, c, g : 1 if profile[g][i] != cns[g][c] else 0) 147 | weight = (lambda i, c : float(sum(diff(i, c, g) for g in profile)) / float(len(profile))) 148 | closest = (lambda c : min([(i, weight(i, c)) for i in clones], key=(lambda x : x[1]))) 149 | ref = {c : closest(c) for c in clus if c not in chosen.keys()} 150 | newclones = {c : chosen[c] if c in chosen else ref[c][0] for c in clus if c in chosen or ref[c][1] <= maxdiff} 151 | newclus = {c : newclones[c] if c in newclones else clus[c] for c in clus} 152 | assert False not in set(len({clus[c], chosen[c], newclus[c], newclones[c]}) == 1 for c in chosen) 153 | return newclones, newclus 154 | 155 | 156 | def profiling(cns, clus): 157 | clones = set(clus.values()) 158 | # 
safeargmax = (lambda C : argmax(C) if len(C) > 0 else (1, 1)) 159 | # getcn = (lambda g, i : safeargmax(Counter([cns[g][c] for c in clus if clus[c] == i]))) 160 | mapclo = {i : filter(lambda e : clus[e] == i, clus.keys()) for i in clones} 161 | assert all(len(mapclo[i]) > 0 for i in mapclo), 'Found cluster assignment with no corresponding cell' 162 | getcn = (lambda g, i : argmax(Counter([cns[g][e] for e in mapclo[i]]))) 163 | return {g : {i : getcn(g, i) for i in clones} for g in cns} 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /src/chisel/Clusterizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | import argparse 5 | import math 6 | import ctypes 7 | import warnings 8 | 9 | import multiprocessing as mp 10 | from multiprocessing import Lock, Value, Pool 11 | 12 | import numpy as np 13 | import scipy.spatial 14 | 15 | from Utils import * 16 | 17 | 18 | 19 | def kclustering(data, restarts, threshold, seed=None, lord=1, j=1, LB=1, UB=None): 20 | pool_size = mp.cpu_count() 21 | os.system('taskset -cp 0-%d %s > /dev/null' % (pool_size, os.getpid())) 22 | 23 | error, center, pdist, cdist = getord(lord) 24 | if len(set(len(p) for p in data)) != 1: 25 | raise ValueError('All points must have the same length') 26 | if seed is not None: 27 | np.random.seed(seed) 28 | 29 | points = np.array(data) 30 | TCENTER = center(points) 31 | TERROR = sum(error(p, TCENTER) for p in points) 32 | PAIRWISE = pdist(points) 33 | assert np.isfinite(PAIRWISE).all(), 'Pairwise distance contain NaN!\n{}'.format(PAIRWISE) 34 | 35 | objs = {} 36 | clus = {} 37 | 38 | if UB: 39 | R = min(len(data), UB) 40 | else: 41 | R = len(points) 42 | objs[R] = 0.0 43 | clus[R] = [i for i, p in enumerate(points)] 44 | 45 | if LB: 46 | L = max(0, LB) 47 | else: 48 | L = 0 49 | objs[L] = 1.0 50 | clus[L] = [0 for p in 
points] 51 | 52 | def compute(K): 53 | log('Computing for {}:'.format(K), level='INFO') 54 | if K not in objs: 55 | assert K not in clus, 'The number of clusters {} does not have an objective but a solution'.format(K) 56 | obj, clu = kclustering_fixed(points, K, restarts, TERROR, PAIRWISE, lord, j) 57 | objs[K] = obj 58 | clus[K] = clu 59 | log('Objective value for {}: {}'.format(K, objs[K]), level='INFO') 60 | 61 | MAXR = R 62 | compute(MAXR) 63 | 64 | while(R - L > 1): 65 | M = int(math.floor(float(R + L) / 2.0)) 66 | assert M not in {L, R}, 'Median point is equal to boundaries but it cannot happen' 67 | compute(M) 68 | if objs[M] - objs[MAXR] > threshold: 69 | L = M 70 | else: 71 | R = M 72 | 73 | compute(L) 74 | compute(R) 75 | if L <= threshold: 76 | return clus[L] 77 | else: 78 | return clus[R] 79 | 80 | 81 | def getord(lord): 82 | ## K-means minimizes SQUARE l2-norms while K-medians minimizes L1-norm 83 | if lord == 1: ## K-medians 84 | error = (lambda a, b : np.linalg.norm(a - b, ord=1)) 85 | center = (lambda X : np.median(X, axis=0)) 86 | pdist = (lambda X : scipy.spatial.distance.pdist(X, metric='cityblock')) 87 | cdist = (lambda X, Y : scipy.spatial.distance.cdist(X, Y, metric='cityblock')) 88 | elif lord == 2: ## K-means 89 | error = (lambda a, b : np.linalg.norm(a - b, ord=2)**2) 90 | center = (lambda X : np.mean(X, axis=0)) 91 | pdist = (lambda X : scipy.spatial.distance.pdist(X, metric='sqeuclidean')) 92 | cdist = (lambda X, Y : scipy.spatial.distance.cdist(X, Y, metric='sqeuclidean')) 93 | else: 94 | raise ValueError('Order of l-norm distance must be either 1 or 2!') 95 | return error, center, pdist, cdist 96 | 97 | 98 | def kclustering_fixed(points, K, restarts, TERROR, PAIRWISE, lord=1, j=1): 99 | with warnings.catch_warnings() as w: 100 | warnings.simplefilter("ignore") 101 | shared_points, shared_points_base = share_matrix(points) 102 | shared_pairwise, shared_pairwise_base = share_array(PAIRWISE) 103 | shared_clus, shared_clus_base = 
newshare_matrix(restarts, len(points)) 104 | 105 | jobs = ((np.random.randint(low=0, high=2**10), x) for x, i in enumerate(range(restarts))) 106 | bar = ProgressBar(total=restarts, length=40, verbose=False) 107 | 108 | initargs = (points.shape[0], points.shape[1], K, lord, TERROR, shared_points, shared_pairwise, shared_clus) 109 | pool = Pool(processes=min(j, restarts), initializer=init_kclustering, initargs=initargs) 110 | progress = (lambda obj, it : bar.progress(advance=True, msg="Obj: {} [Iterations: {}]".format(obj, it))) 111 | best = min(((obj, idx) for obj, idx, it in pool.imap_unordered(run_kclustering, jobs) if progress(obj, it)), key=(lambda x : (x[0], x[1]))) 112 | pool.close() 113 | pool.join() 114 | return best[0], shared_clus[best[1]] 115 | 116 | 117 | def share_array(npdata): 118 | N = npdata.shape[0] 119 | shared_array_base = mp.Array(ctypes.c_double, N) 120 | shared_array = np.ctypeslib.as_array(shared_array_base.get_obj()) 121 | shared_array[:] = npdata 122 | return shared_array, shared_array_base 123 | 124 | 125 | def share_matrix(npdata): 126 | N, M = npdata.shape 127 | shared_matrix_base = mp.Array(ctypes.c_double, N * M) 128 | shared_matrix = np.ctypeslib.as_array(shared_matrix_base.get_obj()) 129 | shared_matrix = shared_matrix.reshape(N, M) 130 | shared_matrix[:] = npdata 131 | return shared_matrix, shared_matrix_base 132 | 133 | 134 | def newshare_matrix(N, M): 135 | shared_matrix_base = mp.Array(ctypes.c_double, N * M) 136 | shared_matrix = np.ctypeslib.as_array(shared_matrix_base.get_obj()) 137 | shared_matrix = shared_matrix.reshape(N, M) 138 | return shared_matrix, shared_matrix_base 139 | 140 | 141 | def init_kclustering(_N, _M, _K, _lord, _TERROR, _points, _pairwise, _clus): 142 | global N, M, K, error, center, pdist, cdist, TERROR, POINTS, PAIRWISE, CLUS 143 | N = _N 144 | M = _M 145 | K = _K 146 | error, center, pdist, cdist = getord(_lord) 147 | TERROR = _TERROR 148 | POINTS = _points 149 | PAIRWISE = _pairwise 150 | CLUS = _clus 
151 | 152 | 153 | def run_kclustering(job): 154 | seed, idx = job 155 | 156 | ## Utils 157 | np.random.seed(seed) 158 | randint = np.random.randint 159 | lookup = (lambda i, j : PAIRWISE[indices_to_condensed(i, j, N)]) 160 | 161 | ## Initialization 162 | centroids = [] 163 | for i in range(K): 164 | if len(centroids) == 0: 165 | chosen = randint(N) 166 | probs = [lookup(x, chosen)**2 if x != chosen else 0.0 for x in xrange(N)] 167 | else: 168 | chosen = weighted_ichoice(probs) 169 | probs = [min(probs[x], lookup(x, chosen)**2) if x != chosen else 0.0 for x in xrange(N)] 170 | centroids.append(chosen) 171 | assert len(centroids) == K, 'Found less centroids {} than expected {}'.format(len(centroids), K) 172 | centroids = np.stack([POINTS[i] for i in centroids]) 173 | 174 | ## Iterative process 175 | it = 0 176 | pre = None 177 | while pre is None or np.any(np.abs(centroids - pre) > 0.001): 178 | it += 1 179 | pre = centroids 180 | 181 | ## Assignment 182 | between = cdist(POINTS, centroids) 183 | clu = [min((i for i in range(K)), key=(lambda i : between[x, i])) for x in xrange(N)] 184 | used = set(clu) 185 | 186 | ## Update centroids 187 | centerize = (lambda i : center(np.stack([p for x, p in enumerate(POINTS) if clu[x] == i]))) 188 | centroids = np.stack([centerize(i) if i in used else np.zeros(M) for i in range(K)]) 189 | 190 | CLUS[idx] = clu 191 | 192 | return sum(between[x, clu[x]] for x, p in enumerate(POINTS)) / TERROR, idx, it 193 | 194 | 195 | def weighted_ichoice(weights): 196 | w = np.array(weights) 197 | wsum = np.sum(w, dtype=float) 198 | if wsum > 0: 199 | assert np.isfinite(wsum).all(), 'wsum distance contain NaN!\n{}'.format(wsum) 200 | assert np.isfinite(w).all(), 'w distance contain NaN!\n{}'.format(w) 201 | cs = np.cumsum(w) / wsum 202 | r = np.random.rand() 203 | return np.searchsorted(cs, r) 204 | else: 205 | return np.random.choice(np.arange(len(weights)), size=1)[0] 206 | 207 | 
-------------------------------------------------------------------------------- /src/chisel/Mutator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | import shlex 5 | import argparse 6 | import subprocess as sp 7 | 8 | from multiprocessing import Lock, Value, Pool 9 | from collections import defaultdict 10 | 11 | from Utils import * 12 | 13 | 14 | def parse_args(): 15 | description = "Cell-specific allele counting for a given list of point mutations." #which must be provided as a stdin stream (symbol '-' must be used in this case) or as the name of a file." 16 | parser = argparse.ArgumentParser(description=description) 17 | parser.add_argument("-l","--listmutations", type=str, required=True, help="List of TSV phased genomic positions (TAB-seprated format '#CHR POS REF VAR')") 18 | parser.add_argument("-t","--tumor", required=True, type=str, help="BAM file for matched normal sample") 19 | parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome") 20 | parser.add_argument("-s","--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)") 21 | parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)") 22 | parser.add_argument("-c","--listcells", type=str, required=False, default=None, help="File where first column contains all the cells to consider (default: not used)") 23 | args = parser.parse_args() 24 | 25 | if not os.path.isfile(args.tumor): 26 | raise ValueError("Specified tumor does not exist!") 27 | if not os.path.isfile(args.reference): 28 | raise ValueError("Reference genome does not exist!") 29 | if args.listcells is not None and not os.path.isfile(args.listcells): 30 | raise ValueError("Specified 
list of cells does not exist!") 31 | 32 | samtools = args.samtools 33 | if not samtools: 34 | samtools = "samtools" 35 | if which(samtools) is None: 36 | raise ValueError("samtools has not been found or is not executable!") 37 | 38 | if not args.jobs: 39 | args.jobs = mp.cpu_count() 40 | if args.jobs < 1: 41 | raise ValueError("The number of jobs must be positive!") 42 | 43 | return { 44 | 'tumor' : args.tumor, 45 | 'mutations' : args.listmutations, 46 | 'ref' : args.reference, 47 | 'samtools' : samtools, 48 | 'J' : args.jobs, 49 | 'list' : args.listcells 50 | } 51 | 52 | 53 | def main(): 54 | log('Parsing and checking arguments') 55 | args = parse_args() 56 | log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]), level='INFO') 57 | 58 | log('Extracting genomic positions of given mutations') 59 | mutations = read_mutations(args['mutations']) 60 | log('Chromosomes analyzed: {}'.format(','.join(sorted(mutations, key=orderchrs))), level='INFO') 61 | log('Total number of given mutations: {}'.format(sum(len(mutations[c]) for c in mutations)), level='INFO') 62 | 63 | log('Extracting allele counts of mutations for all cells') 64 | amut = extracting(args, mutations) 65 | 66 | if args['list']: 67 | log('Reading cell list') 68 | with open(args['list'], 'r') as i: 69 | cells = set(l.strip().split()[0].replace('-1', '') for l in i if len(l) > 1 and l[0] != '#') 70 | 71 | log('Writing A/B counts for selected phased SNPs across selected cells') 72 | print '\t'.join(['#CHR', 'POS', 'CELL', 'MUT', 'MUTCOV', 'COV']) 73 | for c, o, e in ((c, o, e) for c in sorted(amut, key=orderchrs) for o in sorted(amut[c]) for e in sorted(amut[c][o])): 74 | print '\t'.join(map(str, [c, o, e, mutations[c][o], amut[c][o][e][mutations[c][o]], sum(amut[c][o][e].values())])) 75 | 76 | log('KTHXBYE') 77 | 78 | 79 | def read_mutations(f): 80 | mutations = defaultdict(lambda : dict()) 81 | chrs = map(str, range(1, 23)) 82 | with open(f, 'r') as i: 83 | for l in i: 84 | if len(l) > 
1 and l[0] != '#': 85 | p = l.strip().split() 86 | c = p[0] 87 | if ''.join([l for l in c if l.isdigit()]) not in chrs: 88 | continue 89 | try: 90 | o = int(p[1]) 91 | v = p[3] 92 | assert o not in mutations[c] 93 | if v[0] in {'A', 'C', 'G', 'T'} or v[0] in {'+', '-'}: 94 | mutations[c][o] = 'N' if v[0] == '-' else (v[1] if v[0] == '+' else v[0]) 95 | assert mutations[c][o] in {'A', 'C', 'G', 'T', 'N'} 96 | except ValueError: 97 | pass 98 | return mutations 99 | 100 | 101 | def extracting(args, mutations): 102 | jobs = ((c, o) for c in mutations for o in mutations[c]) 103 | njobs = sum(len(mutations[c]) for c in mutations) 104 | countawk = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'count.awk') 105 | bar = ProgressBar(total=njobs, length=40, verbose=False) 106 | 107 | initargs = (args['tumor'], args['samtools'], countawk) 108 | pool = Pool(processes=min(args['J'], njobs), initializer=init_extracting, initargs=initargs) 109 | 110 | ACGT = (lambda : {'A' : 0, 'C' : 0, 'G' : 0, 'T' : 0, 'N' : 0}) 111 | amut = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : ACGT()))) 112 | amut = {c : {o : defaultdict(lambda : ACGT()) for o in mutations[c]} for c in mutations} 113 | for c, o, l in pool.imap_unordered(counting_cell, jobs): 114 | if l != '': 115 | for a in l.strip().split('\n'): 116 | e, al, count = tuple(a.split()) 117 | amut[c][o][e][al] += int(count) 118 | bar.progress(advance=True, msg="Extracted SNP {}:{}".format(c, o)) 119 | 120 | return {c : {o : dict(filter(lambda (e, al) : sum(al.values()) > 0, amut[c][o].items())) for o in amut[c]} for c in amut} 121 | 122 | 123 | def init_extracting(_tumor, _sam, countawk): 124 | global cmd_sam, cmd_awk 125 | cmd_sam = "{} view -F 1796 -q 13 {} {}:{}-{}".format(_sam, _tumor, '{}', '{}', '{}') 126 | cmd_awk = 'awk -v TAG="{}" -f {}'.format('{}', countawk) 127 | 128 | 129 | def counting_cell(job): 130 | sam = sp.Popen(shlex.split(cmd_sam.format(job[0], job[1], job[1])), stdout=sp.PIPE, 
stderr=sp.PIPE) 131 | stdout, stderr = sp.Popen(shlex.split(cmd_awk.format(job[1])), stdin=sam.stdout, stdout=sp.PIPE, stderr=sp.PIPE).communicate() 132 | return (job[0], job[1], stdout) 133 | 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /src/chisel/RDREstimator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | import shlex 5 | import argparse 6 | import subprocess as sp 7 | 8 | from multiprocessing import Lock, Value, Pool 9 | from collections import Counter 10 | 11 | from Utils import * 12 | 13 | 14 | def parse_args(args): 15 | description = "Compute RDR from barcoded single-cell sequencing data." 16 | parser = argparse.ArgumentParser(description=description) 17 | parser.add_argument("-t","--tumor", required=True, type=str, help="Barcoded BAM file") 18 | parser.add_argument("-n","--normal", required=True, type=str, help="BAM file for matched normal sample") 19 | parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"") 20 | parser.add_argument("-s","--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)") 21 | parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)") 22 | parser.add_argument("-r","--reference", type=str, required=False, default="hg19", help="Name of the corresponding reference genome among \{hg18, hg19, hg38\} (default: hg19)") 23 | parser.add_argument("-m","--minreads", type=int, required=False, default=100000, help="Minimum number total reads to select cells (default: None)") 24 | parser.add_argument("-l","--cellslist", type=str, required=False, 
default=None, help="List of cells to select (default: None)") 25 | parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")") 26 | parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)") 27 | parser.add_argument("--cellsuffix", type=str, required=False, default='', help="Suffix of cell barcode field in SAM format (default: none)") 28 | parser.add_argument("--outdir", required=False, default='./', type=str, help="Running directory where to write the list of selected cells (default: current directory)") 29 | args = parser.parse_args(args) 30 | 31 | if not os.path.isfile(args.tumor): 32 | raise ValueError("Specified tumor does not exist!") 33 | if not os.path.isfile(args.normal): 34 | raise ValueError("Specified normal does not exist!") 35 | if not os.path.isfile(args.reference): 36 | raise ValueError("Reference genome not found!") 37 | if not os.path.isfile(os.path.splitext(args.reference)[0] + '.dict'): 38 | raise ValueError("The dictionary .dict of the reference genome not found! 
def main(args=None, stdout_file=None):
    """Entry point of the RDR-estimation step.

    Counts sequencing reads per genomic bin for the matched normal and for
    every barcoded tumor cell, selects cells (by a read threshold or an
    explicit list), writes per-cell read totals to total.tsv, and emits one
    RDR record per (chromosome, bin, cell) to stdout or to `stdout_file`.
    """
    log('Parsing and checking arguments')
    args = parse_args(args)
    log('\n'.join(['Arguments:'] + ['{} : {}'.format(a, args[a]) for a in args]), level='INFO')

    log('Computing bins')
    bins = get_bins(args['ref'], args['chrs'], args['bins'], bams=[args['tumor'], args['normal']], samtools=args['samtools'])

    log('Counting reads on normal')
    counts = counting_normal(args['normal'], bins, args['samtools'], args['J'])

    log('Counting reads on barcoded cells')
    counts = counting_cells(counts, args['tumor'], bins, args['samtools'], args['J'], args['prefix'], args['suffix'])

    log('Evaluating set of found cells')
    if args['list'] is None:
        names = set(e for c in counts for b in counts[c] for e in counts[c][b])
    else:
        clist = set()
        with open(args['list'], 'r') as i:
            for l in i:
                # Cell lists may be comma- or tab-separated; the first field is
                # the barcode, with any '-1' (10x-style suffix) stripped.
                clist.add(l.strip().replace(',', '\t').split()[0].replace('-1', ''))
        names = set(e for c in counts for b in counts[c] for e in counts[c][b] if e in clist)
    # BUG FIX: `cells` was previously assigned only in the no-list branch, so
    # running with --cellslist and without --minreads raised a NameError below.
    cells = names - {'normal'}

    log('Computing total numbers of sequenced reads')
    total = reduce(inupdate, (Counter(counts[c][b]) for c in counts for b in counts[c]))

    log('Selecting cells')
    if args['minreads']:
        names = set(e for e in total if total[e] >= args['minreads'])
        cells = names - {'normal'}
    log('Number of selected cells: {}'.format(len(cells)), level='INFO')

    ftot = os.path.join(args['outdir'], 'total.tsv')
    log('Writing the totals in {}'.format(ftot), level='INFO')
    with open(ftot, 'w') as o:
        o.write('{}\t{}\n'.format('normal', total['normal']))
        o.write('\n'.join(['{}\t{}'.format(e, total[e]) for e in cells]))

    log('Estimating RDR')
    # Per-cell scaling factor: total normal reads over total reads of the cell,
    # so RDRs are comparable across cells with different sequencing depth.
    scale = {e : float(total['normal']) / float(total[e]) for e in cells}
    ratio = (lambda c, b, e : (float(counts[c][b][e]) / float(counts[c][b]['normal'])) if counts[c][b]['normal'] > 0 else 0.0)

    if stdout_file is not None:
        stdout_f = open(stdout_file, 'w')

    try:
        for c in sorted(counts, key=orderchrs):
            for b in sorted(counts[c], key=(lambda x : x[0])):
                for e in sorted(set(counts[c][b].keys()) & cells):
                    line = '\t'.join(map(str, [c, b[0], b[1], e, counts[c][b]['normal'], counts[c][b][e], ratio(c, b, e) * scale[e]]))
                    if stdout_file is not None:
                        stdout_f.write(line + '\n')
                    else:
                        sys.stdout.write(line + '\n')
    finally:
        # Close the output file even if a record fails to serialize.
        if stdout_file is not None:
            stdout_f.close()

    log('KTHXBYE')
def init_extracting(_tumor, sam, prefix, suffix):
    """Pool initializer: publish the samtools and awk command templates as
    worker globals so each job only fills in its genomic region.
    """
    global cmd_sam, cmd_awk
    # -F 1796 drops unmapped/secondary/duplicate/QC-fail reads; -q 13 drops
    # low mapping quality.
    cmd_sam = "{} view -F 1796 -q 13 {} {}:{}-{}".format(sam, _tumor, "{}", "{}", "{}")
    # Tally reads per cell barcode found between `prefix` and `suffix`.
    # NOTE(review): substr offset RSTART+5 assumes len(prefix) == 5 (the
    # default 'CB:Z:') — confirm before allowing custom prefixes here.
    cmd_awk = shlex.split("awk 'BEGIN{{}} {{ if(match($0, /{}[ACGT]+{}/)) {{ X[substr($0, RSTART+5, RLENGTH-5)]++ }} }} END{{ for(i in X) print i, X[i] }}'".format(prefix, suffix))


def extracting(job):
    """Count reads per cell barcode within one genomic bin.

    job is a (chromosome, (start, end)) pair; returns (chromosome, bin,
    awk output) where the output has one "barcode count" line per cell.
    """
    c, b = job
    cmd = cmd_sam.format(c, b[0], b[1])
    sam = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE)
    awk = sp.Popen(cmd_awk, stdin=sam.stdout, stdout=sp.PIPE, stderr=sp.PIPE)
    # BUG FIX: close our copy of the pipe so samtools receives SIGPIPE and
    # terminates if awk exits early, instead of blocking on a full pipe.
    sam.stdout.close()
    stdout, stderr = awk.communicate()
    return (c, b, stdout.strip())
def checkchrs(digit_x, x):
    """Map a chromosome label to a sortable rank.

    `digit_x` is the digit-only part of the label `x`: it is returned as-is
    when non-empty (autosomes), otherwise X/Y/M map to 23/24/25 and any other
    label maps to 26 so it sorts last.
    """
    if len(digit_x) > 0:
        return digit_x
    if 'X' in x:
        return 23
    if 'Y' in x:
        return 24
    if 'M' in x:
        return 25
    return 26


def orderchrs(x):
    """Sort key for chromosome names: 'chr1'..'chr22' -> 1..22, X/Y/M -> 23/24/25."""
    return int(checkchrs(''.join([l for l in x if l.isdigit()]), x))


def inupdate(a, b):
    """Update mapping `a` with `b` in place and return `a` (handy with reduce)."""
    a.update(b)
    return a
def which(program):
    """Locate an executable like the Unix `which` command.

    If `program` contains a path component, it is returned when it points to
    an executable file; otherwise each entry of $PATH is searched. Returns
    the resolved path, or None when no executable is found.
    """
    # NOTE: the redundant function-local `import os` was removed; `os` is
    # already imported at module level.
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        # Caller supplied an explicit (absolute or relative) path.
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None
+ str(self.decimals) + "f}").format(100 * (self.counter / float(self.total))) 106 | filledLength = int(self.length * self.counter // self.total) 107 | bar = self.fill * filledLength + '-' * (self.length - filledLength) 108 | rewind = '\x1b[2K\r' 109 | result = '%s |%s| %s%% %s' % (self.prefix, bar, percent, self.suffix) 110 | msg = '[{:%Y-%b-%d %H:%M:%S}]'.format(datetime.datetime.now()) + msg 111 | if not self.verbose: 112 | toprint = rewind + result + " [%s]" % (msg) 113 | else: 114 | toprint = rewind + msg + "\n" + result 115 | write(toprint.encode('utf-8')) 116 | flush() 117 | if self.counter == self.total: 118 | write("\n") 119 | flush() 120 | 121 | def progress_locked(self, advance, msg): 122 | flush = sys.stderr.flush 123 | write = sys.stderr.write 124 | if advance: 125 | with self.counter.get_lock(): 126 | self.counter.value += 1 127 | percent = ("{0:." + str(self.decimals) + "f}").format(100 * (self.counter.value / float(self.total))) 128 | filledLength = int(self.length * self.counter.value // self.total) 129 | bar = self.fill * filledLength + '-' * (self.length - filledLength) 130 | rewind = '\x1b[2K\r' 131 | result = '%s |%s| %s%% %s' % (self.prefix, bar, percent, self.suffix) 132 | msg = '[{:%Y-%b-%d %H:%M:%S}]'.format(datetime.datetime.now()) + msg 133 | if not self.verbose: 134 | toprint = rewind + result + " [%s]" % (msg) 135 | else: 136 | toprint = rewind + msg + "\n" + result 137 | with self.lock: 138 | write(toprint.encode('utf-8')) 139 | flush() 140 | if self.counter.value == self.total: 141 | write("\n") 142 | flush() 143 | 144 | 145 | def log(msg, level='STEP', lock=None): 146 | timestamp = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now()) 147 | if level == "STEP": 148 | if lock is None: 149 | sys.stderr.write("{}{}[{}]{}{}\n".format(bcolors.BOLD, bcolors.HEADER, timestamp, msg, bcolors.ENDC)) 150 | else: 151 | with lock: sys.stderr.write("{}{}[{}]{}{}\n".format(bcolors.BOLD, bcolors.HEADER, timestamp, msg, bcolors.ENDC)) 152 | elif level 
def runcmd(cmd, xdir, out=None, log="log"):
    """Run `cmd` inside `xdir`, teeing its stderr to the console and a log.

    stdout goes to `out` (a file inside xdir) or is discarded; stderr is
    streamed both to sys.stderr and to a temporary file, which is then
    rewritten as `log` with progress-bar lines and ANSI color codes removed.
    (The parameter `log` shadows the module-level log() function; it is kept
    for interface compatibility.)
    """
    j = os.path.join
    tmp = log + '_TMP'
    sout = open(j(xdir, out) if out is not None else os.devnull, 'w')
    try:
        with open(j(xdir, tmp), 'w') as serr:
            proc = sp.Popen(shlex.split(cmd), stdout=sout, stderr=sp.PIPE)
            # NOTE(review): read(1) with sentinel '' assumes py2 text pipes;
            # on py3 the pipe yields bytes and the sentinel would never match.
            for line in iter(lambda : proc.stderr.read(1), ''):
                sys.stderr.write(line)
                serr.write(line)
            # Reap the child so it does not linger as a zombie.
            proc.wait()
        sout.flush()
    finally:
        # BUG FIX: previously sout leaked if anything above raised.
        sout.close()

    # Rewrite the raw stderr capture without progress lines or color escapes.
    with open(j(xdir, tmp), 'r') as i:
        with open(j(xdir, log), 'w') as o:
            for l in i:
                if 'Progress' not in l:
                    o.write(re.sub(r'\033\[[0-9]*m', '', l))
    os.remove(j(xdir, tmp))


def error(msg):
    """Log `msg` at ERROR level and abort the process."""
    log(msg=msg, level="ERROR")
    # BUG FIX: was sys.exit(0), which reported success to the calling shell;
    # a fatal error must produce a non-zero exit status.
    sys.exit(1)
'\033[94m' 207 | BBLUE = '\033[96m' 208 | OKGREEN = '\033[92m' 209 | WARNING = '\033[93m' 210 | FAIL = '\033[91m' 211 | ENDC = '\033[0m' 212 | BOLD = '\033[1m' 213 | UNDERLINE = '\033[4m' 214 | -------------------------------------------------------------------------------- /src/chisel/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.2' 2 | -------------------------------------------------------------------------------- /src/chisel/bin.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk 2 | 3 | 4 | BEGIN{} 5 | { 6 | if ( match($0, /CB:Z:[ACGT]+/) ) 7 | { 8 | X[substr($0, RSTART+5, RLENGTH-5)]++ 9 | } 10 | } 11 | END{ for(i in X) print i, X[i] } 12 | -------------------------------------------------------------------------------- /src/chisel/bin/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1' 2 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_bedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import sys, os 4 | import argparse 5 | 6 | from multiprocessing import Lock, Value, Pool 7 | from collections import Counter 8 | 9 | import chisel 10 | 11 | src = os.path.dirname(chisel.__file__) 12 | from ..Utils import * 13 | from chisel import Plotter 14 | 15 | 16 | def parse_args(): 17 | description = "CHISEL command to generate a BED file for each cell with the corresponding CHISEL's results." 
def make_beds(bins, pos, cells, iscorr, args):
    """Write one BED file per cell in parallel.

    Extracts the copy-number state of every bin for every cell (corrected
    consensus when available and not disabled via --rawcalls) and hands each
    cell to a worker pool that writes `<cell>.bed` in the running directory.
    """
    bar = ProgressBar(total=len(cells), length=40, verbose=False)
    # Hoisted loop invariant: which copy-number field to report.
    field = 'CORR-CNS' if iscorr and not args['rawcalls'] else 'CNS'
    chk = (lambda g, e : bins[g][e][field] if e in bins[g] else None)
    cns = {g : {e : chk(g, e) for e in bins[g]} for g in bins}
    initargs = (cns, pos, sum(g[2] - g[1] for g in pos), args['rundir'], args['noextending'])
    pool = Pool(processes=min(args['j'], len(cells)), initializer=init_making_bed, initargs=initargs)
    try:
        # BUG FIX: progress reporting was driven through map() purely for its
        # side effect; an explicit loop is clearer and stays correct under
        # py3's lazy map (which would silently skip the progress updates).
        for e in pool.imap_unordered(making_bed, cells):
            bar.progress(advance=True, msg="Wrote cell {}".format(e))
        pool.close()
        pool.join()
    except Exception:
        # Do not leave worker processes behind on failure.
        pool.terminate()
        raise


def init_making_bed(_cns, _pos, _totcov, _rundir, _noextending):
    """Pool initializer: publish shared read-only state as worker globals."""
    global cns, pos, totcov, rundir, noextending
    cns = _cns
    pos = _pos
    totcov = _totcov
    rundir = _rundir
    noextending = _noextending
- start[1] 104 | start = g 105 | precn = ecns[g] 106 | end = g 107 | out(start, end) 108 | tot += end[2] - start[1] 109 | 110 | assert end == pos[-1], 'Error for the last bin' 111 | assert (not noextending or tot == totcov) and (noextending or tot >= totcov), 'Error in total length: {} written vs. {} expected'.format(tot, totcov) 112 | return e 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_calling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os 4 | import argparse 5 | import subprocess as sp 6 | import multiprocessing as mp 7 | import shlex 8 | import datetime 9 | import re 10 | 11 | import chisel 12 | 13 | src = os.path.dirname(chisel.__file__) 14 | from ..Utils import * 15 | 16 | 17 | def parse_args(): 18 | description = "CHISEL command to re-run the inference of allele- and haplotype-specific copy numbers, cell clustering, and plotting. This steps starts from estimated RDRs and BAFs." 19 | parser = argparse.ArgumentParser(description=description) 20 | parser.add_argument("INPUT", nargs='?', default='combo/combo.tsv', type=str, help="Input file with combined RDR and BAF per bin and per cell (default: combo/combo.tsv)") 21 | parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)") 22 | parser.add_argument("-A","--sensitivity", required=False, type=float, default=1.0, help="Sensitivity of model selection for ploidy (default: 1, increase this parameter to lower sensitivity to noisy data, adjust this value (e.g. 2, 4, ..., 10, ...) to better deal with high-variance data (e.g. 
low coverage, small number of cells, low number of phased SNPs, etc...)") 23 | parser.add_argument("-P","--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)") 24 | parser.add_argument("-K","--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)") 25 | parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)") 26 | parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)") 27 | args = parser.parse_args() 28 | 29 | if not os.path.isfile(args.INPUT): 30 | raise ValueError("Input file does not exist: {}".format(args.INPUT)) 31 | if not os.path.isdir(args.rundir): 32 | raise ValueError("Running directory does not exists: {}".format(args.rundir)) 33 | if args.seed and args.seed < 1: 34 | raise ValueError("The random seed must be positive!") 35 | if args.maxploidy < 3: 36 | raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 2!") 37 | if args.upperk < 1: 38 | raise ValueError("The maximum number of clusters must be positive!") 39 | if not args.jobs: 40 | args.jobs = mp.cpu_count() 41 | if args.jobs < 1: 42 | raise ValueError("The number of jobs must be positive!") 43 | 44 | return { 45 | "INPUT" : args.INPUT, 46 | "rundir" : args.rundir, 47 | "sensitivity" : args.sensitivity, 48 | "maxploidy" : args.maxploidy, 49 | "upperk" : args.upperk, 50 | "seed" : args.seed, 51 | "jobs" : args.jobs 52 | } 53 | 54 | 55 | def main(): 56 | log('Parsing and checking arguments', level='PROGRESS') 57 | args = parse_args() 58 | log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO') 59 | 60 | log('Setting directories', level='PROGRESS') 61 | dcal, 
dclo, dplo = setup(args, force=False) 62 | def get_comp(name): 63 | comp = os.path.join(src, name) 64 | if not os.path.isfile(comp): 65 | raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src)) 66 | return comp 67 | 68 | log('Calling', level='PROGRESS') 69 | cmd = 'python2.7 {} {} -A {} -P {} -K {} -j {}' 70 | cmd = cmd.format(get_comp('Caller.py'), args['INPUT'], args['sensitivity'], args['maxploidy'], args['upperk'], args['jobs']) 71 | if args['seed'] is not None: 72 | cmd += " --seed {}".format(args['seed']) 73 | runcmd(cmd, dcal, out='calls.tsv') 74 | calls = os.path.join(dcal, 'calls.tsv') 75 | 76 | log('Cloning', level='PROGRESS') 77 | cmd = 'python2.7 {} {}' 78 | cmd = cmd.format(get_comp('Cloner.py'), calls) 79 | if args['seed'] is not None: 80 | cmd += " --seed {}".format(args['seed']) 81 | runcmd(cmd, dclo, out='mapping.tsv') 82 | mapping = os.path.join(dclo, 'mapping.tsv') 83 | 84 | log('Plotting', level='PROGRESS') 85 | os.chdir(dplo) 86 | up = (lambda f : os.path.join(os.pardir, f)) 87 | cmd = 'python2.7 {} {} -m {}' 88 | cmd = cmd.format(os.path.join(src, 'Plotter.py'), up(calls), up(mapping)) 89 | runcmd(cmd, './') 90 | os.chdir(os.pardir) 91 | 92 | 93 | def setup(args, force=True): 94 | dcal = os.path.join(args['rundir'], 'calls') 95 | if os.path.isdir(dcal): 96 | log("The calls sub-directory in the running directory already exists, results will be overwritten!", level='WARN') 97 | else: 98 | os.mkdir(dcal) 99 | 100 | dclo = os.path.join(args['rundir'], 'clones') 101 | if os.path.isdir(dclo): 102 | log("The clones sub-directory in the running directory already exists, results will be overwritten!", level='WARN') 103 | else: 104 | os.mkdir(dclo) 105 | 106 | dplo = os.path.join(args['rundir'], 'plots') 107 | if os.path.isdir(dplo): 108 | log("The plots sub-directory in the running directory already exists, results will be overwritten!\n", level='WARN') 109 | else: 110 | os.mkdir(dplo) 111 | 112 | 
return dcal, dclo, dplo 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_cloning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os 4 | import argparse 5 | import subprocess as sp 6 | import multiprocessing as mp 7 | import shlex 8 | import datetime 9 | import re 10 | 11 | import chisel 12 | 13 | src = os.path.dirname(chisel.__file__) 14 | from ..Utils import * 15 | 16 | 17 | def parse_args(): 18 | description = "CHISEL command to run the pipeline starting from inferred copy numbers." 19 | parser = argparse.ArgumentParser(description=description) 20 | parser.add_argument("INPUT", nargs='?', default='calls/calls.tsv', type=str, help="Input file with combined RDR and BAF per bin and per cell") 21 | parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)") 22 | parser.add_argument("-f", "--maxdiff", required=False, type=float, default=0.06, help="Maximum haplotype-specific distance between the genome of cells in the same clone (default: 0.06, when -1 is chosen the maximum cluster method of SciPy is used)") 23 | parser.add_argument("-s", "--minsize", required=False, type=int, default=14, help="Minimum number of cells in a subpopulation to define a clone (default: 14)") 24 | parser.add_argument("-r", "--refinement", required=False, type=float, default=None, help="Maximum difference to assign noisy cells to the closest clone (default: 0.0, note that 1.0 can be used to force the assigment of all cells)") 25 | parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)") 26 | args = parser.parse_args() 27 | 28 | if not os.path.isfile(args.INPUT): 29 | raise ValueError("Input file does not exist: {}".format(args.INPUT)) 30 | if not 
os.path.isdir(args.rundir): 31 | raise ValueError("Running directory does not exists: {}".format(args.rundir)) 32 | if args.seed and args.seed < 1: 33 | raise ValueError("The random seed must be positive!") 34 | if (args.maxdiff < 0.0 and args.maxdiff != 1.0) or args.maxdiff > 1.0: 35 | raise ValueError("Maximum distance must be in [0, 1] or equal to -1!") 36 | if args.minsize < 0: 37 | raise ValueError("Minimum number of cells in a clone must be positive!") 38 | 39 | return { 40 | "INPUT" : args.INPUT, 41 | "rundir" : args.rundir, 42 | "maxdiff" : args.maxdiff, 43 | "minsize" : args.minsize, 44 | "refinement" : args.refinement, 45 | "seed" : args.seed 46 | } 47 | 48 | 49 | def main(): 50 | log('Parsing and checking arguments', level='PROGRESS') 51 | args = parse_args() 52 | log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO') 53 | 54 | log('Setting directories', level='PROGRESS') 55 | dclo, dplo = setup(args, force=False) 56 | def get_comp(name): 57 | comp = os.path.join(src, name) 58 | if not os.path.isfile(comp): 59 | raise ValueError("{} not found in src directory of bin i.e. 
{}, is anything been moved?".format(name, src)) 60 | return comp 61 | 62 | log('Cloning', level='PROGRESS') 63 | cmd = 'python2.7 {} {} -f {} -s {}' 64 | cmd = cmd.format(get_comp('Cloner.py'), args['INPUT'], args['maxdiff'], args['minsize']) 65 | if args['refinement'] is not None: 66 | cmd += " -r {}".format(args['refinement']) 67 | if args['seed'] is not None: 68 | cmd += " --seed {}".format(args['seed']) 69 | runcmd(cmd, dclo, out='mapping.tsv') 70 | mapping = os.path.join(dclo, 'mapping.tsv') 71 | 72 | log('Plotting', level='PROGRESS') 73 | os.chdir(dplo) 74 | up = (lambda f : os.path.join(os.pardir, f)) 75 | cmd = 'python2.7 {} {} -m {}' 76 | cmd = cmd.format(os.path.join(src, 'Plotter.py'), up(args['INPUT']), up(mapping)) 77 | runcmd(cmd, './') 78 | os.chdir(os.pardir) 79 | 80 | 81 | def setup(args, force=True): 82 | dclo = os.path.join(args['rundir'], 'clones') 83 | if os.path.isdir(dclo): 84 | log("The clones sub-directory in the running directory already exists, results will be overwritten!", level='WARN') 85 | else: 86 | os.mkdir(dclo) 87 | 88 | dplo = os.path.join(args['rundir'], 'plots') 89 | if os.path.isdir(dplo): 90 | log("The plots sub-directory in the running directory already exists, results will be overwritten!", level='WARN') 91 | else: 92 | os.mkdir(dplo) 93 | 94 | return dclo, dplo 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_combocall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | os.environ["OMP_NUM_THREADS"] = "1" 5 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 6 | os.environ["MKL_NUM_THREADS"] = "1" 7 | os.environ["VECLIB_MAXIMUM_THREADS"] = "1" 8 | os.environ["NUMEXPR_NUM_THREADS"] = "1" 9 | import argparse 10 | from subprocess import Popen 11 | import chisel 12 | 13 | src = os.path.dirname(chisel.__file__) 14 | from ..Utils import * 
def parse_args():
    """Parse and validate command-line arguments for the combo-call pipeline.

    Returns:
        dict: validated options keyed by canonical option names.

    Raises:
        ValueError: when an input directory, a required file, or a numeric
            option fails validation.
    """
    description = "CHISEL command to run the complete pipeline starting from RDRs and BAFs for one or multiple samples from previously executions of CHISEL or CHISEl preprocess."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("INPUT", type=str, nargs='+', help="One or multiple CHISEL directory runs for different samples from which to combine RDRs and BAFs")
    parser.add_argument("-r", "--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("--names", required=False, default=None, type=str, nargs='+', help="Sample names when combining multiple samples (default: idx used)")
    parser.add_argument("-x", "--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-p", "--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)")
    parser.add_argument("-K", "--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)")
    parser.add_argument("--addgccorr", required=False, default=False, action='store_true', help="Add additional custome correction for GC bias (default: disabled)")
    parser.add_argument("--nophasecorr", required=False, default=False, action='store_true', help="Disable correction for given phasing bias (default: enabled)")
    parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
    parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
    parser.add_argument("-j", "--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)")
    args = parser.parse_args()

    # Every input run must provide the previously computed RDR, total-read
    # and BAF tables.
    for indir in args.INPUT:
        if not os.path.isdir(indir):
            raise ValueError("Input directory does not exists: {}".format(indir))
        required = (
            (os.path.join(indir, 'rdr', 'rdr.tsv'), 'RDR'),
            (os.path.join(indir, 'rdr', 'total.tsv'), 'Total read'),
            (os.path.join(indir, 'baf', 'baf.tsv'), 'BAF'),
        )
        for path, label in required:
            if not os.path.isfile(path):
                raise ValueError("Input directory does not contain {} file: {}".format(label, path))
    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exists: {}".format(args.rundir))
    # BUGFIX: explicit None check; the original `if args.seed and ...` let
    # `--seed 0` slip through because 0 is falsy.
    if args.seed is not None and args.seed < 1:
        raise ValueError("The random seed must be positive!")
    # NOTE(review): the check enforces maxploidy >= 3 while the message says
    # "at least 2" -- kept as in the original, confirm the intended bound.
    if args.maxploidy < 3:
        raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 2!")
    if args.upperk < 1:
        raise ValueError("The maximum number of clusters must be positive!")

    if not os.path.isfile(args.reference):
        raise ValueError(error("Reference genome file does not exist: {}".format(args.reference)))
    refidx = ['{}.{}'.format(args.reference, ix) for ix in ['amb', 'ann', 'bwt', 'pac', 'sa']]
    if not all(os.path.isfile(f) for f in refidx):
        raise ValueError(error("Some of the BWA index files are missing, please make sure these are available and generated through the command \n\t``bwa index {}''.\n Expected files are: {}".format(args.reference, '\n'.join(refidx))))

    # Parse the haplotype-block size, accepting an optional "kb"/"Mb" suffix.
    try:
        if args.blocksize[-2:] == "kb":
            blocksize = int(args.blocksize[:-2]) * 1000
        elif args.blocksize[-2:] == "Mb":
            blocksize = int(args.blocksize[:-2]) * 1000000
        else:
            blocksize = int(args.blocksize)
    except ValueError:  # narrowed from a bare `except:` that hid real errors
        raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")

    if not args.jobs:
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    bcftools = args.bcftools or "bcftools"
    if which(bcftools) is None:
        raise ValueError("bcftools has not been found or is not executable!")

    samtools = args.samtools or "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    return {
        "input" : args.INPUT,
        "reference" : os.path.abspath(args.reference),
        "names" : args.names,
        "rundir" : args.rundir,
        "blocksize" : blocksize,
        "chromosomes" : args.chromosomes,
        "addgccorr" : args.addgccorr,
        "phasecorr" : not args.nophasecorr,
        "bcftools" : bcftools,
        "samtools" : samtools,
        "maxploidy" : args.maxploidy,
        "upperk" : args.upperk,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "seed" : args.seed,
        "jobs" : args.jobs
    }


def main():
    """Run the combo-call pipeline: aggregate, combine, call, clone, plot."""
    log('Parsing and checking arguments', level='PROGRESS')
    args = parse_args()
    log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO')

    log('Setting directories', level='PROGRESS')
    dbaf, drdr, dcom, dcal, dclo, dplo = setup(args)

    def get_comp(name):
        # Resolve a pipeline component shipped next to the chisel package.
        comp = os.path.join(src, name)
        if not os.path.isfile(comp):
            raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src))
        return comp

    # Aggregation appends to these tables, so refuse to run on leftovers.
    lcel = os.path.join(drdr, 'total.tsv')
    if os.path.isfile(lcel):
        raise ValueError("Total read file {} already exists, please remove it or it'd get overwritten!".format(lcel))
    rdr = os.path.join(drdr, 'rdr.tsv')
    if os.path.isfile(rdr):
        raise ValueError("RDR file {} already exists, please remove it or it'd get overwritten!".format(rdr))
    baf = os.path.join(dbaf, 'baf.tsv')
    if os.path.isfile(baf):
        raise ValueError("BAF file {} already exists, please remove it or it'd get overwritten!".format(baf))

    log('Aggregating previously-computed RDRs and BAFs', level='PROGRESS')
    aggregate(rdr, lcel, baf, args['input'], args['names'])

    log('Combining RDRs and BAFs', level='PROGRESS')
    cmd = 'python2.7 {} -r {} -b {} -j {} -k {} -l {}'.format(get_comp('Combiner.py'), rdr, baf, args['jobs'], args['blocksize'], lcel)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    if args['addgccorr']:
        cmd += " --gccorr {}".format(args['reference'])
    if not args['phasecorr']:
        cmd += " --nophasecorr"
    runcmd(cmd, dcom, out='combo.tsv')
    com = os.path.join(dcom, 'combo.tsv')

    log('Calling', level='PROGRESS')
    cmd = 'python2.7 {} {} -P {} -K {} -j {}'.format(get_comp('Caller.py'), com, args['maxploidy'], args['upperk'], args['jobs'])
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dcal, out='calls.tsv')
    calls = os.path.join(dcal, 'calls.tsv')

    log('Cloning', level='PROGRESS')
    cmd = 'python2.7 {} {}'.format(get_comp('Cloner.py'), calls)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dclo, out='mapping.tsv')
    mapping = os.path.join(dclo, 'mapping.tsv')

    log('Plotting', level='PROGRESS')
    # Plotter runs from inside the plots directory, so its inputs are
    # addressed through the parent directory.
    os.chdir(dplo)
    up = (lambda f : os.path.join(os.pardir, f))
    cmd = 'python2.7 {} {} -m {}'.format(os.path.join(src, 'Plotter.py'), up(calls), up(mapping))
    runcmd(cmd, './')
    os.chdir(os.pardir)


def aggregate(rdr, lcel, baf, input_dirs, names):
    """Append per-sample RDR, total-read and BAF tables into the combined
    tables, prefixing cell barcodes with the sample name.

    Arguments:
        rdr, lcel, baf: destination paths for the combined tables.
        input_dirs: CHISEL run directories to aggregate from.
        names: optional sample names; positional indices are used when the
            list is missing or has the wrong length.
    """
    if names is None or len(names) != len(input_dirs):
        names = list(range(len(input_dirs)))

    def append_prefixed(src_path, dst_path, name, column, width):
        # Pure-Python re-implementation of the original `awk ... >> file`
        # one-liner (BUGFIX: the shell=True version broke, and was injectable,
        # for sample names containing spaces or quotes): emit the first
        # `width` whitespace-separated fields tab-separated, prefixing field
        # `column` with "<name>_".
        with open(src_path) as fsrc, open(dst_path, 'a') as fdst:
            for line in fsrc:
                fields = (line.split() + [''] * width)[:width]
                fields[column] = '{}_{}'.format(name, fields[column])
                fdst.write('\t'.join(fields) + '\n')

    for indir, name in zip(input_dirs, names):
        log('Aggregating RDRs for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'rdr', 'rdr.tsv'), rdr, name, 3, 7)
    for indir, name in zip(input_dirs, names):
        log('Aggregating Total reads for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'rdr', 'total.tsv'), lcel, name, 0, 2)
    for indir, name in zip(input_dirs, names):
        log('Aggregating BAFs for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'baf', 'baf.tsv'), baf, name, 2, 5)
    return


def setup(args):
    """Create (if needed) and return the six working sub-directories
    (baf, rdr, combo, calls, clones, plots) inside the running directory."""
    subdirs = ['baf', 'rdr', 'combo', 'calls', 'clones', 'plots']
    if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in subdirs):
        log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN')
    paths = []
    for sub in subdirs:
        d = os.path.join(args['rundir'], sub)
        if not os.path.isdir(d):
            os.mkdir(d)
        paths.append(d)
    return tuple(paths)


if __name__ == '__main__':
    main()
# --------------------------------------------------------------------------
# /src/chisel/bin/chisel_main.py
# --------------------------------------------------------------------------
#!/usr/bin/env python2.7

import os, sys
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
import argparse
import chisel

src = os.path.dirname(chisel.__file__)
from ..Utils import *
def parse_args():
    """Parse and validate command-line arguments for the complete pipeline.

    Returns:
        dict: validated options keyed by canonical option names.

    Raises:
        ValueError: when any path, executable, or numeric option fails
            validation.
    """
    description = "CHISEL command to run the complete pipeline starting from the 4 required data: (1) Barcoded single-cell BAM; (2) Matched-normal BAM; (3) Reference genome; (4) Phased VCF."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-x", "--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-t", "--tumor", required=True, type=str, help="Barcoded single-cell BAM file")
    parser.add_argument("-n", "--normal", required=True, type=str, help="Matched-normal BAM file")
    parser.add_argument("-r", "--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("-l", "--listphased", type=str, required=True, help="Phased SNPs file (lines of heterozygous germline SNPs must contain either 0|1 or 1|0)")
    parser.add_argument("-b", "--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"")
    parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-m", "--minreads", type=int, required=False, default=300000, help="Minimum number total reads to select cells (default: 300000)")
    parser.add_argument("-p", "--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)")
    parser.add_argument("-K", "--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)")
    parser.add_argument("--addgccorr", required=False, default=False, action='store_true', help="Add additional custome correction for GC bias (default: disabled)")
    parser.add_argument("--nophasecorr", required=False, default=False, action='store_true', help="Disable correction for given phasing bias (default: enabled)")
    parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
    parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
    parser.add_argument("-j", "--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)")
    args = parser.parse_args()

    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exists: {}".format(args.rundir))
    if not os.path.isfile(args.tumor):
        raise ValueError("Barcoded single-cell BAM file does not exist: {}".format(args.tumor))
    if not os.path.isfile(args.normal):
        raise ValueError("Matched-normal BAM file does not exist: {}".format(args.normal))
    if not os.path.isfile(args.reference):
        raise ValueError("Reference genome file does not exist: {}".format(args.reference))
    if not os.path.isfile(args.listphased):
        raise ValueError("Phased SNPs file does not exist: {}".format(args.listphased))
    # BUGFIX: explicit None check; the original `if args.seed and ...` let
    # `--seed 0` slip through because 0 is falsy.
    if args.seed is not None and args.seed < 1:
        raise ValueError("The random seed must be positive!")
    if args.minreads < 1:
        raise ValueError("The minimum number of reads must be positive!")
    # NOTE(review): the check enforces maxploidy >= 3 while the message says
    # "at least 2" -- kept as in the original, confirm the intended bound.
    if args.maxploidy < 3:
        raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 2!")
    if args.upperk < 1:
        raise ValueError("The maximum number of clusters must be positive!")

    # Both size options share the same "<int>[kb|Mb]" grammar.
    size = _parse_size(args.size)
    blocksize = _parse_size(args.blocksize)

    if not args.jobs:
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    bcftools = args.bcftools or "bcftools"
    if which(bcftools) is None:
        raise ValueError("bcftools has not been found or is not executable!")

    samtools = args.samtools or "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    return {
        "rundir" : args.rundir,
        "tumor" : args.tumor,
        "normal" : args.normal,
        "reference" : args.reference,
        "listphased" : args.listphased,
        "binsize" : size,
        "blocksize" : blocksize,
        "chromosomes" : args.chromosomes,
        "minreads" : args.minreads,
        "addgccorr" : args.addgccorr,
        "phasecorr" : not args.nophasecorr,
        "bcftools" : bcftools,
        "samtools" : samtools,
        "maxploidy" : args.maxploidy,
        "upperk" : args.upperk,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "seed" : args.seed,
        "jobs" : args.jobs
    }


def _parse_size(text):
    """Convert a size such as "5Mb", "50kb" or "1000" to an integer of bp.

    Shared by --size and --blocksize, which previously duplicated this
    parsing with a bare `except:` each.
    """
    try:
        if text[-2:] == "kb":
            return int(text[:-2]) * 1000
        if text[-2:] == "Mb":
            return int(text[:-2]) * 1000000
        return int(text)
    except ValueError:
        raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")


def main():
    """Run the full pipeline: RDRs, BAFs, combine, call, clone, plot."""
    log('Parsing and checking arguments', level='PROGRESS')
    args = parse_args()
    log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO')

    log('Setting directories', level='PROGRESS')
    dbaf, drdr, dcom, dcal, dclo, dplo = setup(args)

    def get_comp(name):
        # Resolve a pipeline component shipped next to the chisel package.
        comp = os.path.join(src, name)
        if not os.path.isfile(comp):
            raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src))
        return comp

    log('Computing RDRs', level='PROGRESS')
    cmd = 'python2.7 {} -n {} -t {} -r {} -b {} -m {} -j {} -c \"{}\" --outdir {}'.format(get_comp('RDREstimator.py'), args['normal'], args['tumor'], args['reference'], args['binsize'], args['minreads'], args['jobs'], args['chromosomes'], drdr)
    if args['samtools'] is not None:
        cmd += " -s {}".format(args['samtools'])
    cmd += " --cellprefix {}".format(args['cellprefix'])
    if args['cellsuffix'] is not None:
        cmd += " --cellsuffix {}".format(args['cellsuffix'])
    runcmd(cmd, drdr, out='rdr.tsv')
    lcel = os.path.join(drdr, 'total.tsv')
    rdr = os.path.join(drdr, 'rdr.tsv')

    log('Computing BAFs', level='PROGRESS')
    cmd = 'python2.7 {} -n {} -t {} -r {} -j {} -c {} -l {}'.format(get_comp('BAFEstimator.py'), args['normal'], args['tumor'], args['reference'], args['jobs'], lcel, args['listphased'])
    if args['samtools'] is not None:
        cmd += " -s {}".format(args['samtools'])
    if args['bcftools'] is not None:
        cmd += " -b {}".format(args['bcftools'])
    cmd += " --cellprefix {}".format(args['cellprefix'])
    if args['cellsuffix'] is not None:
        cmd += " --cellsuffix {}".format(args['cellsuffix'])
    runcmd(cmd, dbaf, out='baf.tsv')
    baf = os.path.join(dbaf, 'baf.tsv')

    log('Combining RDRs and BAFs', level='PROGRESS')
    cmd = 'python2.7 {} -r {} -b {} -j {} -k {} -l {}'.format(get_comp('Combiner.py'), rdr, baf, args['jobs'], args['blocksize'], lcel)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    if args['addgccorr']:
        cmd += " --gccorr {}".format(args['reference'])
    if not args['phasecorr']:
        cmd += " --nophasecorr"
    runcmd(cmd, dcom, out='combo.tsv')
    com = os.path.join(dcom, 'combo.tsv')

    log('Calling', level='PROGRESS')
    cmd = 'python2.7 {} {} -P {} -K {} -j {}'.format(get_comp('Caller.py'), com, args['maxploidy'], args['upperk'], args['jobs'])
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dcal, out='calls.tsv')
    calls = os.path.join(dcal, 'calls.tsv')

    log('Cloning', level='PROGRESS')
    cmd = 'python2.7 {} {}'.format(get_comp('Cloner.py'), calls)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dclo, out='mapping.tsv')
    mapping = os.path.join(dclo, 'mapping.tsv')

    log('Plotting', level='PROGRESS')
    # Plotter runs from inside the plots directory, so its inputs are
    # addressed through the parent directory.
    os.chdir(dplo)
    up = (lambda f : os.path.join(os.pardir, f))
    cmd = 'python2.7 {} {} -m {}'.format(os.path.join(src, 'Plotter.py'), up(calls), up(mapping))
    runcmd(cmd, './')
    os.chdir(os.pardir)


def setup(args):
    """Create (if needed) and return the six working sub-directories
    (baf, rdr, combo, calls, clones, plots) inside the running directory."""
    subdirs = ['baf', 'rdr', 'combo', 'calls', 'clones', 'plots']
    if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in subdirs):
        log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN')
    paths = []
    for sub in subdirs:
        d = os.path.join(args['rundir'], sub)
        if not os.path.isdir(d):
            os.mkdir(d)
        paths.append(d)
    return tuple(paths)


if __name__ == '__main__':
    main()
# --------------------------------------------------------------------------
# /src/chisel/bin/chisel_nonormal_combocall.py
# --------------------------------------------------------------------------
#!/usr/bin/env python2.7

import os, sys
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
import argparse
from subprocess import Popen
import chisel

src = os.path.dirname(chisel.__file__)
from ..Utils import *


def parse_args():
    """Parse and validate command-line arguments for the no-normal
    combo-call pipeline.

    Returns:
        dict: validated options keyed by canonical option names.

    Raises:
        ValueError: when an input directory, a required file, or a numeric
            option fails validation.
    """
    description = "CHISEL command to run the complete pipeline starting from RDRs and BAFs for one or multiple samples from previously executions of CHISEL or CHISEl preprocess."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("INPUT", type=str, nargs='+', help="One or multiple CHISEL directory runs for different samples from which to combine RDRs and BAFs")
    parser.add_argument("-r", "--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("--names", required=False, default=None, type=str, nargs='+', help="Sample names when combining multiple samples (default: idx used)")
    parser.add_argument("-x", "--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-p", "--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)")
    parser.add_argument("-K", "--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)")
    parser.add_argument("--minimumsnps", required=False, type=float, default=0.08, help="Minimum SNP density per kb (default: 0.08)")
    parser.add_argument("--missingsnps", required=False, type=str, default="10,0", help="A,B counts for genomic bins without minimum minimum SNP density (default: 10,0 i.e. BAF=0)")
    parser.add_argument("--nogccorr", required=False, default=False, action='store_true', help="Disable correction for GC bias (default: enabled)")
    parser.add_argument("--nophasecorr", required=False, default=False, action='store_true', help="Disable correction for given phasing bias (default: enabled)")
    parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
    parser.add_argument("--simcov", required=False, type=float, default=2, help="Sequencing fold coverage of simulated normal BAM file (default: 2)")
    parser.add_argument("--binstats", required=False, type=int, default=None, help="Number of bins to sample per chromosome to estimate sequencing stats (default: all are used, fix a number for improving speed)")
    parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
    parser.add_argument("-j", "--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)")
    args = parser.parse_args()

    # Every input run must provide the previously computed RDR, total-read
    # and BAF tables.
    for indir in args.INPUT:
        if not os.path.isdir(indir):
            raise ValueError("Input directory does not exists: {}".format(indir))
        required = (
            (os.path.join(indir, 'rdr', 'rdr.tsv'), 'RDR'),
            (os.path.join(indir, 'rdr', 'total.tsv'), 'Total read'),
            (os.path.join(indir, 'baf', 'baf.tsv'), 'BAF'),
        )
        for path, label in required:
            if not os.path.isfile(path):
                raise ValueError("Input directory does not contain {} file: {}".format(label, path))
    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exists: {}".format(args.rundir))
    # BUGFIX: explicit None check; the original `if args.seed and ...` let
    # `--seed 0` slip through because 0 is falsy.
    if args.seed is not None and args.seed < 1:
        raise ValueError("The random seed must be positive!")
    # NOTE(review): the check enforces maxploidy >= 3 while the message says
    # "at least 2" -- kept as in the original, confirm the intended bound.
    if args.maxploidy < 3:
        raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 2!")
    if args.upperk < 1:
        raise ValueError("The maximum number of clusters must be positive!")
    if args.minimumsnps < 0.0:
        raise ValueError("The minimum SNP density must be >= 0.0!")
    # NOTE(review): these two checks require strictly positive values while
    # their messages say ">= 0.0"; kept as in the original, confirm wording.
    if args.simcov <= 0.0:
        raise ValueError("The sequencing coverage of simulated normal must be >= 0.0!")
    if args.binstats is not None and args.binstats <= 0:
        raise ValueError("The number of bins for sequencing stats must be >= 0.0!")

    if not os.path.isfile(args.reference):
        raise ValueError(error("Reference genome file does not exist: {}".format(args.reference)))
    refidx = ['{}.{}'.format(args.reference, ix) for ix in ['amb', 'ann', 'bwt', 'pac', 'sa']]
    if not all(os.path.isfile(f) for f in refidx):
        raise ValueError(error("Some of the BWA index files are missing, please make sure these are available and generated through the command \n\t``bwa index {}''.\n Expected files are: {}".format(args.reference, '\n'.join(refidx))))

    # Parse the haplotype-block size, accepting an optional "kb"/"Mb" suffix.
    try:
        if args.blocksize[-2:] == "kb":
            blocksize = int(args.blocksize[:-2]) * 1000
        elif args.blocksize[-2:] == "Mb":
            blocksize = int(args.blocksize[:-2]) * 1000000
        else:
            blocksize = int(args.blocksize)
    except ValueError:  # narrowed from a bare `except:` that hid real errors
        raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")

    if not args.jobs:
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    bcftools = args.bcftools or "bcftools"
    if which(bcftools) is None:
        raise ValueError("bcftools has not been found or is not executable!")

    samtools = args.samtools or "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    # BUGFIX: the original dict literal listed "phasecorr" twice (same value
    # both times); the duplicate has been removed.
    return {
        "input" : args.INPUT,
        "reference" : os.path.abspath(args.reference),
        "names" : args.names,
        "rundir" : args.rundir,
        "blocksize" : blocksize,
        "chromosomes" : args.chromosomes,
        "bcftools" : bcftools,
        "samtools" : samtools,
        "maxploidy" : args.maxploidy,
        "upperk" : args.upperk,
        'minimumsnps' : args.minimumsnps,
        'missingsnps' : args.missingsnps,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "gccorr" : not args.nogccorr,
        "phasecorr" : not args.nophasecorr,
        "simcov" : args.simcov,
        "binstats" : args.binstats,
        "seed" : args.seed,
        "jobs" : args.jobs
    }


def main():
    """Run the no-normal combo-call pipeline: aggregate, combine, call,
    clone, plot."""
    log('Parsing and checking arguments', level='PROGRESS')
    args = parse_args()
    log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO')

    log('Setting directories', level='PROGRESS')
    dbaf, drdr, dcom, dcal, dclo, dplo = setup(args)

    def get_comp(name):
        # Resolve a pipeline component shipped next to the chisel package.
        comp = os.path.join(src, name)
        if not os.path.isfile(comp):
            raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src))
        return comp

    # Aggregation appends to these tables, so refuse to run on leftovers.
    lcel = os.path.join(drdr, 'total.tsv')
    if os.path.isfile(lcel):
        raise ValueError("Total read file {} already exists, please remove it or it'd get overwritten!".format(lcel))
    rdr = os.path.join(drdr, 'rdr.tsv')
    if os.path.isfile(rdr):
        raise ValueError("RDR file {} already exists, please remove it or it'd get overwritten!".format(rdr))
    baf = os.path.join(dbaf, 'baf.tsv')
    if os.path.isfile(baf):
        raise ValueError("BAF file {} already exists, please remove it or it'd get overwritten!".format(baf))

    log('Aggregating previously-computed RDRs and BAFs', level='PROGRESS')
    aggregate(rdr, lcel, baf, args['input'], args['names'])

    log('Combining RDRs and BAFs', level='PROGRESS')
    cmd = 'python2.7 {} -r {} -b {} -j {} -k {} -l {} --minimumsnps {} --missingsnps {}'.format(get_comp('Combiner.py'), rdr, baf, args['jobs'], args['blocksize'], lcel, args['minimumsnps'], args['missingsnps'])
    if args['gccorr']:
        cmd += " --gccorr {}".format(args['reference'])
    if not args['phasecorr']:
        cmd += " --nophasecorr"
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dcom, out='combo.tsv')
    com = os.path.join(dcom, 'combo.tsv')

    log('Calling', level='PROGRESS')
    cmd = 'python2.7 {} {} -P {} -K {} -j {}'.format(get_comp('Caller.py'), com, args['maxploidy'], args['upperk'], args['jobs'])
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dcal, out='calls.tsv')
    calls = os.path.join(dcal, 'calls.tsv')

    log('Cloning', level='PROGRESS')
    cmd = 'python2.7 {} {}'.format(get_comp('Cloner.py'), calls)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dclo, out='mapping.tsv')
    mapping = os.path.join(dclo, 'mapping.tsv')

    log('Plotting', level='PROGRESS')
    # Plotter runs from inside the plots directory, so its inputs are
    # addressed through the parent directory.
    os.chdir(dplo)
    up = (lambda f : os.path.join(os.pardir, f))
    cmd = 'python2.7 {} {} -m {}'.format(os.path.join(src, 'Plotter.py'), up(calls), up(mapping))
    runcmd(cmd, './')
    os.chdir(os.pardir)


def aggregate(rdr, lcel, baf, input_dirs, names):
    """Append per-sample RDR, total-read and BAF tables into the combined
    tables, prefixing cell barcodes with the sample name.

    Arguments:
        rdr, lcel, baf: destination paths for the combined tables.
        input_dirs: CHISEL run directories to aggregate from.
        names: optional sample names; positional indices are used when the
            list is missing or has the wrong length.
    """
    if names is None or len(names) != len(input_dirs):
        names = list(range(len(input_dirs)))

    def append_prefixed(src_path, dst_path, name, column, width):
        # Pure-Python re-implementation of the original `awk ... >> file`
        # one-liner (BUGFIX: the shell=True version broke, and was injectable,
        # for sample names containing spaces or quotes): emit the first
        # `width` whitespace-separated fields tab-separated, prefixing field
        # `column` with "<name>_".
        with open(src_path) as fsrc, open(dst_path, 'a') as fdst:
            for line in fsrc:
                fields = (line.split() + [''] * width)[:width]
                fields[column] = '{}_{}'.format(name, fields[column])
                fdst.write('\t'.join(fields) + '\n')

    for indir, name in zip(input_dirs, names):
        log('Aggregating RDRs for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'rdr', 'rdr.tsv'), rdr, name, 3, 7)
    for indir, name in zip(input_dirs, names):
        log('Aggregating Total reads for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'rdr', 'total.tsv'), lcel, name, 0, 2)
    for indir, name in zip(input_dirs, names):
        log('Aggregating BAFs for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'baf', 'baf.tsv'), baf, name, 2, 5)
    return


def setup(args):
    """Create (if needed) and return the six working sub-directories
    (baf, rdr, combo, calls, clones, plots) inside the running directory."""
    subdirs = ['baf', 'rdr', 'combo', 'calls', 'clones', 'plots']
    if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in subdirs):
        log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN')
    paths = []
    for sub in subdirs:
        d = os.path.join(args['rundir'], sub)
        if not os.path.isdir(d):
            os.mkdir(d)
        paths.append(d)
    return tuple(paths)


if __name__ == '__main__':
    main()
# --------------------------------------------------------------------------
# /src/chisel/bin/chisel_plotting.py
# --------------------------------------------------------------------------
#!/usr/bin/env python2.7

import os
import argparse
import subprocess as sp
import multiprocessing as mp
import shlex
import datetime
import re

import chisel

src = os.path.dirname(chisel.__file__)
from ..Utils import *
def parse_args():
    """Parse and validate the command-line arguments of chisel-plotting.

    Returns:
        dict of validated values keyed by the names used in main().

    Raises:
        ValueError -- when the input file or clone map is missing, the figure
                      format is unknown, or the sample size is not positive.
    """
    description = "CHISEL command to re-create the plots."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("INPUT", nargs='?', default='calls/calls.tsv', type=str, help="Input file with inferred copy numbers (default: calls/calls.tsv)")
    # FIX: the help previously claimed "default: not used" although the
    # declared default is clones/mapping.tsv.
    parser.add_argument("-m", "--clonemap", required=False, type=str, default='clones/mapping.tsv', help="Clone map (default: clones/mapping.tsv)")
    parser.add_argument("-f", "--figformat", required=False, type=str, default='png', help="Format of output figures (default: png, the only other option is pdf)")
    parser.add_argument("-s", "--sample", required=False, type=int, default=20, help="Number of cells to sample (default: 20)")
    parser.add_argument("--excludenoisy", required=False, default=False, action='store_true', help="Exclude noisy cells from plots (default: False)")
    parser.add_argument("--gridsize", required=False, type=str, default='12,6', help="Grid dimensions specified as comma-separated numbers (default: 12,6)")
    parser.add_argument("--plotsize", required=False, type=str, default='5,1.5', help="Plot dimensions for RDR-BAF plots, specified as comma-separated numbers (default: 5,1.5)")
    parser.add_argument("--clussize", required=False, type=str, default='5,3', help="Grid dimensions for clustered plots, specified as comma-separated numbers (default: 5,3)")
    parser.add_argument("--xmax", required=False, type=float, default=None, help="Maximum x-axis value (default: None)")
    parser.add_argument("--xmin", required=False, type=float, default=None, help="Minimum x-axis value (default: None)")
    # BUGFIX: the two y-axis help strings previously said "x-axis" (copy-paste).
    parser.add_argument("--ymax", required=False, type=float, default=None, help="Maximum y-axis value (default: None)")
    parser.add_argument("--ymin", required=False, type=float, default=None, help="Minimum y-axis value (default: None)")
    args = parser.parse_args()

    if not os.path.isfile(args.INPUT):
        raise ValueError('ERROR: input file {} does not exist!'.format(args.INPUT))
    if args.clonemap and not os.path.isfile(args.clonemap):
        raise ValueError('ERROR: the provided clone map does not exist!')
    if args.figformat not in ['pdf', 'png']:
        raise ValueError('ERROR: figure format must be either pdf or png!')
    if args.sample < 1:
        raise ValueError('ERROR: number of sampled cells must be positive!')

    return {
        'input' : args.INPUT,
        'clonemap' : args.clonemap,
        'format' : args.figformat,
        'sample' : args.sample,
        'nonoisy' : args.excludenoisy,
        'gridsize' : args.gridsize,
        'plotsize' : args.plotsize,
        'clussize' : args.clussize,
        'xmax' : args.xmax,
        'xmin' : args.xmin,
        'ymax' : args.ymax,
        'ymin' : args.ymin
    }
{}'.format(args['ymin']) 89 | 90 | runcmd(cmd, './') 91 | os.chdir(os.pardir) 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | os.environ["OMP_NUM_THREADS"] = "1" 5 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 6 | os.environ["MKL_NUM_THREADS"] = "1" 7 | os.environ["VECLIB_MAXIMUM_THREADS"] = "1" 8 | os.environ["NUMEXPR_NUM_THREADS"] = "1" 9 | import argparse 10 | import chisel 11 | 12 | src = os.path.dirname(chisel.__file__) 13 | from ..Utils import * 14 | 15 | 16 | def parse_args(): 17 | description = "Preprocess CHISEL command to compute RDRs and BAFs preprocess data from standard CHISEL inputs." 18 | parser = argparse.ArgumentParser(description=description) 19 | parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)") 20 | parser.add_argument("-t","--tumor", required=True, type=str, help="Barcoded single-cell BAM file") 21 | parser.add_argument("-n","--normal", required=True, type=str, help="Matched-normal BAM file") 22 | parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome") 23 | parser.add_argument("-l","--listphased", type=str, required=True, help="Phased SNPs file (lines of heterozygous germline SNPs must contain either 0|1 or 1|0)") 24 | parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"") 25 | parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)") 26 | parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of 
def _parse_genomic_size(text, label):
    """Convert a size string, optionally suffixed with 'kb' or 'Mb', to bp.

    Arguments:
        text  -- the user-provided size string
        label -- option name used in the error message

    Raises:
        ValueError -- when text is not a number with an optional suffix.
    """
    try:
        if text[-2:] == "kb":
            return int(text[:-2]) * 1000
        elif text[-2:] == "Mb":
            return int(text[:-2]) * 1000000
        else:
            return int(text)
    except (ValueError, TypeError):
        raise ValueError("{} must be a number, optionally ending with either \"kb\" or \"Mb\"!".format(label))


def parse_args():
    """Parse and validate the command-line arguments of chisel-preprocess.

    Returns:
        dict of validated values keyed by the names used in main().

    Raises:
        ValueError -- on missing files/directories, out-of-range numeric
                      options, malformed sizes, or missing executables.
    """
    description = "Preprocess CHISEL command to compute RDRs and BAFs from standard CHISEL inputs."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-t","--tumor", required=True, type=str, help="Barcoded single-cell BAM file")
    parser.add_argument("-n","--normal", required=True, type=str, help="Matched-normal BAM file")
    parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("-l","--listphased", type=str, required=True, help="Phased SNPs file (lines of heterozygous germline SNPs must contain either 0|1 or 1|0)")
    parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"")
    parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separated list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-m","--minreads", type=int, required=False, default=300000, help="Minimum total number of reads to select cells (default: 300000)")
    parser.add_argument("-p","--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)")
    parser.add_argument("-K","--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)")
    parser.add_argument("--addgccorr", required=False, default=False, action='store_true', help="Add additional custom correction for GC bias (default: disabled)")
    parser.add_argument("--nophasecorr", required=False, default=False, action='store_true', help="Disable correction for given phasing bias (default: enabled)")
    parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
    parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
    parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallel jobs to use (default: equal to number of available processors)")
    args = parser.parse_args()

    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exist: {}".format(args.rundir))
    if not os.path.isfile(args.tumor):
        raise ValueError("Barcoded single-cell BAM file does not exist: {}".format(args.tumor))
    if not os.path.isfile(args.normal):
        raise ValueError("Matched-normal BAM file does not exist: {}".format(args.normal))
    if not os.path.isfile(args.reference):
        raise ValueError("Reference genome file does not exist: {}".format(args.reference))
    if not os.path.isfile(args.listphased):
        raise ValueError("Phased SNPs file does not exist: {}".format(args.listphased))
    if args.seed and args.seed < 1:
        raise ValueError("The random seed must be positive!")
    if args.minreads < 1:
        raise ValueError("The minimum number of reads must be positive!")
    # BUGFIX: the guard enforces maxploidy >= 3 but the message claimed
    # "at least 2"; the message now matches the enforced bound.
    if args.maxploidy < 3:
        raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 3!")
    if args.upperk < 1:
        raise ValueError("The maximum number of clusters must be positive!")

    # Both -b/--size and -k/--blocksize share the same suffix grammar; the
    # blocksize error message previously said "Size" for both (copy-paste).
    size = _parse_genomic_size(args.size, "Size")
    blocksize = _parse_genomic_size(args.blocksize, "Block size")

    if not args.jobs:
        # NOTE(review): `mp` is not imported by this module directly; it is
        # presumably re-exported by the star import from Utils -- confirm.
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    bcftools = args.bcftools
    if not bcftools:
        bcftools = "bcftools"
    if which(bcftools) is None:
        raise ValueError("bcftools has not been found or is not executable!")

    samtools = args.samtools
    if not samtools:
        samtools = "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    return {
        "rundir" : args.rundir,
        "tumor" : args.tumor,
        "normal" : args.normal,
        "reference" : args.reference,
        "listphased" : args.listphased,
        "binsize" : size,
        "blocksize" : blocksize,
        "chromosomes" : args.chromosomes,
        "minreads" : args.minreads,
        "addgccorr" : args.addgccorr,
        "phasecorr" : not args.nophasecorr,
        "bcftools" : bcftools,
        "samtools" : samtools,
        "maxploidy" : args.maxploidy,
        "upperk" : args.upperk,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "seed" : args.seed,
        "jobs" : args.jobs
    }
{}, is anything been moved?".format(name, src)) 132 | return comp 133 | 134 | log('Computing RDRs', level='PROGRESS') 135 | cmd = 'python2.7 {} -n {} -t {} -r {} -b {} -m {} -j {} -c \"{}\" --outdir {}' 136 | cmd = cmd.format(get_comp('RDREstimator.py'), args['normal'], args['tumor'], args['reference'], args['binsize'], args['minreads'], args['jobs'], args['chromosomes'], drdr) 137 | if args['samtools'] is not None: 138 | cmd += " -s {}".format(args['samtools']) 139 | cmd += " --cellprefix {}".format(args['cellprefix']) 140 | if args['cellsuffix'] is not None: 141 | cmd += " --cellsuffix {}".format(args['cellsuffix']) 142 | runcmd(cmd, drdr, out='rdr.tsv') 143 | lcel = os.path.join(drdr, 'total.tsv') 144 | rdr = os.path.join(drdr, 'rdr.tsv') 145 | 146 | log('Computing BAFs', level='PROGRESS') 147 | cmd = 'python2.7 {} -n {} -t {} -r {} -j {} -c {} -l {}' 148 | cmd = cmd.format(get_comp('BAFEstimator.py'), args['normal'], args['tumor'], args['reference'], args['jobs'], lcel, args['listphased']) 149 | if args['samtools'] is not None: 150 | cmd += " -s {}".format(args['samtools']) 151 | if args['bcftools'] is not None: 152 | cmd += " -b {}".format(args['bcftools']) 153 | cmd += " --cellprefix {}".format(args['cellprefix']) 154 | if args['cellsuffix'] is not None: 155 | cmd += " --cellsuffix {}".format(args['cellsuffix']) 156 | runcmd(cmd, dbaf, out='baf.tsv') 157 | baf = os.path.join(dbaf, 'baf.tsv') 158 | 159 | 160 | def setup(args): 161 | if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in ['baf', 'rdr', 'combo', 'calls', 'clones', 'plots']): 162 | log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN') 163 | 164 | dbaf = os.path.join(args['rundir'], 'baf') 165 | if not os.path.isdir(dbaf): 166 | os.mkdir(dbaf) 167 | 168 | drdr = os.path.join(args['rundir'], 'rdr') 169 | if not os.path.isdir(drdr): 170 | os.mkdir(drdr) 171 | 172 | return 
dbaf, drdr 173 | 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_pseudonormal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os 4 | import argparse 5 | import shlex, shutil 6 | import multiprocessing as mp 7 | 8 | from multiprocessing import Lock, Value, Pool 9 | from collections import defaultdict 10 | from collections import Counter 11 | 12 | import numpy as np 13 | 14 | import chisel 15 | 16 | src = os.path.dirname(chisel.__file__) 17 | from ..Utils import * 18 | from ..RDREstimator import * 19 | 20 | 21 | def parse_args(): 22 | description = "CHISEL command to generate a pseudo-matched normal sample by extracting diploid cells from a barcoded single-cell BAM file." 23 | parser = argparse.ArgumentParser(description=description) 24 | parser.add_argument("INPUT", type=str, help="Barcoded single-cell BAM file") 25 | parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome") 26 | parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)") 27 | parser.add_argument("-e","--threshold", type=float, required=False, default=0.9, help="Minimum fraction of diploid genome to select diploid cells (default: 0.9)") 28 | parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"") 29 | parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")") 30 | parser.add_argument("-m","--minreads", type=int, required=False, default=100000, help="Minimum number total 
def parse_args():
    """Parse and validate the command-line arguments of chisel-pseudonormal.

    Returns:
        dict of validated values keyed by the names used in main().

    Raises:
        ValueError -- on missing files/directories, a pre-existing temporary
                      directory, out-of-range options, a malformed size, or a
                      missing samtools executable.
    """
    description = "CHISEL command to generate a pseudo-matched normal sample by extracting diploid cells from a barcoded single-cell BAM file."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("INPUT", type=str, help="Barcoded single-cell BAM file")
    parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-e","--threshold", type=float, required=False, default=0.9, help="Minimum fraction of diploid genome to select diploid cells (default: 0.9)")
    parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separated list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-m","--minreads", type=int, required=False, default=100000, help="Minimum total number of reads to select cells (default: 100000)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallel jobs to use (default: equal to number of available processors)")
    parser.add_argument("--tmpdir", required=False, default='_TMP', type=str, help="Temporary directory in running directory (default: _TMP)")
    parser.add_argument("-n","--normal", required=False, type=str, default="pseudonormal.bam", help="Name of the generated pseudo matched-normal BAM file (default: pseudonormal.bam)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default='', help="Suffix of cell barcode field in SAM format (default: none)")
    args = parser.parse_args()

    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exist: {}".format(args.rundir))
    tmpdir = os.path.join(args.rundir, args.tmpdir)
    if os.path.isdir(tmpdir):
        raise ValueError("Temporary directory already exists within specified running directory: {}".format(tmpdir))
    if not os.path.isfile(args.INPUT):
        raise ValueError("Barcoded single-cell BAM file does not exist: {}".format(args.INPUT))
    # BUGFIX: the original test `not args.normal[:-4] != ".bam"` sliced off
    # the extension and double-negated the comparison, so it never rejected
    # any name in practice; check the suffix directly instead.
    if not args.normal.endswith(".bam"):
        raise ValueError("The provided output name does not end in .bam: {}".format(args.normal))
    if not (0.0 <= args.threshold <= 1.0):
        raise ValueError("The provided threshold is not in [0, 1]: {}".format(args.threshold))
    if not os.path.isfile(args.reference):
        raise ValueError("Reference genome file does not exist: {}".format(args.reference))
    if args.minreads < 1:
        raise ValueError("The minimum number of reads must be positive!")

    size = 0
    try:
        if args.size[-2:] == "kb":
            size = int(args.size[:-2]) * 1000
        elif args.size[-2:] == "Mb":
            size = int(args.size[:-2]) * 1000000
        else:
            size = int(args.size)
    except (ValueError, TypeError):
        raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")

    if not args.jobs:
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    samtools = args.samtools
    if not samtools:
        samtools = "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    return {
        "rundir" : args.rundir,
        "tmpdir" : tmpdir,
        "tumor" : args.INPUT,
        "thres" : args.threshold,
        "normal" : args.normal,
        "reference" : args.reference,
        "binsize" : size,
        "chromosomes" : args.chromosomes,
        "minreads" : args.minreads,
        "samtools" : samtools,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "jobs" : args.jobs
    }
level='INFO') 110 | 111 | log('Computing total numbers of sequenced reads') 112 | total = reduce(inupdate, (Counter(counts[c][b]) for c in counts for b in counts[c])) 113 | 114 | log('Selecting all cells to consider for the analysis') 115 | if args['minreads']: 116 | cells = set(e for e in total if total[e] >= args['minreads']) 117 | log('Number of selected cells: {}'.format(len(cells)), level='INFO') 118 | 119 | log('Selecting diploid cells') 120 | diploid = sorted(set(filter((lambda e : isdiploid(counts, e, args['thres'])), cells))) 121 | dlist = os.path.join(args['rundir'], 'diploid.tsv') 122 | with open(dlist, 'w') as o: 123 | o.write('\n'.join(diploid) + '\n') 124 | log('Number of identified diploid cells: {}'.format(len(diploid)), level='INFO') 125 | 126 | if len(diploid) > 0: 127 | cov = (float(sum(total[e] for e in diploid)) * 100.0) / float(sum(b[1] - b[0] for c in counts for b in counts[c])) 128 | log('Approximate sequencing coverage of pseudo matched-normal sample: {}'.format(cov), level='INFO') 129 | 130 | log('Extracting sequencing reads from selected diploid cells') 131 | extracting_diploid(args['tumor'], args['samtools'], chrs, args['tmpdir'], dlist, args['jobs']) 132 | 133 | log('Merging and extracted sequencing reads and indexing the output pseduo matched-normal sample') 134 | merging_diploid(chrs, args['tmpdir'], args['samtools'], os.path.join(args['rundir'], args['normal'])) 135 | 136 | log('Removing temporary files') 137 | shutil.rmtree(args['tmpdir']) 138 | 139 | log('KTHXBYE') 140 | 141 | 142 | def isdiploid(counts, cell, THRES): 143 | rdr = np.array([counts[c][b][cell] for c in counts for b in counts[c] if cell in counts[c][b] and counts[c][b][cell] > 0]) 144 | base = np.sum(rdr) / float(rdr.shape[0]) 145 | assert base > 0, "Found a cell with no sequencing reads" 146 | rdr = rdr / base 147 | avg = 2.0 / (np.sum(rdr) / float(rdr.shape[0])) 148 | dip = (lambda t : np.sum(np.rint(t * rdr) == 2)) 149 | scale = max((avg + (x/100.0)*d for x in 
xrange(0, 100+1, 1) for d in {-1, 1}), key=dip) 150 | return (dip(scale) / float(rdr.shape[0])) >= THRES 151 | 152 | 153 | def extracting_diploid(bam, samt, chrs, tmpdir, dlist, J): 154 | lock = Lock() 155 | counter = Value('i', 0) 156 | assert not os.path.isdir(tmpdir) 157 | os.mkdir(tmpdir) 158 | initargs = (lock, counter, len(chrs), bam, samt, tmpdir, dlist) 159 | pool = Pool(processes=min(J, len(chrs)), initializer=init_extracting_diploid, initargs=initargs) 160 | res = {o for o in pool.imap_unordered(extract, chrs)} 161 | #if o.strip() != '': 162 | #raise ValueError("SAMtools raised the following error during extraction ofsequencing reads: {}".format(o)) 163 | return 164 | 165 | 166 | def init_extracting_diploid(lock, counter, l, bam, samt, _tmpdir, dlist): 167 | global bar, cmd_sam, cmd_gre, cmd_com, tmpdir 168 | bar = ProgressBar(total=l, length=min(l, 40), lock=lock, counter=counter, verbose=False) 169 | cmd_sam = "{} view -h -F 1796 -q 13 {} {}".format(samt, bam, "{}") 170 | cmd_gre = "grep -F -f {} -e \"@HD\" -e \"@SQ\" -e \"@RG\" -e \"@PG\" -e \"@CO\"".format(dlist) 171 | cmd_com = "{} sort -O bam -o {} -T {}".format(samt, "{}", "{}") 172 | tmpdir = _tmpdir 173 | 174 | 175 | def extract(c): 176 | cmd = cmd_sam.format(c) 177 | out = os.path.join(tmpdir, '{}.bam'.format(c)) 178 | tmp = os.path.join(tmpdir, '_TMP_{}'.format(c)) 179 | os.mkdir(tmp) 180 | sam = sp.Popen(shlex.split(cmd_sam.format(c)), stdout=sp.PIPE, stderr=sp.PIPE) 181 | gre = sp.Popen(shlex.split(cmd_gre), stdin=sam.stdout, stdout=sp.PIPE, stderr=sp.PIPE) 182 | stdout, stderr = sp.Popen(shlex.split(cmd_com.format(out, tmp)), stdin=gre.stdout, stdout=sp.PIPE, stderr=sp.PIPE).communicate() 183 | return stderr.strip() 184 | 185 | 186 | def merging_diploid(chrs, tmpdir, samt, out): 187 | cfiles = map((lambda c : os.path.join(tmpdir, '{}.bam'.format(c))), sorted(chrs, key=orderchrs)) 188 | assert all(os.path.isfile(f) for f in cfiles), "Extracted reads are missing for some files!" 
189 | cmd = "{} merge -f {} {}".format(samt, out, ' '.join(cfiles)) 190 | stdout, stderr = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE).communicate() 191 | #if stderr.strip() != '': 192 | # raise ValueError("SAMtools merging terminated with the following error: {}".format(stderr)) 193 | cmd = "{} index {}".format(samt, out) 194 | stdout, stderr = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE).communicate() 195 | #if stderr.strip() != '': 196 | # raise ValueError("SAMtools indexing terminated with the following error: {}".format(stderr)) 197 | return 198 | 199 | 200 | if __name__ == '__main__': 201 | main() 202 | -------------------------------------------------------------------------------- /src/chisel/bin/count.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk 2 | 3 | BEGIN{} 4 | { 5 | if ( match($0, /CB:Z:[ACGT]+/) ) 6 | { 7 | REF = $4 - 1; 8 | QUE = 0; 9 | CIG = $6; 10 | CEL = substr($0, RSTART+5, RLENGTH-5); 11 | while( match(CIG, /^[[:digit:]]+/) ) 12 | { 13 | N = substr(CIG, RSTART, RLENGTH); 14 | CIG = substr(CIG, RSTART+RLENGTH); 15 | if( match(CIG, /^[MIDNSHP=X]/) ) 16 | { 17 | C = substr(CIG, RSTART, RLENGTH); 18 | CIG = substr(CIG, RSTART+RLENGTH); 19 | if (C == "M" || C == "=" || C == "X") 20 | { 21 | REF += N; 22 | QUE += N; 23 | if (TAG <= REF) 24 | { 25 | X[CEL, substr($10, QUE - REF + TAG, 1)]++; 26 | next; 27 | }; 28 | } else if (C == "D" || C == "N") 29 | { 30 | REF += N; 31 | if (TAG <= REF) 32 | { 33 | X[CEL, "N"]++; 34 | next; 35 | } 36 | } else if (C == "I" || C == "S") 37 | { 38 | QUE += N; 39 | }; 40 | }; 41 | }; 42 | }; 43 | } 44 | END{ for (p in X) { split(p, x, SUBSEP); print x[1], x[2], X[x[1], x[2]] } } 45 | -------------------------------------------------------------------------------- /src/chisel/count.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk 2 | 3 | BEGIN{} 4 | { 5 | if ( match($0, 
/CB:Z:[ACGT]+/) ) 6 | { 7 | REF = $4 - 1; 8 | QUE = 0; 9 | CIG = $6; 10 | CEL = substr($0, RSTART+5, RLENGTH-5); 11 | while( match(CIG, /^[[:digit:]]+/) ) 12 | { 13 | N = substr(CIG, RSTART, RLENGTH); 14 | CIG = substr(CIG, RSTART+RLENGTH); 15 | if( match(CIG, /^[MIDNSHP=X]/) ) 16 | { 17 | C = substr(CIG, RSTART, RLENGTH); 18 | CIG = substr(CIG, RSTART+RLENGTH); 19 | if (C == "M" || C == "=" || C == "X") 20 | { 21 | REF += N; 22 | QUE += N; 23 | if (TAG <= REF) 24 | { 25 | X[CEL, substr($10, QUE - REF + TAG, 1)]++; 26 | next; 27 | }; 28 | } else if (C == "D" || C == "N") 29 | { 30 | REF += N; 31 | if (TAG <= REF) 32 | { 33 | X[CEL, "N"]++; 34 | next; 35 | } 36 | } else if (C == "I" || C == "S") 37 | { 38 | QUE += N; 39 | }; 40 | }; 41 | }; 42 | }; 43 | } 44 | END{ for (p in X) { split(p, x, SUBSEP); print x[1], x[2], X[x[1], x[2]] } } 45 | 46 | -------------------------------------------------------------------------------- /tests/allchecks.sh: -------------------------------------------------------------------------------- 1 | # Checks 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | This script runs all the tests to check that the current CHISEL implementation is correct and behaves as expected. 5 | 6 | ## Set up 7 | 8 | ```shell 9 | set -e 10 | set -o xtrace 11 | PS4='[\t]' 12 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 13 | rm -rf X/ complete/ callingE/ cloningE/ plottingE/ pseudonormal/ 14 | :<<'```shell' # Ignore this line 15 | ``` 16 | 17 | ## Check function 18 | 19 | ```shell 20 | check () { 21 | if cmp $1 $2 22 | then 23 | echo "CHECK $3: TEST $1 SUCCESS!" 24 | else 25 | echo "CHECK $3: TEST $1 FAILED!" 
26 | exit 1 27 | fi 28 | } 29 | :<<'```shell' # Ignore this line 30 | ``` 31 | 32 | ## Check complete 33 | 34 | ```shell 35 | mkdir X/ 36 | cp ../demos/complete/demo-complete.sh X/demo-complete.sh 37 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/complete.tar.gz | tar -xvz 38 | check complete.chk <(bash X/demo-complete.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e "gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "complete" 39 | for F in complete/*.png; do check ${F} X/plots/$(basename ${F}) "complete"; done 40 | check complete/calls.tsv X/calls/calls.tsv "complete" 41 | check complete/mapping.tsv X/clones/mapping.tsv "complete" 42 | rm -rf X/ complete/ 43 | :<<'```shell' # Ignore this line 44 | ``` 45 | 46 | ## Check callingE 47 | 48 | ```shell 49 | mkdir X/ 50 | cp ../demos/callingE/demo-callingE.sh X/demo-callingE.sh 51 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/callingE.tar.gz | tar -xvz 52 | check callingE.chk <(bash X/demo-callingE.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e "gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "callingE" 53 | for F in callingE/*.png; do check ${F} X/plots/$(basename ${F}) "callingE"; done 54 | check callingE/calls.tsv X/calls/calls.tsv "callingE" 55 | check callingE/mapping.tsv X/clones/mapping.tsv "callingE" 56 | rm -rf X/ callingE/ 57 | :<<'```shell' # Ignore this line 58 | ``` 59 | 60 | ## Check cloningE 61 | 62 | ```shell 63 | mkdir X/ 64 | cp ../demos/cloningE/demo-cloningE.sh X/demo-cloningE.sh 65 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/cloningE.tar.gz | tar -xvz 66 | check cloningE.chk <(bash X/demo-cloningE.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e 
"gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "cloningE" 67 | for F in cloningE/*.png; do check ${F} X/plots/$(basename ${F}) "cloningE"; done 68 | check cloningE/mapping.tsv X/clones/mapping.tsv "cloningE" 69 | rm -rf X/ cloningE/ 70 | :<<'```shell' # Ignore this line 71 | ``` 72 | 73 | ## Check plottingE 74 | 75 | ```shell 76 | mkdir X/ 77 | cp ../demos/plottingE/demo-plottingE.sh X/demo-plottingE.sh 78 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/plottingE.tar.gz | tar -xvz 79 | check plottingE.chk <(bash X/demo-plottingE.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e "gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "plottingE" 80 | for F in plottingE/*.png; do check ${F} X/plots/$(basename ${F}) "plottingE"; done 81 | rm -rf X/ plottingE/ 82 | :<<'```shell' # Ignore this line 83 | ``` 84 | 85 | ## Checking pseudonormal 86 | 87 | ```shell 88 | mkdir X/ 89 | cp ../demos/pseudonormal/demo-pseudonormal.sh X/demo-pseudonormal.sh 90 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/pseudonormal.tar.gz | tar -xvz 91 | check pseudonormal.chk <(bash X/demo-pseudonormal.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e "gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "pseudonormal" 92 | check pseudonormal/diploid.tsv X/diploid.tsv "pseudonormal" 93 | rm -rf X/ pseudonormal/ 94 | :<<'```shell' # Ignore this line 95 | ``` 96 | 97 | ## Successful checks 98 | 99 | ```shell 100 | echo "ALL CHECKS PASSED SUCCESSFULLY!" 101 | exit $? 
102 | ``` 103 | -------------------------------------------------------------------------------- /tests/callingE.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | export INPUT=data/combo.tsv 5 | INPUT=data/combo.tsv 6 | : 7 | Parsing and checking arguments 8 | Arguments: 9 | upperk : 100 10 | sensitivity : 1.0 11 | seed : 25 12 | INPUT : data/combo.tsv 13 | maxploidy : 4 14 | 15 | Setting directories 16 | Calling 17 | Parsing and checking arguments 18 | Arguments: 19 | significativity : 0.02 20 | sensitivity : 1.0 21 | seed : 25 22 | input : data/combo.tsv 23 | LB : 0 24 | scoring : False 25 | e : 0.05 26 | shift : 0.05 27 | maxploidy : 4 28 | fastscaling : False 29 | restarts : 200 30 | lord : 1 31 | UB : 100 32 | Reading combined RDRs and BAFs of barcoded cells 33 | Formatting RDRs and BAFs 34 | Clustering globally 35 | Computing for 100: 36 | Objective value for 100: 0.328880907635 37 | Computing for 50: 38 | Objective value for 50: 0.380881364939 39 | Computing for 75: 40 | Objective value for 75: 0.354660110754 41 | Computing for 62: 42 | Objective value for 62: 0.369479243234 43 | Computing for 56: 44 | Objective value for 56: 0.375508283812 45 | Computing for 53: 46 | Objective value for 53: 0.382415700949 47 | Computing for 54: 48 | Objective value for 54: 0.377394006258 49 | Computing for 53: 50 | Objective value for 53: 0.382415700949 51 | Computing for 54: 52 | Objective value for 54: 0.377394006258 53 | Estimating RDR and BAF of every cluster 54 | Selecting ploidies 55 | Number of cells for every ploidy' level: 56 | Cells with base ploidy 2: 401 57 | Cells with base ploidy 4: 1674 58 | Inferring copy numbers 59 | Phasing copy-number states along the genome 60 | Writing results 61 | Cloning 62 | Parsing and checking arguments 63 | Arguments: 64 | minsize : 14 65 | refinement : 0.0 66 | seed : 25 67 | maxdiff : 0.06 68 | input : ./calls/calls.tsv 69 | linkage : single 70 | 
Reading input 71 | Clustering cells in clones 72 | Selecting clones 73 | Number of identified clones: 6 74 | Refining clustering 75 | Number of discarded cells: 784 over 2075 in total 76 | Profiling clones 77 | Writing clone map 78 | Writing clone-corrected copy numbers in provided input 79 | Plotting 80 | Parsing and checking arguments 81 | Arguments: 82 | format : png 83 | plotsize : (5.0, 1.5) 84 | sample : 20 85 | xmin : None 86 | clonemap : .././clones/mapping.tsv 87 | nonoisy : False 88 | gridsize : (12.0, 6.0) 89 | ymax : None 90 | clussize : (5.0, 3.0) 91 | xmax : None 92 | ymin : None 93 | input : .././calls/calls.tsv 94 | Reading input 95 | Number of cells: 2075 96 | Number of bins: 570 97 | Setting style 98 | Reading clonemap 99 | Plotting RDR and mirrored BAF plots for 20 random cells in rbplot_mirrored.png 100 | Plotting clustered RDR plots for 20 random cells in crdr.png 101 | Plotting clustered-mirrored BAF plots for 20 random cells in cbaf.png 102 | Plotting read-depth ratios in rdrs.png 103 | Plotting B-allele frequencies in bafs.png 104 | Plotting total copy numbers in totalcn.png 105 | Plotting total copy numbers corrected by clones in totalcn-corrected.png 106 | Plotting LOH in loh.png 107 | Plotting LOH corrected by clones in loh-corrected.png 108 | Plotting A-specific copy numbers in Aspecificcn.png 109 | Plotting A-specific copy numbers corrected by clones in Aspecificcn-corrected.png 110 | Plotting B-specific copy numbers in Bspecificcn.png 111 | Plotting B-specific copy numbers corrected by clones in Bspecificcn-corrected.png 112 | Plotting allele-specific copy numbers in allelecn.png 113 | Plotting allele-specific copy numbers corrected by clones in allelecn-corrected.png 114 | Plotting haplotype-specific copy numbers in haplotypecn.png 115 | Plotting haplotype-specific copy numbers corrected by clones in haplotypecn-corrected.png 116 | KTHKBYE! 
117 | exit 0 118 | -------------------------------------------------------------------------------- /tests/cloningE.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | export INPUT=data/calls.tsv 5 | INPUT=data/calls.tsv 6 | : 7 | Parsing and checking arguments 8 | Arguments: 9 | minsize : 14 10 | refinement : None 11 | seed : 25 12 | maxdiff : 0.06 13 | INPUT : data/calls.tsv 14 | 15 | Setting directories 16 | Cloning 17 | Parsing and checking arguments 18 | Arguments: 19 | minsize : 14 20 | refinement : 0.0 21 | seed : 25 22 | maxdiff : 0.06 23 | input : data/calls.tsv 24 | linkage : single 25 | Reading input 26 | Clustering cells in clones 27 | Selecting clones 28 | Number of identified clones: 6 29 | Refining clustering 30 | Number of discarded cells: 689 over 2075 in total 31 | Profiling clones 32 | Writing clone map 33 | Writing clone-corrected copy numbers in provided input 34 | Plotting 35 | Parsing and checking arguments 36 | Arguments: 37 | format : png 38 | plotsize : (5.0, 1.5) 39 | sample : 20 40 | xmin : None 41 | clonemap : .././clones/mapping.tsv 42 | nonoisy : False 43 | gridsize : (12.0, 6.0) 44 | ymax : None 45 | clussize : (5.0, 3.0) 46 | xmax : None 47 | ymin : None 48 | input : ../data/calls.tsv 49 | Reading input 50 | Number of cells: 2075 51 | Number of bins: 570 52 | Setting style 53 | Reading clonemap 54 | Plotting RDR and mirrored BAF plots for 20 random cells in rbplot_mirrored.png 55 | Plotting clustered RDR plots for 20 random cells in crdr.png 56 | Plotting clustered-mirrored BAF plots for 20 random cells in cbaf.png 57 | Plotting read-depth ratios in rdrs.png 58 | Plotting B-allele frequencies in bafs.png 59 | Plotting total copy numbers in totalcn.png 60 | Plotting total copy numbers corrected by clones in totalcn-corrected.png 61 | Plotting LOH in loh.png 62 | Plotting LOH corrected by clones in loh-corrected.png 63 | Plotting A-specific copy numbers in 
Aspecificcn.png 64 | Plotting A-specific copy numbers corrected by clones in Aspecificcn-corrected.png 65 | Plotting B-specific copy numbers in Bspecificcn.png 66 | Plotting B-specific copy numbers corrected by clones in Bspecificcn-corrected.png 67 | Plotting allele-specific copy numbers in allelecn.png 68 | Plotting allele-specific copy numbers corrected by clones in allelecn-corrected.png 69 | Plotting haplotype-specific copy numbers in haplotypecn.png 70 | Plotting haplotype-specific copy numbers corrected by clones in haplotypecn-corrected.png 71 | KTHKBYE! 72 | exit 0 73 | -------------------------------------------------------------------------------- /tests/complete.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | echo 'Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary.' 5 | Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary. 6 | export TUM=data/cells.bam 7 | TUM=data/cells.bam 8 | echo 'Downloading matched-normal BAM file from Zenodo, please be patient as downloading time may vary.' 9 | Downloading matched-normal BAM file from Zenodo, please be patient as downloading time may vary. 10 | export NOR=data/normal.bam 11 | NOR=data/normal.bam 12 | : 13 | echo 'Downloading human reference genome, please be patient as downloading time may vary.' 14 | Downloading human reference genome, please be patient as downloading time may vary. 
15 | export REF=data/hg19.fa 16 | REF=data/hg19.fa 17 | export DIC=data/hg19.dict 18 | DIC=data/hg19.dict 19 | : 20 | export PHA=data/phases.tsv 21 | PHA=data/phases.tsv 22 | : 23 | Parsing and checking arguments 24 | Arguments: 25 | chromosomes : chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 26 | reference : data/hg19.fa 27 | normal : data/normal.bam 28 | blocksize : 50000 29 | seed : 12 30 | listphased : data/phases.tsv 31 | maxploidy : 4 32 | minreads : 100000 33 | binsize : 5000000 34 | tumor : data/cells.bam 35 | upperk : 100 36 | bcftools : None 37 | 38 | Setting directories 39 | Computing RDRs 40 | Parsing and checking arguments 41 | Arguments: 42 | tumor : data/cells.bam 43 | minreads : 100000 44 | chrs : 45 | normal : data/normal.bam 46 | list : None 47 | ref : data/hg19.fa 48 | bins : 5000000 49 | outdir : ./rdr 50 | Computing bins 51 | Counting reads on normal 52 | Counting reads on barcoded cells 53 | Evaluating set of found cells 54 | Computing total numbers of sequenced reads 55 | Selecting cells 56 | Number of selected cells: 100 57 | Writing the totals in ./rdr/total.tsv 58 | Estimating RDR 59 | KTHXBYE 60 | Computing BAFs 61 | Parsing and checking arguments 62 | Arguments: 63 | tumor : data/cells.bam 64 | gamma : 0.01 65 | phased : data/phases.tsv 66 | normal : data/normal.bam 67 | list : ./rdr/total.tsv 68 | ref : data/hg19.fa 69 | bcftools : bcftools 70 | Extracting chromosomes 71 | Chromosomes analyzed: chr6 72 | Total number of given phased positions: 95659 73 | Counting phased SNPs in matched normal 74 | Number of selected heterozygous SNPs: 93728 75 | Extracting SNP counts for all cells 76 | Reading cell list 77 | Writing A/B counts for selected phased SNPs across selected cells 78 | KTHXBYE 79 | Combining RDRs and BAFs 80 | Parsing and checking arguments 81 | Arguments: 82 | blocksize : 50000 83 | seed : 12 84 | maxerror : None 85 | restarts : 100 86 | minerror : 
0.001 87 | rdr : ./rdr/rdr.tsv 88 | bootstrap : 100 89 | baf : ./baf/baf.tsv 90 | listofcells : ./rdr/total.tsv 91 | significance : 0.05 92 | Read list of cells 93 | Reading RDR 94 | Reading BAF 95 | Combining 96 | Printing combined RDR and BAF 97 | Calling 98 | Parsing and checking arguments 99 | Arguments: 100 | significativity : 0.02 101 | sensitivity : 1.0 102 | seed : 12 103 | input : ./combo/combo.tsv 104 | LB : 0 105 | scoring : False 106 | e : 0.05 107 | shift : 0.05 108 | maxploidy : 4 109 | fastscaling : False 110 | restarts : 200 111 | lord : 1 112 | UB : 100 113 | Reading combined RDRs and BAFs of barcoded cells 114 | Formatting RDRs and BAFs 115 | Clustering globally 116 | Computing for 35: 117 | Objective value for 35: 0.0 118 | Computing for 17: 119 | Objective value for 17: 0.00597601109068 120 | Computing for 8: 121 | Objective value for 8: 0.0131765568746 122 | Computing for 4: 123 | Objective value for 4: 0.0184374208269 124 | Computing for 2: 125 | Objective value for 2: 0.10191549988 126 | Computing for 3: 127 | Objective value for 3: 0.0256275628979 128 | Computing for 2: 129 | Objective value for 2: 0.10191549988 130 | Computing for 3: 131 | Objective value for 3: 0.0256275628979 132 | Estimating RDR and BAF of every cluster 133 | Selecting ploidies 134 | Number of cells for every ploidy' level: 135 | Cells with base ploidy 2: 100 136 | Inferring copy numbers 137 | Phasing copy-number states along the genome 138 | Writing results 139 | Cloning 140 | Parsing and checking arguments 141 | Arguments: 142 | minsize : 14 143 | refinement : 0.0 144 | seed : 12 145 | maxdiff : 0.06 146 | input : ./calls/calls.tsv 147 | linkage : single 148 | Reading input 149 | Clustering cells in clones 150 | Selecting clones 151 | Number of identified clones: 2 152 | Refining clustering 153 | Number of discarded cells: 0 over 100 in total 154 | Profiling clones 155 | Writing clone map 156 | Writing clone-corrected copy numbers in provided input 157 | Plotting 158 | 
Parsing and checking arguments 159 | Arguments: 160 | format : png 161 | plotsize : (5.0, 1.5) 162 | sample : 20 163 | xmin : None 164 | clonemap : .././clones/mapping.tsv 165 | nonoisy : False 166 | gridsize : (12.0, 6.0) 167 | ymax : None 168 | clussize : (5.0, 3.0) 169 | xmax : None 170 | ymin : None 171 | input : .././calls/calls.tsv 172 | Reading input 173 | Number of cells: 100 174 | Number of bins: 35 175 | Setting style 176 | Reading clonemap 177 | Plotting RDR and mirrored BAF plots for 20 random cells in rbplot_mirrored.png 178 | Plotting clustered RDR plots for 20 random cells in crdr.png 179 | Plotting clustered-mirrored BAF plots for 20 random cells in cbaf.png 180 | Plotting read-depth ratios in rdrs.png 181 | Plotting B-allele frequencies in bafs.png 182 | Plotting total copy numbers in totalcn.png 183 | Plotting total copy numbers corrected by clones in totalcn-corrected.png 184 | Plotting LOH in loh.png 185 | Plotting LOH corrected by clones in loh-corrected.png 186 | Plotting A-specific copy numbers in Aspecificcn.png 187 | Plotting A-specific copy numbers corrected by clones in Aspecificcn-corrected.png 188 | Plotting B-specific copy numbers in Bspecificcn.png 189 | Plotting B-specific copy numbers corrected by clones in Bspecificcn-corrected.png 190 | Plotting allele-specific copy numbers in allelecn.png 191 | Plotting allele-specific copy numbers corrected by clones in allelecn-corrected.png 192 | Plotting haplotype-specific copy numbers in haplotypecn.png 193 | Plotting haplotype-specific copy numbers corrected by clones in haplotypecn-corrected.png 194 | KTHKBYE! 
195 | exit 0 196 | -------------------------------------------------------------------------------- /tests/plottingE.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | export INPUT=data/calls.tsv 5 | INPUT=data/calls.tsv 6 | export MAPP=data/mapping.tsv 7 | MAPP=data/mapping.tsv 8 | : 9 | Parsing and checking arguments 10 | Arguments: 11 | format : png 12 | plotsize : 5,1.5 13 | sample : 20 14 | xmin : None 15 | clonemap : data/mapping.tsv 16 | nonoisy : False 17 | gridsize : 12,6 18 | ymax : None 19 | clussize : 5,3 20 | xmax : None 21 | ymin : None 22 | input : data/calls.tsv 23 | 24 | Setting directories 25 | Plotting 26 | Parsing and checking arguments 27 | Arguments: 28 | format : png 29 | plotsize : (5.0, 1.5) 30 | sample : 20 31 | xmin : None 32 | clonemap : ../data/mapping.tsv 33 | nonoisy : False 34 | gridsize : (12.0, 6.0) 35 | ymax : None 36 | clussize : (5.0, 3.0) 37 | xmax : None 38 | ymin : None 39 | input : ../data/calls.tsv 40 | Reading input 41 | Number of cells: 2075 42 | Number of bins: 570 43 | Setting style 44 | Reading clonemap 45 | Plotting RDR and mirrored BAF plots for 20 random cells in rbplot_mirrored.png 46 | Plotting clustered RDR plots for 20 random cells in crdr.png 47 | Plotting clustered-mirrored BAF plots for 20 random cells in cbaf.png 48 | Plotting read-depth ratios in rdrs.png 49 | Plotting B-allele frequencies in bafs.png 50 | Plotting total copy numbers in totalcn.png 51 | Plotting LOH in loh.png 52 | Plotting A-specific copy numbers in Aspecificcn.png 53 | Plotting B-specific copy numbers in Bspecificcn.png 54 | Plotting allele-specific copy numbers in allelecn.png 55 | Plotting haplotype-specific copy numbers in haplotypecn.png 56 | KTHKBYE! 
57 | exit 0 58 | -------------------------------------------------------------------------------- /tests/pseudonormal.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | echo 'Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary.' 5 | Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary. 6 | export BAM=data/cells.bam 7 | BAM=data/cells.bam 8 | : 9 | echo 'Downloading human reference genome, please be patient as downloading time may vary.' 10 | Downloading human reference genome, please be patient as downloading time may vary. 11 | export REF=data/hg19.fa 12 | REF=data/hg19.fa 13 | export DIC=data/hg19.dict 14 | DIC=data/hg19.dict 15 | : 16 | Parsing and checking arguments 17 | Arguments: 18 | chromosomes : chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 19 | reference : data/hg19.fa 20 | normal : pseudonormal.bam 21 | tmpdir : ./_TMP 22 | minreads : 100000 23 | binsize : 5000000 24 | tumor : data/cells.bam 25 | thres : 0.9 26 | 27 | Computing bins 28 | Counting reads on barcoded cells 29 | Computing total numbers of sequenced reads 30 | Selecting all cells to consider for the analysis 31 | Number of selected cells: 30 32 | Selecting diploid cells 33 | Number of identified diploid cells: 10 34 | Approximate sequencing coverage of pseudo matched-normal sample: 2.09471330866 35 | Extracting sequencing reads from selected diploid cells 36 | Merging and extracted sequencing reads and indexing the output pseduo matched-normal sample 37 | Removing temporary files 38 | KTHXBYE 39 | exit 0 40 | -------------------------------------------------------------------------------- /tests/pytests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | 5 | 
@pytest.fixture(scope='module')
def input_folder():
    """Directory holding the downloaded test input data.

    Resolution order: the $TEST_DIRECTORY environment variable when set
    (as in CI), otherwise the bundled tests/pytests/data/input folder
    next to this file.
    """
    bundled = os.path.join(os.path.dirname(__file__), 'data', 'input')
    return os.getenv('TEST_DIRECTORY', bundled)
"""Regression tests for the individual CHISEL pipeline steps.

Each test runs one pipeline entry point (RDR/BAF estimation, combining,
calling, cloning) on the downloaded input data or the checked-in
intermediate outputs, then compares the MD5 digest of the produced
table against a known-good value.
"""
import hashlib
import itertools
import os
import shutil
import tempfile

from chisel.BAFEstimator import main as baf_main
from chisel.Caller import main as caller_main
from chisel.Cloner import main as cloner_main
from chisel.Combiner import main as combiner_main
from chisel.RDREstimator import main as rdr_main

this_dir = os.path.dirname(__file__)
DATA_FOLDER = os.path.join(this_dir, 'data')


def _md5(path):
    """Return the hex MD5 digest of *path*.

    Reads through a context manager so the file descriptor is closed
    promptly (the previous inline open() calls leaked it until GC).
    """
    with open(path, 'rb') as fh:
        return hashlib.md5(fh.read()).hexdigest()


def test_baf(input_folder):
    with tempfile.NamedTemporaryFile('w') as f:
        # The complete chr6 phase data takes quite a while to process.
        # Here we test with just the first 1k rows of the phase data.
        with tempfile.NamedTemporaryFile('w') as phases_f:
            with open(os.path.join(input_folder, 'phases.tsv'), 'r') as phases_full:
                # islice stops cleanly if the file has fewer than 1000
                # lines; a next()-based comprehension would raise.
                phases_f.writelines(itertools.islice(phases_full, 1000))
            phases_f.flush()
            baf_main(args=[
                '-n', os.path.join(input_folder, 'normal.bam'),
                '-t', os.path.join(input_folder, 'cells.bam'),
                '-r', os.path.join(input_folder, 'hg19.fa'),
                '-j', '1',
                '-c', os.path.join(DATA_FOLDER, 'output', 'total.tsv'),
                '-l', phases_f.name
            ], stdout_file=f.name)

        assert _md5(f.name) == 'bb41cbc96c1f76020bb965f41a961884'


def test_call():
    with tempfile.NamedTemporaryFile('w') as f:
        caller_main(args=[
            os.path.join(DATA_FOLDER, 'output', 'combo.tsv'),
            '-j', '1',
            '--seed', '12'
        ], stdout_file=f.name)

        assert _md5(f.name) == 'be0b7472a1c8d6f8f96b9aa0fb8df3c9'


def test_clone():
    with tempfile.NamedTemporaryFile('w') as f:
        # The Cloner.main function overwrites the input file!
        # Make a temporary copy for the purpose of testing the call.
        with tempfile.NamedTemporaryFile('w') as f_calls:
            input_file = os.path.join(DATA_FOLDER, 'output', 'calls.tsv')

            shutil.copy(input_file, f_calls.name)
            cloner_main(args=[
                f_calls.name,
                '--seed', '12'
            ], stdout_file=f.name)

        assert _md5(f.name) == '2b94dde27bc7f5e2c9215dc1f890f5f7'


def test_combine():
    with tempfile.NamedTemporaryFile('w') as f:
        combiner_main(args=[
            '-r', os.path.join(DATA_FOLDER, 'output', 'rdr.tsv'),
            '-b', os.path.join(DATA_FOLDER, 'output', 'baf.tsv'),
            '-j', '1',
            '-s', '12'
        ], stdout_file=f.name)

        assert _md5(f.name) == '569c320780744a704a629989cb6d4a88'


def test_rdr(input_folder):
    with tempfile.NamedTemporaryFile('w') as f:
        rdr_main(args=[
            '-n', os.path.join(input_folder, 'normal.bam'),
            '-t', os.path.join(input_folder, 'cells.bam'),
            '-r', os.path.join(input_folder, 'hg19.fa'),
            '-b', '5Mb',
            '-m', '100000',
            '-c', 'chr6',
            '-j', '1'
        ], stdout_file=f.name)

        assert _md5(f.name) == 'eaf470105df0dcd8be7a995f1d6a8525'
-------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27 3 | 4 | [gh-actions] 5 | python = 6 | 2.7: py27 7 | 8 | [testenv] 9 | passenv = CI TRAVIS TRAVIS_* TEST_DIRECTORY 10 | 11 | deps = 12 | pytest 13 | 14 | commands = 15 | pytest tests/pytests --------------------------------------------------------------------------------