├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── demos ├── callingE │ └── demo-callingE.sh ├── cloningE │ └── demo-cloningE.sh ├── complete-nonormal │ └── demo-complete-nonormal.sh ├── complete │ └── demo-complete.sh ├── old_demos │ └── completeE.sh ├── plottingE │ └── demo-plottingE.sh └── pseudonormal │ └── demo-pseudonormal.sh ├── doc ├── chisel-calling.md ├── chisel-cartoon.png ├── chisel-cloning.md ├── chisel-plotting.md ├── chisel-pseudonormal.md └── chisel.md ├── guides ├── clones.md ├── clustering.md └── ploidy.md ├── install_full.sh ├── man ├── chisel-bedding.md ├── chisel-calling.md ├── chisel-cloning.md ├── chisel-plotting.md ├── chisel-prep.md ├── chisel-pseudonormal.md └── chisel.md ├── setup.py ├── src └── chisel │ ├── BAFEstimator.py │ ├── Caller.py │ ├── Cloner.py │ ├── Clusterizer.py │ ├── Combiner.py │ ├── Mutator.py │ ├── Plotter.py │ ├── RDREstimator.py │ ├── Utils.py │ ├── __init__.py │ ├── bin.awk │ ├── bin │ ├── __init__.py │ ├── chisel_bedding.py │ ├── chisel_calling.py │ ├── chisel_cloning.py │ ├── chisel_combocall.py │ ├── chisel_main.py │ ├── chisel_nonormal.py │ ├── chisel_nonormal_combocall.py │ ├── chisel_nonormal_preprocess.py │ ├── chisel_plotting.py │ ├── chisel_prep.py │ ├── chisel_preprocess.py │ ├── chisel_pseudonormal.py │ ├── chisel_rdr.py │ └── count.awk │ └── count.awk ├── tests ├── allchecks.sh ├── callingE.chk ├── cloningE.chk ├── complete.chk ├── plottingE.chk ├── pseudonormal.chk └── pytests │ ├── conftest.py │ ├── data │ ├── input │ │ ├── .gitignore │ │ └── README.md │ └── output │ │ ├── baf.tsv │ │ ├── calls.tsv │ │ ├── combo.tsv │ │ ├── rdr.tsv │ │ └── total.tsv │ ├── test_baf.py │ ├── test_call.py │ ├── test_clone.py │ ├── test_combine.py │ └── test_rdr.py └── tox.ini /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master, develop, py3, ci ] 6 | pull_request: 7 | 
branches: [ master, develop, py3, ci ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python: [2.7.18] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | with: 20 | lfs: false 21 | 22 | - name: Cache Testing Data 23 | id: cache-test-data 24 | uses: actions/cache@v2 25 | with: 26 | path: testdata 27 | key: testdata 28 | 29 | - name: Download Testing Data 30 | if: steps.cache-test-data.outputs.cache-hit != 'true' 31 | run: | 32 | pip3 install wheel 33 | pip3 install setuptools 34 | pip3 install zenodo-get 35 | python3 -m zenodo_get 10.5281/zenodo.3950299 --output-dir=testdata 36 | 37 | - name: Set Testing Data Envvar 38 | run: | 39 | echo "TEST_DIRECTORY=$(realpath testdata)" >> $GITHUB_ENV 40 | 41 | - name: Install SAMtools 42 | run: | 43 | wget https://sourceforge.net/projects/samtools/files/samtools/1.7/samtools-1.7.tar.bz2/download -O samtools-1.7.tar.bz2 44 | tar xvjf samtools-1.7.tar.bz2 45 | (cd samtools-1.7 && ./configure && make) 46 | echo $(realpath samtools-1.7) >> $GITHUB_PATH 47 | 48 | - name: Install BCFTools 49 | run: | 50 | wget https://sourceforge.net/projects/samtools/files/samtools/1.7/bcftools-1.7.tar.bz2/download -O bcftools-1.7.tar.bz2 51 | tar xvjf bcftools-1.7.tar.bz2 52 | (cd bcftools-1.7 && ./configure && make) 53 | echo $(realpath bcftools-1.7) >> $GITHUB_PATH 54 | 55 | - name: Download and index hg19.fa 56 | run: | 57 | wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz --directory-prefix=${TEST_DIRECTORY} 58 | (cd ${TEST_DIRECTORY} && gunzip -df hg19.fa.gz && samtools faidx hg19.fa && samtools dict hg19.fa > hg19.dict) 59 | 60 | - name: Setup Python 61 | uses: actions/setup-python@v2 62 | with: 63 | python-version: ${{ matrix.python }} 64 | 65 | - name: Install Tox and any other packages 66 | run: | 67 | python -m pip install coverage tox tox-gh-actions 68 | 69 | - name: Test with tox 70 | run: | 71 | tox 72 | env: 73 | PLATFORM: ${{ matrix.python }} 74 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Princeton University 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /demos/callingE/demo-callingE.sh: -------------------------------------------------------------------------------- 1 | # Demo for WGS data from a cancer patient 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the CHISEL pipeline starting from the computed RDRs and BAFs (typically the file `combo.tsv` in the folder `combo`) for tumor section E of breast cancer patient S0. Simply run this file through BASH as a standard script to run the complete demo. The demo represent a guided example for the command `chisel-calling` which allows to re-run the inference of copy numbers and can be used to try different parameters, especially related to the inference of tumor ploidy. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 
11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required RDRs and BAFs already computed by the complete CHISEL pipeline in `data` folder. 29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading RDRs and BAFs computed by CHISEL for tumor section E 35 | curl -L https://github.com/raphael-group/chisel-data/raw/master/demos/callingE/combo.tsv.gz > data/combo.tsv.gz 36 | gzip -df data/combo.tsv.gz 37 | export INPUT="data/combo.tsv" 38 | :<<'```shell' # Ignore this line 39 | ``` 40 | 41 | ## Run CHISEL 42 | 43 | We now run the command CHISEL command that starts from the inference of copy numbers from RDRs and BAFs. 44 | 45 | ```shell 46 | chisel_calling ${INPUT} --seed 25 47 | exit $? 48 | ``` 49 | -------------------------------------------------------------------------------- /demos/cloningE/demo-cloningE.sh: -------------------------------------------------------------------------------- 1 | # Demo for WGS data from a cancer patient 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the CHISEL pipeline starting from the inferred copy numbers (typically the file `calls.tsv` in the folder `calls`) for tumor section E of breast cancer patient S0, and thus identifies the clones and produces the corresponding plots. Simply run this file through BASH as a standard script to run the complete demo. 
The demo represent a guided example for the command `chisel-cloning` which allows to re-run the inference of clones and can be used to try different parameters to explore different solutions and clustering of cells.. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required inferred copy numbers already computed by the complete CHISEL pipeline in `data` folder. 29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading copy numbers inferred by CHISEL for tumor section E 35 | curl -L https://github.com/raphael-group/chisel-data/raw/master/demos/cloningE/calls.tsv.gz > data/calls.tsv.gz 36 | gzip -df data/calls.tsv.gz 37 | export INPUT="data/calls.tsv" 38 | :<<'```shell' # Ignore this line 39 | ``` 40 | 41 | ## Run CHISEL 42 | 43 | We now run the command CHISEL command that starts from the inference of copy numbers from RDRs and BAFs. 44 | 45 | ```shell 46 | chisel_cloning ${INPUT} --seed 25 47 | exit $? 
48 | ``` 49 | -------------------------------------------------------------------------------- /demos/complete-nonormal/demo-complete-nonormal.sh: -------------------------------------------------------------------------------- 1 | # Complete demo of CHISEL in nonormal mode 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the complete CHISEL pipeline in nonormal mode starting from an exemplary barcoded [BAM file](https://doi.org/10.5281/zenodo.3950299) publicly available. From this directory, simply run this file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. If the custom installation was used, please make sure that you can succesfully run the command `chisel` as well as the required `samtools`, `bcftools`, and `awk`. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required barcoded single-cell and matched-normal BAM files in `data` folder. 
29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading tumor barcoded BAM file 35 | echo "Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary." 36 | curl -L 'https://zenodo.org/record/3950299/files/cells.bam?download=1' > data/cells.bam 37 | curl -L 'https://zenodo.org/record/3950299/files/cells.bam.bai?download=1' > data/cells.bam.bai 38 | export TUM="data/cells.bam" 39 | :<<'```shell' # Ignore this line 40 | ``` 41 | 42 | Next the corresponding reference genome is downloaded and unpacked. Also, the required indexes are generated. 43 | 44 | ```shell 45 | echo "Downloading human reference genome, please be patient as downloading time may vary." 46 | curl -L https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz | gzip -d > data/hg19.fa 47 | samtools faidx data/hg19.fa 48 | samtools dict data/hg19.fa > data/hg19.dict 49 | bwa index data/hg19.fa 50 | export REF="data/hg19.fa" 51 | export DIC="data/hg19.dict" 52 | :<<'```shell' # Ignore this line 53 | ``` 54 | 55 | Last, we download the pre-computed list of phased germline SNPs. Note that differently from the [one](https://github.com/raphael-group/chisel-data/raw/master/demos/completeE/phased.HRC.vcf.gz) obtained through the reccommended instructions (i.e. using BCFtools to call germline SNPs and Eagle2 throught the Michigan Imputation Serverve with HRC panel to phase the SNPs) this file only contains the lables `0|1` or `1|0` for every SNP, which is the minimum requirement for CHISEL. 56 | 57 | ```shell 58 | curl -L 'https://zenodo.org/record/3950299/files/phases.tsv?download=1' > data/phases.tsv 59 | export PHA="data/phases.tsv" 60 | :<<'```shell' # Ignore this line 61 | ``` 62 | 63 | ## Run CHISEL 64 | 65 | We now run the complete pipeline of CHISEL with the corresponding command `chisel_nonormal`. 66 | 67 | ```shell 68 | chisel_nonormal -t ${TUM} -r ${REF} -l ${PHA} --simcov 0.2 --seed 12 69 | exit $? 
70 | ``` 71 | -------------------------------------------------------------------------------- /demos/complete/demo-complete.sh: -------------------------------------------------------------------------------- 1 | # Complete demo of CHISEL 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the complete CHISEL pipeline starting from an exemplary barcoded [BAM file](https://doi.org/10.5281/zenodo.3950299) publicly available. From this directory, simply run this file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. If the custom installation was used, please make sure that you can succesfully run the command `chisel` as well as the required `samtools`, `bcftools`, and `awk`. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required barcoded single-cell and matched-normal BAM files in `data` folder. 
29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading tumor barcoded BAM file 35 | echo "Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary." 36 | curl -L 'https://zenodo.org/record/3950299/files/cells.bam?download=1' > data/cells.bam 37 | curl -L 'https://zenodo.org/record/3950299/files/cells.bam.bai?download=1' > data/cells.bam.bai 38 | export TUM="data/cells.bam" 39 | 40 | # Downloading matched-normal BAM file 41 | echo "Downloading matched-normal BAM file from Zenodo, please be patient as downloading time may vary." 42 | curl -L 'https://zenodo.org/record/3950299/files/normal.bam?download=1' > data/normal.bam 43 | curl -L 'https://zenodo.org/record/3950299/files/normal.bam.bai?download=1' > data/normal.bam.bai 44 | export NOR="data/normal.bam" 45 | :<<'```shell' # Ignore this line 46 | ``` 47 | 48 | Next the corresponding reference genome is downloaded and unpacked 49 | 50 | ```shell 51 | echo "Downloading human reference genome, please be patient as downloading time may vary." 52 | curl -L https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz | gzip -d > data/hg19.fa 53 | samtools faidx data/hg19.fa 54 | samtools dict data/hg19.fa > data/hg19.dict 55 | export REF="data/hg19.fa" 56 | export DIC="data/hg19.dict" 57 | :<<'```shell' # Ignore this line 58 | ``` 59 | 60 | Last, we download the pre-computed list of phased germline SNPs. Note that differently from the [one](https://github.com/raphael-group/chisel-data/raw/master/demos/completeE/phased.HRC.vcf.gz) obtained through the reccommended instructions (i.e. using BCFtools to call germline SNPs and Eagle2 throught the Michigan Imputation Serverve with HRC panel to phase the SNPs) this file only contains the lables `0|1` or `1|0` for every SNP, which is the minimum requirement for CHISEL. 
61 | 62 | ```shell 63 | curl -L 'https://zenodo.org/record/3950299/files/phases.tsv?download=1' > data/phases.tsv 64 | export PHA="data/phases.tsv" 65 | :<<'```shell' # Ignore this line 66 | ``` 67 | 68 | ## Run CHISEL 69 | 70 | We now run the complete pipeline of CHISEL with the corresponding command `chisel`. 71 | 72 | ```shell 73 | chisel -t ${TUM} -n ${NOR} -r ${REF} -l ${PHA} --seed 12 74 | exit $? 75 | ``` 76 | -------------------------------------------------------------------------------- /demos/old_demos/completeE.sh: -------------------------------------------------------------------------------- 1 | # Demo for WGS data from a cancer patient 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the complete CHISEL pipeline starting from the barcoded [BAM file](https://support.10xgenomics.com/single-cell-dna/datasets/1.0.0/breast_tissue_E_2k) publicly available from 10X Genomics archive and obtained through 10X Chromium Single Cell CNV Solution for section E of a breast tumor. Simply run this file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed, such that the python environment called by the command `python2.7` has the required packages, and both `samtools` and `awk` are available in `${PATH}`. 
9 | 10 | ```shell 11 | export CHISEL_HOME="../../" # This is CHISEL home by default, update if needed 12 | :<<'```shell' # Ignore this line 13 | ``` 14 | 15 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 16 | ```shell 17 | set -e 18 | set -o xtrace 19 | PS4='[\t]' 20 | :<<'```shell' # Ignore this line 21 | ``` 22 | 23 | ## Downloading of data 24 | 25 | The demo auomatically downloads the required barcoded single-cell and matched-normal BAM files from 10X Genomics archive through the following commands in `data` folder. 26 | 27 | ```shell 28 | # Creating data folder 29 | mkdir -p data 30 | 31 | # Downloading barcoded single-cell BAM of breast tumor section E 32 | wget -N -c http://s3-us-west-2.amazonaws.com/10x.files/samples/cell-dna/1.0.0/breast_tissue_E_2k/breast_tissue_E_2k_possorted_bam.bam -P data/ 33 | wget -N -c http://cf.10xgenomics.com/samples/cell-dna/1.0.0/breast_tissue_E_2k/breast_tissue_E_2k_possorted_bam.bam.bai -P data/ 34 | export TUM="data/breast_tissue_E_2k_possorted_bam.bam" 35 | 36 | # Downloading matched-normal BAM file as section A 37 | wget -N -c http://s3-us-west-2.amazonaws.com/10x.files/samples/cell-dna/1.0.0/breast_tissue_A_2k/breast_tissue_A_2k_possorted_bam.bam -P data/ 38 | wget -N -c http://cf.10xgenomics.com/samples/cell-dna/1.0.0/breast_tissue_A_2k/breast_tissue_A_2k_possorted_bam.bam.bai -P data/ 39 | export NOR="data/breast_tissue_A_2k_possorted_bam.bam" 40 | :<<'```shell' # Ignore this line 41 | ``` 42 | 43 | Next the corresponding reference genome is downloaded and unpacked 44 | 45 | ```shell 46 | export REF="data/refdata-GRCh38-2.1.0/fasta/genome.fa" 47 | export DIC="data/refdata-GRCh38-2.1.0/fasta/genome.dict" 48 | if [[ ! -f "${REF}" || ! 
-f "${DIC}" ]]; then 49 | wget -N -c http://cf.10xgenomics.com/supp/genome/refdata-GRCh38-2.1.0.tar.gz -P data/ 50 | tar -xzvf data/refdata-GRCh38-2.1.0.tar.gz -C data/ 51 | rm -f data/refdata-GRCh38-2.1.0.tar.gz 52 | fi 53 | :<<'```shell' # Ignore this line 54 | ``` 55 | 56 | Last, we download the pre-computed VCF with phased SNPs; the VCF has been computed following the reccommended instructions, using BCFtools to call germline SNPs and Eagle2 throught the Michigan Imputation Serverve with HRC panel to phase the SNPs. 57 | 58 | ```shell 59 | wget -N -c https://github.com/raphael-group/chisel-data/raw/master/demos/completeE/phased.HRC.vcf.gz -P data/ 60 | gzip -f -d data/phased.HRC.vcf.gz 61 | export PHA="data/phased.HRC.vcf" 62 | :<<'```shell' # Ignore this line 63 | ``` 64 | 65 | ## Run CHISEL 66 | 67 | We now run the complete pipeline of CHISEL with the corresponding command `chisel`. 68 | 69 | ```shell 70 | chisel -t ${TUM} -n ${NOR} -r ${REF} -l ${PHA} --seed 25 71 | exit $? 72 | ``` 73 | 74 | 75 | -------------------------------------------------------------------------------- /demos/plottingE/demo-plottingE.sh: -------------------------------------------------------------------------------- 1 | # Demo for WGS data from a cancer patient 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the CHISEL pipeline starting from the inferred copy numbers (typically the file `calls.tsv` in the folder `calls`) and identified clones (typically the file `mapping.tsv` in the folder `clones`) for tumor section E of breast cancer patient S0, and thus produces the corresponding plots. The demo represent a guided example for the command `chisel-plotting` which allows to re-run the plot generation and can be used to try different parameters to obtain the best format for the results. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. 
The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required inferred copy numbers already computed by the complete CHISEL pipeline in `data` folder. 29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading copy numbers and clones inferred by CHISEL for tumor section E 35 | curl -L https://github.com/raphael-group/chisel-data/raw/master/demos/cloningE/calls.tsv.gz > data/calls.tsv.gz 36 | gzip -df data/calls.tsv.gz 37 | export INPUT="data/calls.tsv" 38 | 39 | curl -L https://github.com/raphael-group/chisel-data/raw/master/demos/plottingE/mapping.tsv.gz > data/mapping.tsv.gz 40 | gzip -df data/mapping.tsv.gz 41 | export MAPP="data/mapping.tsv" 42 | :<<'```shell' # Ignore this line 43 | ``` 44 | 45 | ## Run CHISEL 46 | 47 | We now run the command CHISEL command that starts from the inference of copy numbers from RDRs and BAFs. 48 | 49 | ```shell 50 | chisel_plotting ${INPUT} -m ${MAPP} 51 | exit $? 
52 | ``` 53 | -------------------------------------------------------------------------------- /demos/pseudonormal/demo-pseudonormal.sh: -------------------------------------------------------------------------------- 1 | # Demo for generating pseudo matched-normal sample 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | The following CHISEL demo represents a guided example of the CHISEL command for generating a pseudo matched-normal sample starting from an exemplary barcoded [BAM file](https://doi.org/10.5281/zenodo.3952985) publicly available. Simply run this file through BASH as a standard script to run the complete demo. The demo can also be considered as a guided example of a complete execution and is correspondingly commented. 5 | 6 | ## Requirements and set up 7 | 8 | The demo requires that CHISEL has been succesfully installed with conda. If the custom installation was used, please make sure that you can succesfully run the command `chisel_pseudonormal` as well as the required `samtools`, `bcftools`, and `awk`. The demo includes the downloading of all the required files and will terminate in <20 minutes on machine with minimum requirements satisfied. 9 | 10 | We gurantee that the running directory in the same directory of the demo and we remove previous results. 11 | 12 | ```shell 13 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 14 | rm -rf rdr/ baf/ combo/ calls/ clones/ plots/ 15 | :<<'```shell' # Ignore this line 16 | ``` 17 | 18 | We also ask the demo to terminate in case of errors and to print a trace of the execution by the following commands 19 | ```shell 20 | set -e 21 | set -o xtrace 22 | PS4='[\t]' 23 | :<<'```shell' # Ignore this line 24 | ``` 25 | 26 | ## Downloading of data 27 | 28 | The demo auomatically downloads the required barcoded single-cell BAM file from 10X Genomics archive through the following commands in `data` folder. 
29 | 30 | ```shell 31 | # Creating data folder 32 | mkdir -p data 33 | 34 | # Downloading tumor barcoded BAM file 35 | echo "Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary." 36 | curl -L https://zenodo.org/record/3952985/files/cells.bam?download=1 > data/cells.bam 37 | curl -L https://zenodo.org/record/3952985/files/cells.bam.bai?download=1 > data/cells.bam.bai 38 | export BAM="data/cells.bam" 39 | :<<'```shell' # Ignore this line 40 | ``` 41 | 42 | Last, the corresponding reference genome is downloaded and unpacked 43 | 44 | ```shell 45 | echo "Downloading human reference genome, please be patient as downloading time may vary." 46 | curl -L https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz | gzip -d > data/hg19.fa 47 | samtools faidx data/hg19.fa 48 | samtools dict data/hg19.fa > data/hg19.dict 49 | export REF="data/hg19.fa" 50 | export DIC="data/hg19.dict" 51 | :<<'```shell' # Ignore this line 52 | ``` 53 | 54 | ## Run CHISEL 55 | 56 | We now run the command `chisel_pseudonormal` of CHISEL for generating a pseudo mathched-normal sample by extracting the sequencing reads from diploid cells in the provided barcoded BAM file `${BAM}`. 57 | Specifically, we are required to specify the reference genome `${REF}` and we use the default values of all parameters. 58 | By default, temporary files and the sorted and indexed output BAM `pseudonormal.bam` will be generated in the current directory. 59 | 60 | ```shell 61 | chisel_pseudonormal ${BAM} -r ${REF} 62 | exit $? 63 | ``` 64 | -------------------------------------------------------------------------------- /doc/chisel-calling.md: -------------------------------------------------------------------------------- 1 | # Command `chisel-calling.py` 2 | 3 | The CHISEL command `chisel-calling.py` runs the CHISEL pipeline starting from the already estimated RDRs and BAFs. 
4 | To do this, this command requires to have the folder `combo` with the files and formats described [here](chisel.md). 5 | This command is particularly useful if the user would like to re-run the CHISEL pipeline without the extensive re-computation of RDRs and BAFs but using different values of some of the main parameters, including: 6 | 7 | 1. `-A`: varying sensitivity of the model selection criterion for cell ploidy: in case of particularly noisy datasets or with particularly high variance, the estimation of cell ploidy may be more challenging and it may be needed to increase the sensitivity of the selection (e.g. 2, 3, 4, ...); 8 | 2. `-K`: varying the maximum number of clusters allowed in the global clustering of RDRs and BAFs: choosing values lower than the default (i.e. 100) generally allows to reduce presence of noisy CNAs at the cost of lower resolution; 9 | 3. `-P`: varying the maximum value allowed for cell ploidy, since the default is 4 which generally corresponds to at most one WGD. 10 | 11 | -------------------------------------------------------------------------------- /doc/chisel-cartoon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raphael-group/chisel/24e227df0aba769c4be241bcb8548575b93a9556/doc/chisel-cartoon.png -------------------------------------------------------------------------------- /doc/chisel-cloning.md: -------------------------------------------------------------------------------- 1 | # Command `chisel-cloning.py` 2 | 3 | The CHISEL command `chisel-cloning.py` runs the CHISEL pipeline starting from the already estimated allele- and haplotype-specific copy numbers. 4 | To do this, this command requires to have the folder `calls` with the files and formats described [here](chisel.md). 
5 | This command is particularly useful if the user would like to re-run CHISEL's inference of tumor clones to adapt to datasets with particularly high levels of noise and variance. 6 | Examples of usage of this command for QC are described [here](../guides/clones.md). 7 | -------------------------------------------------------------------------------- /doc/chisel-plotting.md: -------------------------------------------------------------------------------- 1 | # Command `chisel-plotting.py` 2 | 3 | The CHISEL command `chisel-plotting.py` generates several useful plots, which can be used to inspect the inferred results or for quality control. 4 | More specifically, this command generates 15 plots. 5 | 6 | ## Main plots 7 | 8 | ### Allele-specific copy numbers 9 | 10 | This plot (`allelecn.png`) depicts the allele-specific copy numbers inferred by CHISEL for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 11 | The colors of the heatmap represent the different pairs of allele-specific copy numbers and a full description of the color map used is available in the [CHISEL manuscript](https://doi.org/10.1101/837195). 12 | 13 | ### Corrected allele-specific copy numbers 14 | 15 | This plot (`allelecn-corrected.png`) depicts the allele-specific copy numbers inferred by CHISEL and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 16 | The colors of the heatmap represent the different pairs of allele-specific copy numbers and a full description of the color map used is available in the [CHISEL manuscript](https://doi.org/10.1101/837195). 
17 | 18 | ### Haplotype-specific copy numbers 19 | 20 | This plot (`haplotypecn.png`) depicts the haplotype-specific copy numbers inferred by CHISEL for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 21 | The colors of the heatmap represent the haplotype of the allele with fewer copies, such that green and magenta represent haplotype A and B, respectively. 22 | Note that balanced regions (allele with the same number of copies) are represented in white. 23 | Further descriptions of the color map used are available in the [CHISEL manuscript](https://doi.org/10.1101/837195). 24 | 25 | ### Corrected haplotype-specific copy numbers 26 | 27 | This plot (`haplotypecn-corrected.png`) depicts the haplotype-specific copy numbers inferred by CHISEL and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 28 | The colors of the heatmap represent the haplotype of the allele with fewer copies, such that green and magenta represent haplotype A and B, respectively. 29 | Note that balanced regions (allele with the same number of copies) are represented in white. 30 | Further descriptions of the color map used are available in the [CHISEL manuscript](https://doi.org/10.1101/837195). 31 | 32 | ## Useful additional plots 33 | 34 | ### BAF and RDR plots 35 | 36 | This plot (`rbplot_mirrored.png`) shows the global clusters of RDRs and BAFs inferred for a random sample of a certain number of cells (by default 20 cells). 
37 | Each plot corresponds to a different cell, with each plot depicting the bins (each point) which are represented by the corresponding values of `|0.5 - mirrored BAF|` (x-axis) and RDR (y-axis), and are colored according to the corresponding cluster; note that colors are consistent across cells. 38 | 39 | ### Clustered RDR 40 | 41 | This plot (`crdr.png`) shows the estimated RDR and their cluster for a random sample of a certain number of cells (by default 20 cells). 42 | Each plot corresponds to a different cell, with each plot depicting the bins (each point) which are represented by the corresponding values of RDR (y-axis) along the entire genome (x-axis) and are colored according to the corresponding cluster; note that colors are consistent across cells. 43 | 44 | ### Clustered mirrored BAF 45 | 46 | This plot (`cbaf.png`) shows the estimated BAF and their cluster for a random sample of a certain number of cells (by default 20 cells). 47 | Each plot corresponds to a different cell, with each plot depicting the bins (each point) which are represented by the corresponding values of `|0.5 - mirrored BAF|` (y-axis) along the entire genome (x-axis) and are colored according to the corresponding cluster; note that colors are consistent across cells. 48 | 49 | ### Total copy numbers 50 | 51 | This plot (`totalcn.png`) is an heatmap that shows the total copy numbers inferred by CHISEL for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 52 | Each point of the heatmap this represents the total copy numbers inferred by CHISEL, such that grey represents 2 copies, blue colors represent <2 copies with darker colors corresponding to smaller values, and red colors represent >2 copies with darker colors corresponding to higher values. 
53 | 54 | ### Corrected total copy numbers 55 | 56 | This plot (`totalcn-corrected.png`) is an heatmap that shows the total copy numbers inferred by CHISEL and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 57 | Each point of the heatmap this represents the total copy numbers inferred by CHISEL, such that grey represents 2 copies, blue colors represent <2 copies with darker colors corresponding to smaller values, and red colors represent >2 copies with darker colors corresponding to higher values. 58 | 59 | ### LOH 60 | 61 | This plot (`loh.png`) is an heatmap that shows the LOH inferred by CHISEL for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 62 | Each point of the heatmap is colored according to the absence (white) or presence (black) of a LOH in the corresponding cell and bin. 63 | 64 | ### Corrected LOH 65 | 66 | This plot (`loh-corrected.png`) is an heatmap that shows the LOH inferred by CHISEL and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 67 | Each point of the heatmap is colored according to the absence (white) or presence (black) of a LOH in the corresponding cell and bin. 68 | 69 | ### A-specific copy numbers 70 | 71 | This plot (`Aspecificcn.png`) is an heatmap that shows the copy numbers inferred by CHISEL for haplotype A for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 72 | The colors of the heatmap are the same for the total copy-numbers (see above). 
73 | 74 | ### Corrected A-specific copy numbers 75 | 76 | This plot (`Aspecificcn-corrected.png`) is an heatmap that shows the copy numbers inferred by CHISEL for haplotype A and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 77 | The colors of the heatmap are the same for the total copy-numbers (see above). 78 | 79 | ### B-specific copy numbers 80 | 81 | This plot (`Bspecificcn.png`) is an heatmap that shows the copy numbers inferred by CHISEL for haplotype B for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 82 | The colors of the heatmap are the same for the total copy-numbers (see above). 83 | 84 | ### Corrected B-specific copy numbers 85 | 86 | This plot (`Bspecificcn-corrected.png`) is an heatmap that shows the copy numbers inferred by CHISEL for haplotype B and corrected using the inferred clones for every cell (rows) along the entire genome (y-axis), with cells/rows colored according to the inferred clone and columns/genomic bins colored according to the chromosome. 87 | The colors of the heatmap are the same for the total copy-numbers (see above). 88 | 89 | 90 | -------------------------------------------------------------------------------- /doc/chisel-pseudonormal.md: -------------------------------------------------------------------------------- 1 | # Command `chisel-pseudonormal.py` 2 | 3 | The CHISEL command `chisel-pseudonormal.py` implements the method integrated in CHISEL for generating a pseudo matched-normal sample by extracting diploid cells from a barcoded BAM file. 4 | The command simply required as input a barcoded BAM file and the corresponding reference genome; detailed descriptions of the required input are available [here](../man/chisel-pseudonormal.md). 
5 | After the execution, the command generates a new BAM file by only merging sequencing reads from diploid cells; thus the resulting BAM file can be used as a pseudo matched-normal sample to run the entire [CHISEL's pipeline](chisel.md). -------------------------------------------------------------------------------- /doc/chisel.md: -------------------------------------------------------------------------------- 1 | # Command `chisel` 2 | 3 | The CHISEL command `chisel` as well as the command `chisel_nonormal` runs the entire CHISEL pipeline starting from the required inputs (e.g. BAM files). 4 | During the execution, the command creates six folders which contain the temporary and final results produced by the 5 distinct steps of CHISEL. 5 | 6 | ## Estimating RDRs 7 | 8 | This step aims to estimate the RDR for every genomic bin in each cell. 9 | Moreover, it selects the barcodes that correspond to cells using a specified threshold on the minimum number of reads. 10 | This step creates a folder `rdr` with three files: 11 | 12 | 1. `total.tsv`: a TSV dataframe containing the number of sequencing reads observed for every selected cell. More specifically, the fields are: 13 | 1. `CELL`: the name of a cell or the name `normal` indicating the matched-normal sample; 14 | 2. `TOTAL`: the total number of sequencing reads observed for the cell. 15 | 2. `rdr.tsv`: a TSV dataframe containing the estimated RDRs with the following fields: 16 | 1. `CHROMOSOME`: the name of a chromosome; 17 | 2. `START`: the starting coordinate of a genomic bin; 18 | 3. `END`: the ending coordinate of the genomic bin; 19 | 4. `CELL`: the name of a cell; 20 | 5. `NORMAL`: the number of sequencing reads from the matched-normal sample for the bin; 21 | 5. `COUNT`: the number of sequencing reads from the cell `CELL` in the bin; 22 | 6. `RDR`: the estimated RDR. 23 | 3. `log`: a logging file of the execution of this step (optional). 
24 | 25 | ## Estimating BAF 26 | 27 | This step aims to estimate the BAF for phased germline heterozygous SNPs in the selected cells. 28 | This step creates a folder `baf` with two files: 29 | 30 | 1. `baf.tsv`: a TSV dataframe with the following fields: 31 | 1. `CHROMOSOME`: the name of a chromosome; 32 | 2. `POS`: a genomic position in the chromosome `CHROMOSOME` for a germline heterozygous SNP; 33 | 3. `CELL`: the name of a cell; 34 | 4. `A-COUNT`: the number of observed sequencing reads from the haplotype A of the SNP; 35 | 4. `B-COUNT`: the number of observed sequencing reads from the haplotype B of the SNP. 36 | 2. `log`: a logging file of the execution of this step (optional). 37 | 38 | ## Combining RDRs and BAFs 39 | 40 | This step aims to combine the RDRs and BAFs for the selected bins in the selected cells. 41 | This step creates a folder `combo` with two files: 42 | 43 | 1. `combo.tsv`: a TSV dataframe with the following fields: 44 | 1. `CHROMOSOME`: the name of a chromosome; 45 | 2. `START`: the starting coordinate of a genomic bin; 46 | 3. `END`: the ending coordinate of the genomic bin; 47 | 4. `CELL`: the name of a cell; 48 | 5. `NORMAL`: the number of sequencing reads from the matched-normal sample for the bin; 49 | 6. `COUNT`: the number of sequencing reads from the cell `CELL` in the bin; 50 | 7. `RDR`: the estimated RDR for the bin in the cell `CELL`; 51 | 8. `A-COUNT`: the number of observed sequencing reads from the haplotype A of the SNP; 52 | 9. `B-COUNT`: the number of observed sequencing reads from the haplotype B of the SNP; 53 | 10. `BAF`: the B-allele frequency estimated for the bin in the cell `CELL`. 54 | 2. `log`: a logging file of the execution of this step (optional). 55 | 56 | ## Calling 57 | 58 | This step aims to infer the ploidy of each cell and, after global clustering of RDRs and BAFs, to infer the allele- and haplotype-specific copy numbers for every bin in every cell. 
59 | This step creates a folder `calls` with two files: 60 | 61 | 1. `calls.tsv`: a TSV dataframe with the following fields: 62 | 1. `CHROMOSOME`: the name of a chromosome; 63 | 2. `START`: the starting coordinate of a genomic bin; 64 | 3. `END`: the ending coordinate of the genomic bin; 65 | 4. `CELL`: the name of a cell; 66 | 5. `NORMAL`: the number of sequencing reads from the matched-normal sample for the bin; 67 | 6. `COUNT`: the number of sequencing reads from the cell `CELL` in the bin; 68 | 7. `RDR`: the estimated RDR for the bin in the cell `CELL`; 69 | 8. `A-COUNT`: the number of observed sequencing reads from the haplotype A of the SNP; 70 | 9. `B-COUNT`: the number of observed sequencing reads from the haplotype B of the SNP; 71 | 10. `BAF`: the B-allele frequency estimated for the bin in the cell `CELL`; 72 | 11. `ALLELECN`: dash-separated ordered pair of the inferred haplotype-specific copy numbers for the bin in the cell `CELL`. 73 | 2. `log`: a logging file of the execution of this step (optional). 74 | 75 | ## Cloning 76 | 77 | This step aims to infer the clones by clustering cells based on the inferred haplotype-specific copy numbers and selecting the clusters that correspond to actual clones. 78 | This step creates a folder `clones` with two files: 79 | 80 | 1. `mapping.tsv`: a TSV dataframe with the following fields: 81 | 1. `CELL`: the name of a selected cell; 82 | 2. `CLUSTER`: the cluster where the cell `CELL` has been assigned; 83 | 3. `CLONE`: the clone of the cell `CELL`, however it corresponds to `None` if the cell is classified as noisy. 84 | 2. `log`: a logging file of the execution of this step (optional). 85 | 86 | Moreover, this step introduces a new field (right-most field) in the file `calls.tsv` which is `CORRECTED_CNS` and corresponds to the final haplotype-specific copy numbers estimated after consensus of cells in the same clone. 
87 | 88 | ## Plotting 89 | 90 | This step generates several useful plots about the results, which are fully described [here](chisel-plotting.md). 91 | -------------------------------------------------------------------------------- /guides/clones.md: -------------------------------------------------------------------------------- 1 | # Identification of clones 2 | 3 | CHISEL infers clones by clustering cells based on the inferred haplotype-specific copy numbers and selecting the clusters that correspond to actual clones. 4 | This selection is indeed required because the data are noisy and minor differences between cells may indicate errors as well as small clusters may indicate noisy cells with bad sequencing. 5 | This identification is controlled by two parameters: 6 | - `f`: the maximum fraction of the genome with different haplotype-specific copy numbers for cells in the same clone (default: 0.06); 7 | - `s`: the minimum number of cells in a clone (default: 14). 8 | 9 | The values of these two parameters have been calibrated for the expected number of cells and sequencing coverage of 10X Genomics datasets. 10 | However, when analyzing datasets with different number of cells, different sequencing coverage, or particularly noisy datasets, the default values of these parameters may not be appropriate. 11 | Therefore, when the user observes an outlying high number of noisy cells or too few inferred clones (even 0), it is important to vary these values to explore different solutions. 12 | 13 | Given the inferred clones with the previous parameters, there is one additional parameter that can be used to adjust the classification of noisy cells: `-r`, which controls the refinement of the identified clones and allows the user to include more "noisy" cells into the identified clones. Specifically, every cell that has a fraction of the genome with different haplotype-specific copy numbers lower than the value of `r` will be included into the clones. 
Therefore, the user can use increasingly higher values to force the inclusion of more noisy cells into the inferred clones, for example `-r 0.2`, `-r 0.3`, `-r 0.4`, etc. Note that `-r 1` will force every cell to be assigned to a clone. 14 | 15 | These tasks can be performed very efficiently and easily by using the [CHISEL command `chisel-cloning.py`](../doc/chisel-cloning.md), which allows the user to only re-execute the inference of clones and the generation of plots very efficiently from the already inferred haplotype-specific copy numbers. 16 | As such, using this command, the user can attempt to use different combinations of the parameters, varying the maximum difference `f` (e.g. `-f 0.1`, `-f 0.12`, `-f 0.15`, ...) and the minimum number `-s` of cells to select the clones (either increasing like `-s 20`, `-s 30`, ... or decreasing like `-s 3`, `-s 2`, according to the total number of cells). 17 | More details on adjusting and selecting reasonable values of these parameters are available in the [CHISEL's manuscript](https://doi.org/10.1101/837195). 18 | -------------------------------------------------------------------------------- /guides/clustering.md: -------------------------------------------------------------------------------- 1 | # Clustering 2 | 3 | CHISEL globally clusters the estimated RDRs and BAFs by using a k-means algorithm and model-selection criterion based on the elbow method to select the best number of clusters (further details are reported in the [CHISEL's manuscript](https://doi.org/10.1101/837195)). 4 | In order to do this, CHISEL fixes the maximum number of clusters (default value is 100). 5 | However, the value can be too high when analyzing very noisy datasets since the high levels of noise in the data can be misinterpreted and may lead to overfitting. 
6 | Therefore, the user can assess the levels of variance and noise by using the [BAF and RDR plots](../doc/chisel-plotting.md): in particular, high levels of noise can be immediately noted when a clear clustering structure is missing from such plots. 7 | As such, when analyzing datasets with very high levels of variance, the user can lower the maximum number of clusters to avoid overfitting. 8 | Another possible signal that may indicate overfitting is for example the inference of many outlying and noisy CNAs, i.e. observing many cells with isolated and small CNAs. 9 | For such QC purposes, the user can use the CHISEL [command `chisel-calling.py`](../doc/chisel-calling.md) to vary the maximum number of clusters with the argument `-K` to re-run the CHISEL inference without the need of re-estimating RDRs and BAFs (which generally is the most time-consuming step). 10 | The user can thus quantify the presence of noisy CNAs when varying the value of this parameter, for example `-K 80`, `-K 60`, `-K 40`... 11 | -------------------------------------------------------------------------------- /guides/ploidy.md: -------------------------------------------------------------------------------- 1 | # Ploidy selection 2 | 3 | CHISEL infers the ploidy of each cell from the estimated RDRs and BAFs by using a model-selection criterion which has been calibrated for observing a sufficient number of reads on average across all bins and cells (further details are reported in the [CHISEL's manuscript](https://doi.org/10.1101/837195)). 4 | In the case of particularly noisy cells or datasets with a particularly low sequencing coverage or high variance, the inference can thus be more challenging. 5 | A sign which generally indicates potential issues in the inference of cell ploidies is the observation of a substantial number of cells with different ploidies (and thus with completely different copy numbers). 
6 | However, CHISEL provides a parameter to adjust the sensitivity of the model-selection criterion for dealing with these cases. 7 | 8 | For QC purposes, the recommendation is to analyze the allele-specific copy numbers inferred by CHISEL, for example using the corresponding [plots](../doc/chisel-plotting.md). 9 | If a substantial number of cells with different ploidies has been inferred, the recommendation is to analyze how the results change by re-running the inference of copy numbers varying the sensitivity of the model-selection criterion. 10 | Specifically, the CHISEL [command `chisel-calling.py`](../doc/chisel-calling.md) can be used to do this very efficiently by varying the sensitivity with the argument `-A` to re-run the CHISEL inference without the need of re-estimating RDRs and BAFs (which generally is the most time-consuming step). 11 | The user can thus analyse the inferred tumor ploidies by increasing the sensitivity with values of `-A 2`, `-A 3`, `-A 4`... 12 | The inference of different ploidies is well supported by the data if the results do not substantially change when increasing the sensitivity, otherwise the results obtained with higher sensitivity are more likely. 13 | -------------------------------------------------------------------------------- /install_full.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | # Finding whether os is Linux or MacOSX 7 | OS=$(uname -s) 8 | case ${OS} in 9 | Linux*) OS=Linux;; 10 | Darwin*) OS=MacOSX;; 11 | *) echo "Unknown OS ${OS}; please use manual installation." 
&& exit 1;; 12 | esac 13 | 14 | # Finding whether machine is 32bit or 64bit 15 | case ${OS} in 16 | Linux) 17 | VER=$(uname -i) 18 | case ${VER} in 19 | x86_64) MINICONDA="https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh";; 20 | *) MINICONDA="https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86.sh";; 21 | esac;; 22 | MacOSX) 23 | VER=$(uname -m) 24 | case ${VER} in 25 | *) MINICONDA="https://repo.anaconda.com/miniconda/Miniconda2-latest-MacOSX-x86_64.sh";; 26 | esac;; 27 | *) 28 | echo "Unknown OS ${OS}; please use manual installation." && exit 1;; 29 | esac 30 | 31 | # Installing Miniconda 32 | CHISEL_HOME=$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 33 | cd ${CHISEL_HOME} 34 | curl -L ${MINICONDA} > miniconda.sh 35 | rm -rf ./conda/ 36 | bash miniconda.sh -b -f -p ./conda/ 37 | export CONDA_HOME=${CHISEL_HOME}/conda/bin 38 | 39 | # Installing chisel 40 | ${CONDA_HOME}/conda config --add channels defaults 41 | ${CONDA_HOME}/conda config --add channels bioconda 42 | ${CONDA_HOME}/conda config --add channels conda-forge 43 | ${CONDA_HOME}/conda create -n chisel chisel -y 44 | 45 | # Activating CHISEL 46 | source ${CONDA_HOME}/activate chisel 47 | echo -e "\nInstallation was succesfull and CHISEL is ready!\nPlease remember to run the following command now and during every new session before using CHISEL:\n\n\t\tsource ${CONDA_HOME}/activate chisel\n\n" 48 | -------------------------------------------------------------------------------- /man/chisel-bedding.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel_bedding [-h] [-x RUNDIR] [--rawcalls] [--noextending] [-j JOBS] 3 | [INPUT] 4 | 5 | CHISEL command to generate a BED file for each cell with the corresponding 6 | CHISEL's results. 
7 | 8 | positional arguments: 9 | INPUT Input file with inferred copy numbers (default: 10 | calls/calls.tsv) 11 | 12 | optional arguments: 13 | -h, --help show this help message and exit 14 | -x RUNDIR, --rundir RUNDIR 15 | Running directory (default: current directory) 16 | --rawcalls Use raw copy numbers instead of consensus corrected 17 | ones (default: False) 18 | --noextending Merge consecutive bins only if they are neighboring 19 | (default: False, segments are extended to fill gaps) 20 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 21 | number of available processors) 22 | ``` 23 | -------------------------------------------------------------------------------- /man/chisel-calling.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel-calling.py [-h] [-x RUNDIR] [-A SENSITIVITY] [-P MAXPLOIDY] 3 | [-K UPPERK] [--seed SEED] [-j JOBS] 4 | [INPUT] 5 | 6 | CHISEL command to re-run the inference of allele- and haplotype-specific copy 7 | numbers, cell clustering, and plotting. This steps starts from estimated RDRs 8 | and BAFs. 9 | 10 | positional arguments: 11 | INPUT Input file with combined RDR and BAF per bin and per 12 | cell (default: combo/combo.tsv) 13 | 14 | optional arguments: 15 | -h, --help show this help message and exit 16 | -x RUNDIR, --rundir RUNDIR 17 | Running directory (default: current directory) 18 | -A SENSITIVITY, --sensitivity SENSITIVITY 19 | Sensitivity of model selection for ploidy (default: 1, 20 | increase this parameter to lower sensitivity to noisy 21 | data, adjust this value (e.g. 2, 4, ..., 10, ...) to 22 | better deal with high-variance data (e.g. low 23 | coverage, small number of cells, low number of phased 24 | SNPs, etc...) 
25 | -P MAXPLOIDY, --maxploidy MAXPLOIDY 26 | Maximum total copy number to consider for balanced 27 | cluster (default: 4, corresponding to a WGD) 28 | -K UPPERK, --upperk UPPERK 29 | Maximum number of bin clusters (default: 100, use 0 to 30 | consider maximum number of clusters) 31 | --seed SEED Random seed for replication (default: None) 32 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 33 | number of available processors) 34 | ``` -------------------------------------------------------------------------------- /man/chisel-cloning.md: -------------------------------------------------------------------------------- 1 | ```chisel 2 | usage: chisel-cloning.py [-h] [-x RUNDIR] [-f MAXDIFF] [-s MINSIZE] 3 | [-r REFINEMENT] [--seed SEED] 4 | [INPUT] 5 | 6 | CHISEL command to run the pipeline starting from inferred copy numbers. 7 | 8 | positional arguments: 9 | INPUT Input file with combined RDR and BAF per bin and per 10 | cell 11 | 12 | optional arguments: 13 | -h, --help show this help message and exit 14 | -x RUNDIR, --rundir RUNDIR 15 | Running directory (default: current directory) 16 | -f MAXDIFF, --maxdiff MAXDIFF 17 | Maximum haplotype-specific distance between the genome 18 | of cells in the same clone (default: 0.06, when -1 is 19 | chosen the maximum cluster method of SciPy is used) 20 | -s MINSIZE, --minsize MINSIZE 21 | Minimum number of cells in a subpopulation to define a 22 | clone (default: 14) 23 | -r REFINEMENT, --refinement REFINEMENT 24 | Maximum difference to assign noisy cells to the 25 | closest clone (default: 0.0, note that 1.0 can be used 26 | to force the assigment of all cells) 27 | --seed SEED Random seed for replication (default: None) 28 | ``` -------------------------------------------------------------------------------- /man/chisel-plotting.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel-plotting.py [-h] [-m CLONEMAP] [-f FIGFORMAT] [-s 
SAMPLE] 3 | [--excludenoisy] [--gridsize GRIDSIZE] 4 | [--plotsize PLOTSIZE] [--clussize CLUSSIZE] 5 | [--xmax XMAX] [--xmin XMIN] [--ymax YMAX] 6 | [--ymin YMIN] 7 | [INPUT] 8 | 9 | CHISEL command to re-create the plots. 10 | 11 | positional arguments: 12 | INPUT Input file with inferred copy numbers (default: 13 | calls/calls.tsv) 14 | 15 | optional arguments: 16 | -h, --help show this help message and exit 17 | -m CLONEMAP, --clonemap CLONEMAP 18 | Clone map (default: not used, the cells will be 19 | clustered for plotting purposes) 20 | -f FIGFORMAT, --figformat FIGFORMAT 21 | Format of output figures (default: png, the only other 22 | option is pdf) 23 | -s SAMPLE, --sample SAMPLE 24 | Number of cells to sample (default: 20) 25 | --excludenoisy Exclude noisy cells from plots (default: False) 26 | --gridsize GRIDSIZE Grid dimenstions specified as comma-separated numbers 27 | (default: 12,6) 28 | --plotsize PLOTSIZE Plot dimenstions for RDR-BAF plots, specified as 29 | comma-separated numbers (default: 5,1.5) 30 | --clussize CLUSSIZE Grid dimenstions for clustered plots, specified as 31 | comma-separated numbers (default: 5,3) 32 | --xmax XMAX Maximum x-axis value (default: None) 33 | --xmin XMIN Minimum x-axis value (default: None) 34 | --ymax YMAX Maximum x-axis value (default: None) 35 | --ymin YMIN Minimum x-axis value (default: None) 36 | ``` -------------------------------------------------------------------------------- /man/chisel-prep.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel_prep [-h] [-r REFERENCE] [-x RUNDIR] [-o OUTPUT] 3 | [--rexpname REXPNAME] [--rexpread REXPREAD] 4 | [--noduplicates] [--keeptmpdir] 5 | [--barcodelength BARCODELENGTH] [--bcftools BCFTOOLS] 6 | [--samtools SAMTOOLS] [--bwa BWA] [-j JOBS] [--seed SEED] 7 | INPUT [INPUT ...] 
8 | 9 | CHISEL command to create a barcoded BAM file from single-cell FASTQs (or gz- 10 | compressed FASTQs), single-cell BAMs, or a `RG:Z:`-barcoded BAM files without 11 | `CB:Z:` tags. When single-cell FASTQs or BAMs are provided a CELL name is 12 | assigned to each file (through either filename or table) and the same cell 13 | barcode will be assigned to all corresponding reads, but a different RG tag as 14 | they are considered as different repetitions of sequencing of the same cell. 15 | Specifically, when a table of inputs is not provied, for FASTQs each CELL name 16 | is extracted from the filename through the provided regular expression 17 | (default matches Illumina standard format), for BAMs basename is used as CELL 18 | name. When single-cell FASTQs are provided a READ value is also assigned to 19 | each file (through either filename or table) and files with the same filename 20 | when removing READ values are considered as pairs of sequencing read mates. 21 | Input files, CELL names, and possible READ values can be provided through a 22 | table of inputs. 23 | 24 | positional arguments: 25 | INPUT Input FASTQs, BAMs, or TSV file with different 26 | behaviors: ......................................... 27 | (1) FASTQs -- specified in a directory DIR as 28 | `DIR/*.fastq` or `DIR/*.fastq.gz` -- will be barcoded 29 | and aligned with (optionally) marked duplicates into a 30 | barcoded BAM file; ................................. 31 | (2) BAMs -- specified in a directory DIR as 32 | `DIR/*.bam` -- will be barcoded and aligned with 33 | (optionally) marked duplicates into a barcoded BAM 34 | file; .............................................. 35 | (3) a single BAM file with unique cells names in the 36 | field `RG:Z:` will be converted into a barcoded BAM 37 | file with the additional `CB:Z:` tag; .............. 
38 | (4) a tab-separated table of inputs (TSV with optional 39 | header starting with `#`) with two columns: the first 40 | column is an input file (FASTQ or BAM) and the second 41 | column is the corresponding cell name. When FASTQs are 42 | provided, a third column can be optionally specified 43 | to indicate the read name in paired-end sequencing, 44 | e.g., indicating either R1 or R2 for the first or 45 | second mate of paired-end reads, respectively. If a 46 | third column is not present, FASTQs are assumed to be 47 | from single-end sequencing. 48 | 49 | optional arguments: 50 | -h, --help show this help message and exit 51 | -r REFERENCE, --reference REFERENCE 52 | Reference genome, which is mandatory in FASTQ mode 53 | (default: None) 54 | -x RUNDIR, --rundir RUNDIR 55 | Running directory (default: current directory) 56 | -o OUTPUT, --output OUTPUT 57 | Output name in running directory (default: 58 | barcodedcells.bam) 59 | --rexpname REXPNAME Regulare expression to extract cell name from input 60 | FASTQ filenames (default: 61 | `(.*)_S.*_L.*_R[1|2]_001.fastq.*`) 62 | --rexpread REXPREAD Regulare expression to extract cell name from input 63 | FASTQ filenames (default: 64 | `.*_S.*_L.*_(R[1|2])_001.fastq.*`) 65 | --barcodeonly Only compute barcodes but do not run aligning pipeline 66 | (default: False) 67 | --noduplicates Do not perform marking duplicates and recalibration 68 | with Picard tools (default: False) 69 | --keeptmpdir Do not erase temporary directory (default: False) 70 | --barcodelength BARCODELENGTH 71 | Length of barcodes (default: 12) 72 | --bcftools BCFTOOLS Path to the directory to "bcftools" executable 73 | (default: in $PATH) 74 | --samtools SAMTOOLS Path to the directory to "samtools" executable 75 | (default: in $PATH) 76 | --bwa BWA Path to the directory to "bwa" executable (default: in 77 | $PATH) 78 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 79 | number of available processors) 80 | --seed SEED Random 
seed for replication (default: None) 81 | ``` 82 | -------------------------------------------------------------------------------- /man/chisel-pseudonormal.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel-pseudonormal.py [-h] -r REFERENCE [-x RUNDIR] [-e THRESHOLD] 3 | [-b SIZE] [-c CHROMOSOMES] [-m MINREADS] 4 | [--samtools SAMTOOLS] [-j JOBS] 5 | [--tmpdir TMPDIR] [-n NORMAL] 6 | INPUT 7 | 8 | CHISEL command to generate a pseudo-matched normal sample by extracting 9 | diploid cells from a barcoded single-cell BAM file. 10 | 11 | positional arguments: 12 | INPUT Barcoded single-cell BAM file 13 | 14 | optional arguments: 15 | -h, --help show this help message and exit 16 | -r REFERENCE, --reference REFERENCE 17 | Reference genome 18 | -x RUNDIR, --rundir RUNDIR 19 | Running directory (default: current directory) 20 | -e THRESHOLD, --threshold THRESHOLD 21 | Minimum fraction of diploid genome to select diploid 22 | cells (default: 0.9) 23 | -b SIZE, --size SIZE Bin size, with or without "kb" or "Mb" 24 | -c CHROMOSOMES, --chromosomes CHROMOSOMES 25 | Space-separeted list of chromosomes between apices 26 | (default: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 27 | chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 28 | chr18 chr19 chr20 chr21 chr22") 29 | -m MINREADS, --minreads MINREADS 30 | Minimum number total reads to select cells (default: 31 | 100000) 32 | --samtools SAMTOOLS Path to the directory to "samtools" executable, 33 | required in default mode (default: samtools is 34 | directly called as it is in user $PATH) 35 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 36 | number of available processors) 37 | --tmpdir TMPDIR Temporary directory in running directory (default: 38 | _TMP) 39 | -n NORMAL, --normal NORMAL 40 | Name of the generated pseudo matched-normal BAM file 41 | (default: pseudonormal.bam) 42 | ``` 
-------------------------------------------------------------------------------- /man/chisel.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | usage: chisel.py [-h] [-x RUNDIR] -t TUMOR -n NORMAL -r REFERENCE -l 3 | LISTPHASED [-b SIZE] [-k BLOCKSIZE] [-c CHROMOSOMES] 4 | [-m MINREADS] [-p MAXPLOIDY] [-K UPPERK] 5 | [--bcftools BCFTOOLS] [--samtools SAMTOOLS] 6 | [--cellprefix CELLPREFIX] [--cellsuffix CELLSUFFIX] 7 | [--seed SEED] [-j JOBS] 8 | 9 | CHISEL command to run the complete pipeline starting from the 4 required data: 10 | (1) Barcoded single-cell BAM; (2) Matched-normal BAM; (3) Reference genome; 11 | (4) Phased VCF. 12 | 13 | optional arguments: 14 | -h, --help show this help message and exit 15 | -x RUNDIR, --rundir RUNDIR 16 | Running directory (default: current directory) 17 | -t TUMOR, --tumor TUMOR 18 | Barcoded single-cell BAM file 19 | -n NORMAL, --normal NORMAL 20 | Matched-normal BAM file 21 | -r REFERENCE, --reference REFERENCE 22 | Reference genome 23 | -l LISTPHASED, --listphased LISTPHASED 24 | Phased SNPs file (lines of heterozygous germline SNPs 25 | must contain either 0|1 or 1|0) 26 | -b SIZE, --size SIZE Bin size, with or without "kb" or "Mb" 27 | -k BLOCKSIZE, --blocksize BLOCKSIZE 28 | Size of the haplotype blocks (default: 50kb, use 0 to 29 | disable) 30 | -c CHROMOSOMES, --chromosomes CHROMOSOMES 31 | Space-separated list of chromosomes enclosed in quotes 32 | (default: "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 33 | chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 34 | chr18 chr19 chr20 chr21 chr22") 35 | -m MINREADS, --minreads MINREADS 36 | Minimum total number of reads to select cells (default: 37 | 100000) 38 | -p MAXPLOIDY, --maxploidy MAXPLOIDY 39 | Maximum total copy number to consider for balanced 40 | cluster (default: 4, corresponding to a WGD) 41 | -K UPPERK, --upperk UPPERK 42 | Maximum number of bin clusters (default: 100, use 0 to 43 | consider maximum number of clusters)
44 | --bcftools BCFTOOLS Path to the directory to "bcftools" executable, 45 | required in default mode (default: bcftools is 46 | directly called as it is in user $PATH) 47 | --samtools SAMTOOLS Path to the directory to "samtools" executable, 48 | required in default mode (default: samtools is 49 | directly called as it is in user $PATH) 50 | --cellprefix CELLPREFIX 51 | Prefix of cell barcode field in SAM format (default: 52 | CB:Z:) 53 | --cellsuffix CELLSUFFIX 54 | Suffix of cell barcode field in SAM format (default: 55 | none) 56 | --seed SEED Random seed for replication (default: None) 57 | -j JOBS, --jobs JOBS Number of parallele jobs to use (default: equal to 58 | number of available processors) 59 | ``` 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from setuptools import setup 3 | 4 | 5 | setuptools.setup( 6 | name='chisel', 7 | version='1.2', 8 | python_requires='==2.7.*', 9 | packages=['chisel', 'chisel.bin'], 10 | package_dir={'': 'src'}, 11 | author='Simone Zaccaria', 12 | author_email='s.zaccaria@ucl.ac.uk', 13 | description='Copy-number Haplotype Inference in Single-cell by Evolutionary Links', 14 | long_description='https://github.com/raphael-group/chisel', 15 | url='https://github.com/raphael-group/chisel', 16 | install_requires=[ 17 | 'numpy>=1.16.1', 18 | 'scipy>=1.2.1', 19 | 'pandas', 20 | 'seaborn>=0.7.1', 21 | 'statsmodels<=0.10.1' 22 | ], 23 | extras_require={ 24 | 'dev': ['pytest', 'mock'] 25 | }, 26 | license='BSD', 27 | platforms=["Linux", "MacOs", "Windows"], 28 | classifiers=[ 29 | 'Programming Language :: Python :: 2.7', 30 | "Intended Audience :: Science/Research", 31 | "Natural Language :: English", 32 | "Operating System :: MacOS :: MacOS X", 33 | "Operating System :: Microsoft :: Windows", 34 | "Operating System :: POSIX :: Linux", 35 | "Topic :: Scientific/Engineering :: 
Bio-Informatics", 36 | ], 37 | keywords=[ 38 | 'scientific', 39 | 'sequence analysis', 40 | 'cancer', 41 | 'single-cell', 42 | 'DNA', 43 | 'copy-number'], 44 | entry_points={'console_scripts': ['chisel=chisel.bin.chisel_main:main', 45 | 'chisel_nonormal=chisel.bin.chisel_nonormal:main', 46 | 'chisel_preprocess=chisel.bin.chisel_preprocess:main', 47 | 'chisel_nonormal_preprocess=chisel.bin.chisel_nonormal_preprocess:main', 48 | 'chisel_combocall=chisel.bin.chisel_combocall:main', 49 | 'chisel_nonormal_combocall=chisel.bin.chisel_nonormal_combocall:main', 50 | 'chisel_calling=chisel.bin.chisel_calling:main', 51 | 'chisel_cloning=chisel.bin.chisel_cloning:main', 52 | 'chisel_plotting=chisel.bin.chisel_plotting:main', 53 | 'chisel_pseudonormal=chisel.bin.chisel_pseudonormal:main', 54 | 'chisel_prep=chisel.bin.chisel_prep:main', 55 | 'chisel_bedding=chisel.bin.chisel_bedding:main', 56 | 'chisel_rdr=chisel.bin.chisel_rdr:main']} 57 | ) 58 | -------------------------------------------------------------------------------- /src/chisel/Cloner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import sys, os 4 | import argparse 5 | import shutil 6 | import warnings 7 | 8 | from itertools import cycle 9 | from collections import defaultdict 10 | from collections import Counter 11 | 12 | import numpy as np 13 | import scipy 14 | import scipy.cluster 15 | import scipy.cluster.hierarchy as hier 16 | 17 | from Utils import * 18 | 19 | 20 | def parse_args(args): 21 | description = "Infer clones as subpopulations of cells with the same complement of CNAs and outputs a file with the mapping of every cell to the corresponding clone." 
22 | parser = argparse.ArgumentParser(description=description) 23 | parser.add_argument("INPUT", type=str, help="Input file with RDR, BAF, and inferred copy numbers.") 24 | parser.add_argument("-f", "--maxdiff", required=False, type=float, default=0.07, help="Maximum fraction of the genome with different copy-number states allowed in a clone (default: 0.07, when -1 is chosen the maximum cluster method of SciPy is used)") 25 | parser.add_argument("-r", "--refinement", required=False, type=float, default=0.15, help="Maximum difference to assign noisy cells to a clone (default: 0.15)") 26 | parser.add_argument("-s", "--minsize", required=False, type=int, default=14, help="Minimum size of subpopultation to define a clone (default: 14)") 27 | parser.add_argument("-l", "--linkage", required=False, type=str, default='weighted', help="Linkage method to use for the hierarchical clustering (default: weighted, it must be a valid linkage method available in SciPy when using a non-euclidean distance, i.e. 
'single', 'complete', 'average', 'weighted')") 28 | parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: none)") 29 | args = parser.parse_args(args) 30 | 31 | if not os.path.isfile(args.INPUT): 32 | raise ValueError('ERROR: input file does not exist!') 33 | if (not 0.0 <= args.maxdiff <= 1.0) and args.maxdiff != -1: 34 | raise ValueError('ERROR: the maximum different fraction of the genome must be either within [0, 1] or equal to -1!') 35 | if args.refinement is None: 36 | args.refinement = args.maxdiff 37 | if not 0.0 <= args.refinement <= 1.0: 38 | raise ValueError('ERROR: the refinement must be either within [0, 1]!') 39 | if args.minsize <= 0: 40 | raise ValueError('ERROR: the minimum size of subpopulations must be positive!') 41 | if not args.linkage in {'single', 'complete', 'average', 'weighted'}: 42 | raise ValueError('ERROR: the linkage method is invalid or not available for non-euclidean distances!') 43 | if args.seed and args.seed < 0: 44 | raise ValueError("Random seed must be positive or zero!") 45 | else: 46 | np.random.seed(args.seed) 47 | 48 | return { 49 | 'input' : args.INPUT, 50 | 'maxdiff' : args.maxdiff, 51 | 'refinement' : args.refinement, 52 | 'minsize' : args.minsize, 53 | 'linkage' : args.linkage, 54 | 'seed' : args.seed 55 | } 56 | 57 | 58 | def main(args=None, stdout_file=None): 59 | log('Parsing and checking arguments') 60 | args = parse_args(args) 61 | log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]), level='INFO') 62 | 63 | log('Reading input') 64 | cns, pos, cells = reading(args['input']) 65 | 66 | log('Clustering cells in clones') 67 | clus = clustering(cns, pos, cells, args['maxdiff'], args['linkage']) 68 | 69 | log('Selecting clones') 70 | clones = selecting(clus, args['minsize']) 71 | log('Number of identified clones: {}'.format(len(set(clones.values()))), level='INFO') 72 | 73 | if len(clones) > 0 and args['refinement'] >= 0.0: 74 | 
log('Refining clustering') 75 | clones, clus = refining(cns, clus, clones, args['refinement']) 76 | log('Number of discarded cells: {} over {} in total'.format(len(set(cells) - set(clones.keys())), len(set(cells))), level='INFO') 77 | 78 | log('Profiling clones') 79 | profiles = profiling(cns, clus) 80 | 81 | log('Writing clone map') 82 | header = '\t'.join(['#CELL', 'CLUSTER', 'CLONE']) 83 | if stdout_file is not None: 84 | with open(stdout_file, 'w') as f: 85 | f.write(header + '\n') 86 | for c in cells: 87 | f.write('\t'.join(map(str, [c, clus[c], 'Clone{}'.format(clones[c]) if c in clones else 'None'])) + '\n') 88 | else: 89 | print header 90 | for c in cells: 91 | print '\t'.join(map(str, [c, clus[c], 'Clone{}'.format(clones[c]) if c in clones else 'None'])) 92 | 93 | log('Writing clone-corrected copy numbers in provided input') 94 | ftmp = args['input'] + '_TMP' 95 | assert not os.path.isfile(ftmp), "Temporary file {} does already exist!".format(ftmp) 96 | form = (lambda p : ((p[0], int(p[1]), int(p[2])), p[3], p[0:12])) 97 | with open(args['input'], 'r') as i: 98 | with open(ftmp, 'w') as o: 99 | for l in i: 100 | if '#' != l[0]: 101 | b, e, val = form(l.strip().split()) 102 | o.write('\t'.join(val + ['{}|{}'.format(*profiles[b][clus[e]])]) + '\n') 103 | else: 104 | o.write('\t'.join(['#CHR', 'START', 'END', 'CELL', 'NORM_COUNT', 'COUNT', 'RDR', 'A_COUNT', 'B_COUNT', 'BAF', 'CLUSTER', 'HAP_CN', 'CORRECTED_HAP_CN']) + '\n') 105 | shutil.move(ftmp, args['input']) 106 | 107 | 108 | def reading(f): 109 | cns = defaultdict(lambda : dict()) 110 | form = (lambda p : ((p[0], int(p[1]), int(p[2])), p[3], tuple(map(int, p[11].split('|'))))) 111 | with open(f, 'r') as i: 112 | for l in i: 113 | if l[0] != '#' and len(l) > 1: 114 | b, c, cn = form(l.strip().split()) 115 | assert c not in cns[b] # and c not in stuff[b] 116 | cns[b][c] = cn 117 | cns = dict(cns) 118 | orderchrs = (lambda x : int(''.join([l for l in x if l.isdigit()]))) 119 | order = (lambda b : 
(orderchrs(b[0]), int(b[1]), int(b[2]))) 120 | pos = sorted(cns.keys(), key=order) 121 | cells = sorted(set(c for b in cns for c in cns[b])) 122 | return cns, pos, cells 123 | 124 | 125 | def clustering(cns, pos, cells, maxdiff, linkage): 126 | states = {s : x for x, s in enumerate(set(cns[b][c] for b in pos for c in cells))} 127 | data = [[states[cns[b][c]] for b in pos] for c in cells] 128 | linkage = hier.linkage(data, method=linkage, metric='hamming', optimal_ordering=True) 129 | if maxdiff != -1: 130 | clus = hier.fcluster(linkage, t=maxdiff, criterion='distance') 131 | else: 132 | clus = hier.fcluster(linkage, t=len(cells), criterion='maxclust') 133 | return {e : clus[i] for i, e in enumerate(cells)} 134 | 135 | 136 | def selecting(clus, minsize): 137 | size = {i : sum(clus[c] == i for c in clus) for i in set(clus.values())} 138 | return {c : clus[c] for c in clus if size[clus[c]] >= minsize} 139 | 140 | 141 | def refining(cns, clus, chosen, maxdiff): 142 | clones = set(chosen.values()) 143 | safeargmax = (lambda C : argmax(C) if len(C) > 0 else (1, 1)) 144 | getcn = (lambda g, i : safeargmax(Counter([cns[g][c] for c in chosen if chosen[c] == i]))) 145 | profile = {g : {i : getcn(g, i) for i in clones} for g in cns} 146 | diff = (lambda i, c, g : 1 if profile[g][i] != cns[g][c] else 0) 147 | weight = (lambda i, c : float(sum(diff(i, c, g) for g in profile)) / float(len(profile))) 148 | closest = (lambda c : min([(i, weight(i, c)) for i in clones], key=(lambda x : x[1]))) 149 | ref = {c : closest(c) for c in clus if c not in chosen.keys()} 150 | newclones = {c : chosen[c] if c in chosen else ref[c][0] for c in clus if c in chosen or ref[c][1] <= maxdiff} 151 | newclus = {c : newclones[c] if c in newclones else clus[c] for c in clus} 152 | assert False not in set(len({clus[c], chosen[c], newclus[c], newclones[c]}) == 1 for c in chosen) 153 | return newclones, newclus 154 | 155 | 156 | def profiling(cns, clus): 157 | clones = set(clus.values()) 158 | # 
safeargmax = (lambda C : argmax(C) if len(C) > 0 else (1, 1)) 159 | # getcn = (lambda g, i : safeargmax(Counter([cns[g][c] for c in clus if clus[c] == i]))) 160 | mapclo = {i : filter(lambda e : clus[e] == i, clus.keys()) for i in clones} 161 | assert all(len(mapclo[i]) > 0 for i in mapclo), 'Found cluster assignment with no corresponding cell' 162 | getcn = (lambda g, i : argmax(Counter([cns[g][e] for e in mapclo[i]]))) 163 | return {g : {i : getcn(g, i) for i in clones} for g in cns} 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /src/chisel/Clusterizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | import argparse 5 | import math 6 | import ctypes 7 | import warnings 8 | 9 | import multiprocessing as mp 10 | from multiprocessing import Lock, Value, Pool 11 | 12 | import numpy as np 13 | import scipy.spatial 14 | 15 | from Utils import * 16 | 17 | 18 | 19 | def kclustering(data, restarts, threshold, seed=None, lord=1, j=1, LB=1, UB=None): 20 | pool_size = mp.cpu_count() 21 | os.system('taskset -cp 0-%d %s > /dev/null' % (pool_size, os.getpid())) 22 | 23 | error, center, pdist, cdist = getord(lord) 24 | if len(set(len(p) for p in data)) != 1: 25 | raise ValueError('All points must have the same length') 26 | if seed is not None: 27 | np.random.seed(seed) 28 | 29 | points = np.array(data) 30 | TCENTER = center(points) 31 | TERROR = sum(error(p, TCENTER) for p in points) 32 | PAIRWISE = pdist(points) 33 | assert np.isfinite(PAIRWISE).all(), 'Pairwise distance contain NaN!\n{}'.format(PAIRWISE) 34 | 35 | objs = {} 36 | clus = {} 37 | 38 | if UB: 39 | R = min(len(data), UB) 40 | else: 41 | R = len(points) 42 | objs[R] = 0.0 43 | clus[R] = [i for i, p in enumerate(points)] 44 | 45 | if LB: 46 | L = max(0, LB) 47 | else: 48 | L = 0 49 | objs[L] = 1.0 50 | clus[L] = [0 for p in 
points] 51 | 52 | def compute(K): 53 | log('Computing for {}:'.format(K), level='INFO') 54 | if K not in objs: 55 | assert K not in clus, 'The number of clusters {} does not have an objective but a solution'.format(K) 56 | obj, clu = kclustering_fixed(points, K, restarts, TERROR, PAIRWISE, lord, j) 57 | objs[K] = obj 58 | clus[K] = clu 59 | log('Objective value for {}: {}'.format(K, objs[K]), level='INFO') 60 | 61 | MAXR = R 62 | compute(MAXR) 63 | 64 | while(R - L > 1): 65 | M = int(math.floor(float(R + L) / 2.0)) 66 | assert M not in {L, R}, 'Median point is equal to boundaries but it cannot happen' 67 | compute(M) 68 | if objs[M] - objs[MAXR] > threshold: 69 | L = M 70 | else: 71 | R = M 72 | 73 | compute(L) 74 | compute(R) 75 | if L <= threshold: 76 | return clus[L] 77 | else: 78 | return clus[R] 79 | 80 | 81 | def getord(lord): 82 | ## K-means minimizes SQUARE l2-norms while K-medians minimizes L1-norm 83 | if lord == 1: ## K-medians 84 | error = (lambda a, b : np.linalg.norm(a - b, ord=1)) 85 | center = (lambda X : np.median(X, axis=0)) 86 | pdist = (lambda X : scipy.spatial.distance.pdist(X, metric='cityblock')) 87 | cdist = (lambda X, Y : scipy.spatial.distance.cdist(X, Y, metric='cityblock')) 88 | elif lord == 2: ## K-means 89 | error = (lambda a, b : np.linalg.norm(a - b, ord=2)**2) 90 | center = (lambda X : np.mean(X, axis=0)) 91 | pdist = (lambda X : scipy.spatial.distance.pdist(X, metric='sqeuclidean')) 92 | cdist = (lambda X, Y : scipy.spatial.distance.cdist(X, Y, metric='sqeuclidean')) 93 | else: 94 | raise ValueError('Order of l-norm distance must be either 1 or 2!') 95 | return error, center, pdist, cdist 96 | 97 | 98 | def kclustering_fixed(points, K, restarts, TERROR, PAIRWISE, lord=1, j=1): 99 | with warnings.catch_warnings() as w: 100 | warnings.simplefilter("ignore") 101 | shared_points, shared_points_base = share_matrix(points) 102 | shared_pairwise, shared_pairwise_base = share_array(PAIRWISE) 103 | shared_clus, shared_clus_base = 
newshare_matrix(restarts, len(points)) 104 | 105 | jobs = ((np.random.randint(low=0, high=2**10), x) for x, i in enumerate(range(restarts))) 106 | bar = ProgressBar(total=restarts, length=40, verbose=False) 107 | 108 | initargs = (points.shape[0], points.shape[1], K, lord, TERROR, shared_points, shared_pairwise, shared_clus) 109 | pool = Pool(processes=min(j, restarts), initializer=init_kclustering, initargs=initargs) 110 | progress = (lambda obj, it : bar.progress(advance=True, msg="Obj: {} [Iterations: {}]".format(obj, it))) 111 | best = min(((obj, idx) for obj, idx, it in pool.imap_unordered(run_kclustering, jobs) if progress(obj, it)), key=(lambda x : (x[0], x[1]))) 112 | pool.close() 113 | pool.join() 114 | return best[0], shared_clus[best[1]] 115 | 116 | 117 | def share_array(npdata): 118 | N = npdata.shape[0] 119 | shared_array_base = mp.Array(ctypes.c_double, N) 120 | shared_array = np.ctypeslib.as_array(shared_array_base.get_obj()) 121 | shared_array[:] = npdata 122 | return shared_array, shared_array_base 123 | 124 | 125 | def share_matrix(npdata): 126 | N, M = npdata.shape 127 | shared_matrix_base = mp.Array(ctypes.c_double, N * M) 128 | shared_matrix = np.ctypeslib.as_array(shared_matrix_base.get_obj()) 129 | shared_matrix = shared_matrix.reshape(N, M) 130 | shared_matrix[:] = npdata 131 | return shared_matrix, shared_matrix_base 132 | 133 | 134 | def newshare_matrix(N, M): 135 | shared_matrix_base = mp.Array(ctypes.c_double, N * M) 136 | shared_matrix = np.ctypeslib.as_array(shared_matrix_base.get_obj()) 137 | shared_matrix = shared_matrix.reshape(N, M) 138 | return shared_matrix, shared_matrix_base 139 | 140 | 141 | def init_kclustering(_N, _M, _K, _lord, _TERROR, _points, _pairwise, _clus): 142 | global N, M, K, error, center, pdist, cdist, TERROR, POINTS, PAIRWISE, CLUS 143 | N = _N 144 | M = _M 145 | K = _K 146 | error, center, pdist, cdist = getord(_lord) 147 | TERROR = _TERROR 148 | POINTS = _points 149 | PAIRWISE = _pairwise 150 | CLUS = _clus 
151 | 152 | 153 | def run_kclustering(job): 154 | seed, idx = job 155 | 156 | ## Utils 157 | np.random.seed(seed) 158 | randint = np.random.randint 159 | lookup = (lambda i, j : PAIRWISE[indices_to_condensed(i, j, N)]) 160 | 161 | ## Initialization 162 | centroids = [] 163 | for i in range(K): 164 | if len(centroids) == 0: 165 | chosen = randint(N) 166 | probs = [lookup(x, chosen)**2 if x != chosen else 0.0 for x in xrange(N)] 167 | else: 168 | chosen = weighted_ichoice(probs) 169 | probs = [min(probs[x], lookup(x, chosen)**2) if x != chosen else 0.0 for x in xrange(N)] 170 | centroids.append(chosen) 171 | assert len(centroids) == K, 'Found less centroids {} than expected {}'.format(len(centroids), K) 172 | centroids = np.stack([POINTS[i] for i in centroids]) 173 | 174 | ## Iterative process 175 | it = 0 176 | pre = None 177 | while pre is None or np.any(np.abs(centroids - pre) > 0.001): 178 | it += 1 179 | pre = centroids 180 | 181 | ## Assignment 182 | between = cdist(POINTS, centroids) 183 | clu = [min((i for i in range(K)), key=(lambda i : between[x, i])) for x in xrange(N)] 184 | used = set(clu) 185 | 186 | ## Update centroids 187 | centerize = (lambda i : center(np.stack([p for x, p in enumerate(POINTS) if clu[x] == i]))) 188 | centroids = np.stack([centerize(i) if i in used else np.zeros(M) for i in range(K)]) 189 | 190 | CLUS[idx] = clu 191 | 192 | return sum(between[x, clu[x]] for x, p in enumerate(POINTS)) / TERROR, idx, it 193 | 194 | 195 | def weighted_ichoice(weights): 196 | w = np.array(weights) 197 | wsum = np.sum(w, dtype=float) 198 | if wsum > 0: 199 | assert np.isfinite(wsum).all(), 'wsum distance contain NaN!\n{}'.format(wsum) 200 | assert np.isfinite(w).all(), 'w distance contain NaN!\n{}'.format(w) 201 | cs = np.cumsum(w) / wsum 202 | r = np.random.rand() 203 | return np.searchsorted(cs, r) 204 | else: 205 | return np.random.choice(np.arange(len(weights)), size=1)[0] 206 | 207 | 
-------------------------------------------------------------------------------- /src/chisel/Mutator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | import shlex 5 | import argparse 6 | import subprocess as sp 7 | 8 | from multiprocessing import Lock, Value, Pool 9 | from collections import defaultdict 10 | 11 | from Utils import * 12 | 13 | 14 | def parse_args(): 15 | description = "Cell-specific allele counting for a given list of point mutations." #which must be provided as a stdin stream (symbol '-' must be used in this case) or as the name of a file." 16 | parser = argparse.ArgumentParser(description=description) 17 | parser.add_argument("-l","--listmutations", type=str, required=True, help="List of TSV phased genomic positions (TAB-seprated format '#CHR POS REF VAR')") 18 | parser.add_argument("-t","--tumor", required=True, type=str, help="BAM file for matched normal sample") 19 | parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome") 20 | parser.add_argument("-s","--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)") 21 | parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)") 22 | parser.add_argument("-c","--listcells", type=str, required=False, default=None, help="File where first column contains all the cells to consider (default: not used)") 23 | args = parser.parse_args() 24 | 25 | if not os.path.isfile(args.tumor): 26 | raise ValueError("Specified tumor does not exist!") 27 | if not os.path.isfile(args.reference): 28 | raise ValueError("Reference genome does not exist!") 29 | if args.listcells is not None and not os.path.isfile(args.listcells): 30 | raise ValueError("Specified 
list of cells does not exist!") 31 | 32 | samtools = args.samtools 33 | if not samtools: 34 | samtools = "samtools" 35 | if which(samtools) is None: 36 | raise ValueError("samtools has not been found or is not executable!") 37 | 38 | if not args.jobs: 39 | args.jobs = mp.cpu_count() 40 | if args.jobs < 1: 41 | raise ValueError("The number of jobs must be positive!") 42 | 43 | return { 44 | 'tumor' : args.tumor, 45 | 'mutations' : args.listmutations, 46 | 'ref' : args.reference, 47 | 'samtools' : samtools, 48 | 'J' : args.jobs, 49 | 'list' : args.listcells 50 | } 51 | 52 | 53 | def main(): 54 | log('Parsing and checking arguments') 55 | args = parse_args() 56 | log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]), level='INFO') 57 | 58 | log('Extracting genomic positions of given mutations') 59 | mutations = read_mutations(args['mutations']) 60 | log('Chromosomes analyzed: {}'.format(','.join(sorted(mutations, key=orderchrs))), level='INFO') 61 | log('Total number of given mutations: {}'.format(sum(len(mutations[c]) for c in mutations)), level='INFO') 62 | 63 | log('Extracting allele counts of mutations for all cells') 64 | amut = extracting(args, mutations) 65 | 66 | if args['list']: 67 | log('Reading cell list') 68 | with open(args['list'], 'r') as i: 69 | cells = set(l.strip().split()[0].replace('-1', '') for l in i if len(l) > 1 and l[0] != '#') 70 | 71 | log('Writing A/B counts for selected phased SNPs across selected cells') 72 | print '\t'.join(['#CHR', 'POS', 'CELL', 'MUT', 'MUTCOV', 'COV']) 73 | for c, o, e in ((c, o, e) for c in sorted(amut, key=orderchrs) for o in sorted(amut[c]) for e in sorted(amut[c][o])): 74 | print '\t'.join(map(str, [c, o, e, mutations[c][o], amut[c][o][e][mutations[c][o]], sum(amut[c][o][e].values())])) 75 | 76 | log('KTHXBYE') 77 | 78 | 79 | def read_mutations(f): 80 | mutations = defaultdict(lambda : dict()) 81 | chrs = map(str, range(1, 23)) 82 | with open(f, 'r') as i: 83 | for l in i: 84 | if len(l) > 
1 and l[0] != '#': 85 | p = l.strip().split() 86 | c = p[0] 87 | if ''.join([l for l in c if l.isdigit()]) not in chrs: 88 | continue 89 | try: 90 | o = int(p[1]) 91 | v = p[3] 92 | assert o not in mutations[c] 93 | if v[0] in {'A', 'C', 'G', 'T'} or v[0] in {'+', '-'}: 94 | mutations[c][o] = 'N' if v[0] == '-' else (v[1] if v[0] == '+' else v[0]) 95 | assert mutations[c][o] in {'A', 'C', 'G', 'T', 'N'} 96 | except ValueError: 97 | pass 98 | return mutations 99 | 100 | 101 | def extracting(args, mutations): 102 | jobs = ((c, o) for c in mutations for o in mutations[c]) 103 | njobs = sum(len(mutations[c]) for c in mutations) 104 | countawk = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'count.awk') 105 | bar = ProgressBar(total=njobs, length=40, verbose=False) 106 | 107 | initargs = (args['tumor'], args['samtools'], countawk) 108 | pool = Pool(processes=min(args['J'], njobs), initializer=init_extracting, initargs=initargs) 109 | 110 | ACGT = (lambda : {'A' : 0, 'C' : 0, 'G' : 0, 'T' : 0, 'N' : 0}) 111 | amut = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : ACGT()))) 112 | amut = {c : {o : defaultdict(lambda : ACGT()) for o in mutations[c]} for c in mutations} 113 | for c, o, l in pool.imap_unordered(counting_cell, jobs): 114 | if l != '': 115 | for a in l.strip().split('\n'): 116 | e, al, count = tuple(a.split()) 117 | amut[c][o][e][al] += int(count) 118 | bar.progress(advance=True, msg="Extracted SNP {}:{}".format(c, o)) 119 | 120 | return {c : {o : dict(filter(lambda (e, al) : sum(al.values()) > 0, amut[c][o].items())) for o in amut[c]} for c in amut} 121 | 122 | 123 | def init_extracting(_tumor, _sam, countawk): 124 | global cmd_sam, cmd_awk 125 | cmd_sam = "{} view -F 1796 -q 13 {} {}:{}-{}".format(_sam, _tumor, '{}', '{}', '{}') 126 | cmd_awk = 'awk -v TAG="{}" -f {}'.format('{}', countawk) 127 | 128 | 129 | def counting_cell(job): 130 | sam = sp.Popen(shlex.split(cmd_sam.format(job[0], job[1], job[1])), stdout=sp.PIPE, 
stderr=sp.PIPE) 131 | stdout, stderr = sp.Popen(shlex.split(cmd_awk.format(job[1])), stdin=sam.stdout, stdout=sp.PIPE, stderr=sp.PIPE).communicate() 132 | return (job[0], job[1], stdout) 133 | 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /src/chisel/RDREstimator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | import shlex 5 | import argparse 6 | import subprocess as sp 7 | 8 | from multiprocessing import Lock, Value, Pool 9 | from collections import Counter 10 | 11 | from Utils import * 12 | 13 | 14 | def parse_args(args): 15 | description = "Compute RDR from barcoded single-cell sequencing data." 16 | parser = argparse.ArgumentParser(description=description) 17 | parser.add_argument("-t","--tumor", required=True, type=str, help="Barcoded BAM file") 18 | parser.add_argument("-n","--normal", required=True, type=str, help="BAM file for matched normal sample") 19 | parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"") 20 | parser.add_argument("-s","--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)") 21 | parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)") 22 | parser.add_argument("-r","--reference", type=str, required=False, default="hg19", help="Name of the corresponding reference genome among \{hg18, hg19, hg38\} (default: hg19)") 23 | parser.add_argument("-m","--minreads", type=int, required=False, default=100000, help="Minimum number total reads to select cells (default: None)") 24 | parser.add_argument("-l","--cellslist", type=str, required=False, 
default=None, help="List of cells to select (default: None)") 25 | parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")") 26 | parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)") 27 | parser.add_argument("--cellsuffix", type=str, required=False, default='', help="Suffix of cell barcode field in SAM format (default: none)") 28 | parser.add_argument("--outdir", required=False, default='./', type=str, help="Running directory where to write the list of selected cells (default: current directory)") 29 | args = parser.parse_args(args) 30 | 31 | if not os.path.isfile(args.tumor): 32 | raise ValueError("Specified tumor does not exist!") 33 | if not os.path.isfile(args.normal): 34 | raise ValueError("Specified normal does not exist!") 35 | if not os.path.isfile(args.reference): 36 | raise ValueError("Reference genome not found!") 37 | if not os.path.isfile(os.path.splitext(args.reference)[0] + '.dict'): 38 | raise ValueError("The dictionary .dict of the reference genome not found! 
def main(args=None, stdout_file=None):
    """Entry point of the RDR-estimation step.

    Counts sequencing reads per genomic bin for the matched normal and for
    every barcoded tumor cell, selects cells (by a read threshold or an
    explicit list), writes per-cell read totals to total.tsv, and emits one
    RDR record per (chromosome, bin, cell) to stdout or to `stdout_file`.
    """
    log('Parsing and checking arguments')
    args = parse_args(args)
    log('\n'.join(['Arguments:'] + ['{} : {}'.format(a, args[a]) for a in args]), level='INFO')

    log('Computing bins')
    bins = get_bins(args['ref'], args['chrs'], args['bins'], bams=[args['tumor'], args['normal']], samtools=args['samtools'])

    log('Counting reads on normal')
    counts = counting_normal(args['normal'], bins, args['samtools'], args['J'])

    log('Counting reads on barcoded cells')
    counts = counting_cells(counts, args['tumor'], bins, args['samtools'], args['J'], args['prefix'], args['suffix'])

    log('Evaluating set of found cells')
    if args['list'] is None:
        names = set(e for c in counts for b in counts[c] for e in counts[c][b])
    else:
        clist = set()
        with open(args['list'], 'r') as i:
            for l in i:
                # Cell lists may be comma- or tab-separated; the first field is
                # the barcode, with any '-1' (10x-style suffix) stripped.
                clist.add(l.strip().replace(',', '\t').split()[0].replace('-1', ''))
        names = set(e for c in counts for b in counts[c] for e in counts[c][b] if e in clist)
    # BUG FIX: `cells` was previously assigned only in the no-list branch, so
    # running with --cellslist and without --minreads raised a NameError below.
    cells = names - {'normal'}

    log('Computing total numbers of sequenced reads')
    total = reduce(inupdate, (Counter(counts[c][b]) for c in counts for b in counts[c]))

    log('Selecting cells')
    if args['minreads']:
        names = set(e for e in total if total[e] >= args['minreads'])
        cells = names - {'normal'}
    log('Number of selected cells: {}'.format(len(cells)), level='INFO')

    ftot = os.path.join(args['outdir'], 'total.tsv')
    log('Writing the totals in {}'.format(ftot), level='INFO')
    with open(ftot, 'w') as o:
        o.write('{}\t{}\n'.format('normal', total['normal']))
        o.write('\n'.join(['{}\t{}'.format(e, total[e]) for e in cells]))

    log('Estimating RDR')
    # Per-cell scaling factor: total normal reads over total reads of the cell,
    # so RDRs are comparable across cells with different sequencing depth.
    scale = {e : float(total['normal']) / float(total[e]) for e in cells}
    ratio = (lambda c, b, e : (float(counts[c][b][e]) / float(counts[c][b]['normal'])) if counts[c][b]['normal'] > 0 else 0.0)

    if stdout_file is not None:
        stdout_f = open(stdout_file, 'w')

    try:
        for c in sorted(counts, key=orderchrs):
            for b in sorted(counts[c], key=(lambda x : x[0])):
                for e in sorted(set(counts[c][b].keys()) & cells):
                    line = '\t'.join(map(str, [c, b[0], b[1], e, counts[c][b]['normal'], counts[c][b][e], ratio(c, b, e) * scale[e]]))
                    if stdout_file is not None:
                        stdout_f.write(line + '\n')
                    else:
                        sys.stdout.write(line + '\n')
    finally:
        # Close the output file even if a record fails to serialize.
        if stdout_file is not None:
            stdout_f.close()

    log('KTHXBYE')
def init_extracting(_tumor, sam, prefix, suffix):
    """Pool initializer: publish the samtools and awk command templates as
    worker globals so each job only fills in its genomic region.
    """
    global cmd_sam, cmd_awk
    # -F 1796 drops unmapped/secondary/duplicate/QC-fail reads; -q 13 drops
    # low mapping quality.
    cmd_sam = "{} view -F 1796 -q 13 {} {}:{}-{}".format(sam, _tumor, "{}", "{}", "{}")
    # Tally reads per cell barcode found between `prefix` and `suffix`.
    # NOTE(review): substr offset RSTART+5 assumes len(prefix) == 5 (the
    # default 'CB:Z:') — confirm before allowing custom prefixes here.
    cmd_awk = shlex.split("awk 'BEGIN{{}} {{ if(match($0, /{}[ACGT]+{}/)) {{ X[substr($0, RSTART+5, RLENGTH-5)]++ }} }} END{{ for(i in X) print i, X[i] }}'".format(prefix, suffix))


def extracting(job):
    """Count reads per cell barcode within one genomic bin.

    job is a (chromosome, (start, end)) pair; returns (chromosome, bin,
    awk output) where the output has one "barcode count" line per cell.
    """
    c, b = job
    cmd = cmd_sam.format(c, b[0], b[1])
    sam = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE)
    awk = sp.Popen(cmd_awk, stdin=sam.stdout, stdout=sp.PIPE, stderr=sp.PIPE)
    # BUG FIX: close our copy of the pipe so samtools receives SIGPIPE and
    # terminates if awk exits early, instead of blocking on a full pipe.
    sam.stdout.close()
    stdout, stderr = awk.communicate()
    return (c, b, stdout.strip())
def checkchrs(digit_x, x):
    """Map a chromosome label to a sortable rank.

    `digit_x` is the digit-only part of the label `x`: it is returned as-is
    when non-empty (autosomes), otherwise X/Y/M map to 23/24/25 and any other
    label maps to 26 so it sorts last.
    """
    if len(digit_x) > 0:
        return digit_x
    if 'X' in x:
        return 23
    if 'Y' in x:
        return 24
    if 'M' in x:
        return 25
    return 26


def orderchrs(x):
    """Sort key for chromosome names: 'chr1'..'chr22' -> 1..22, X/Y/M -> 23/24/25."""
    return int(checkchrs(''.join([l for l in x if l.isdigit()]), x))


def inupdate(a, b):
    """Update mapping `a` with `b` in place and return `a` (handy with reduce)."""
    a.update(b)
    return a
def which(program):
    """Locate an executable like the Unix `which` command.

    If `program` contains a path component, it is returned when it points to
    an executable file; otherwise each entry of $PATH is searched. Returns
    the resolved path, or None when no executable is found.
    """
    # NOTE: the redundant function-local `import os` was removed; `os` is
    # already imported at module level.
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        # Caller supplied an explicit (absolute or relative) path.
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None
+ str(self.decimals) + "f}").format(100 * (self.counter / float(self.total))) 106 | filledLength = int(self.length * self.counter // self.total) 107 | bar = self.fill * filledLength + '-' * (self.length - filledLength) 108 | rewind = '\x1b[2K\r' 109 | result = '%s |%s| %s%% %s' % (self.prefix, bar, percent, self.suffix) 110 | msg = '[{:%Y-%b-%d %H:%M:%S}]'.format(datetime.datetime.now()) + msg 111 | if not self.verbose: 112 | toprint = rewind + result + " [%s]" % (msg) 113 | else: 114 | toprint = rewind + msg + "\n" + result 115 | write(toprint.encode('utf-8')) 116 | flush() 117 | if self.counter == self.total: 118 | write("\n") 119 | flush() 120 | 121 | def progress_locked(self, advance, msg): 122 | flush = sys.stderr.flush 123 | write = sys.stderr.write 124 | if advance: 125 | with self.counter.get_lock(): 126 | self.counter.value += 1 127 | percent = ("{0:." + str(self.decimals) + "f}").format(100 * (self.counter.value / float(self.total))) 128 | filledLength = int(self.length * self.counter.value // self.total) 129 | bar = self.fill * filledLength + '-' * (self.length - filledLength) 130 | rewind = '\x1b[2K\r' 131 | result = '%s |%s| %s%% %s' % (self.prefix, bar, percent, self.suffix) 132 | msg = '[{:%Y-%b-%d %H:%M:%S}]'.format(datetime.datetime.now()) + msg 133 | if not self.verbose: 134 | toprint = rewind + result + " [%s]" % (msg) 135 | else: 136 | toprint = rewind + msg + "\n" + result 137 | with self.lock: 138 | write(toprint.encode('utf-8')) 139 | flush() 140 | if self.counter.value == self.total: 141 | write("\n") 142 | flush() 143 | 144 | 145 | def log(msg, level='STEP', lock=None): 146 | timestamp = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now()) 147 | if level == "STEP": 148 | if lock is None: 149 | sys.stderr.write("{}{}[{}]{}{}\n".format(bcolors.BOLD, bcolors.HEADER, timestamp, msg, bcolors.ENDC)) 150 | else: 151 | with lock: sys.stderr.write("{}{}[{}]{}{}\n".format(bcolors.BOLD, bcolors.HEADER, timestamp, msg, bcolors.ENDC)) 152 | elif level 
def runcmd(cmd, xdir, out=None, log="log"):
    """Run `cmd` inside `xdir`, teeing its stderr to the console and a log.

    stdout goes to `out` (a file inside xdir) or is discarded; stderr is
    streamed both to sys.stderr and to a temporary file, which is then
    rewritten as `log` with progress-bar lines and ANSI color codes removed.
    (The parameter `log` shadows the module-level log() function; it is kept
    for interface compatibility.)
    """
    j = os.path.join
    tmp = log + '_TMP'
    sout = open(j(xdir, out) if out is not None else os.devnull, 'w')
    try:
        with open(j(xdir, tmp), 'w') as serr:
            proc = sp.Popen(shlex.split(cmd), stdout=sout, stderr=sp.PIPE)
            # NOTE(review): read(1) with sentinel '' assumes py2 text pipes;
            # on py3 the pipe yields bytes and the sentinel would never match.
            for line in iter(lambda : proc.stderr.read(1), ''):
                sys.stderr.write(line)
                serr.write(line)
            # Reap the child so it does not linger as a zombie.
            proc.wait()
        sout.flush()
    finally:
        # BUG FIX: previously sout leaked if anything above raised.
        sout.close()

    # Rewrite the raw stderr capture without progress lines or color escapes.
    with open(j(xdir, tmp), 'r') as i:
        with open(j(xdir, log), 'w') as o:
            for l in i:
                if 'Progress' not in l:
                    o.write(re.sub(r'\033\[[0-9]*m', '', l))
    os.remove(j(xdir, tmp))


def error(msg):
    """Log `msg` at ERROR level and abort the process."""
    log(msg=msg, level="ERROR")
    # BUG FIX: was sys.exit(0), which reported success to the calling shell;
    # a fatal error must produce a non-zero exit status.
    sys.exit(1)
'\033[94m' 207 | BBLUE = '\033[96m' 208 | OKGREEN = '\033[92m' 209 | WARNING = '\033[93m' 210 | FAIL = '\033[91m' 211 | ENDC = '\033[0m' 212 | BOLD = '\033[1m' 213 | UNDERLINE = '\033[4m' 214 | -------------------------------------------------------------------------------- /src/chisel/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.2' 2 | -------------------------------------------------------------------------------- /src/chisel/bin.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk 2 | 3 | 4 | BEGIN{} 5 | { 6 | if ( match($0, /CB:Z:[ACGT]+/) ) 7 | { 8 | X[substr($0, RSTART+5, RLENGTH-5)]++ 9 | } 10 | } 11 | END{ for(i in X) print i, X[i] } 12 | -------------------------------------------------------------------------------- /src/chisel/bin/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1' 2 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_bedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import sys, os 4 | import argparse 5 | 6 | from multiprocessing import Lock, Value, Pool 7 | from collections import Counter 8 | 9 | import chisel 10 | 11 | src = os.path.dirname(chisel.__file__) 12 | from ..Utils import * 13 | from chisel import Plotter 14 | 15 | 16 | def parse_args(): 17 | description = "CHISEL command to generate a BED file for each cell with the corresponding CHISEL's results." 
def make_beds(bins, pos, cells, iscorr, args):
    """Write one BED file per cell in parallel.

    Extracts the copy-number state of every bin for every cell (corrected
    consensus when available and not disabled via --rawcalls) and hands each
    cell to a worker pool that writes `<cell>.bed` in the running directory.
    """
    bar = ProgressBar(total=len(cells), length=40, verbose=False)
    # Hoisted loop invariant: which copy-number field to report.
    field = 'CORR-CNS' if iscorr and not args['rawcalls'] else 'CNS'
    chk = (lambda g, e : bins[g][e][field] if e in bins[g] else None)
    cns = {g : {e : chk(g, e) for e in bins[g]} for g in bins}
    initargs = (cns, pos, sum(g[2] - g[1] for g in pos), args['rundir'], args['noextending'])
    pool = Pool(processes=min(args['j'], len(cells)), initializer=init_making_bed, initargs=initargs)
    try:
        # BUG FIX: progress reporting was driven through map() purely for its
        # side effect; an explicit loop is clearer and stays correct under
        # py3's lazy map (which would silently skip the progress updates).
        for e in pool.imap_unordered(making_bed, cells):
            bar.progress(advance=True, msg="Wrote cell {}".format(e))
        pool.close()
        pool.join()
    except Exception:
        # Do not leave worker processes behind on failure.
        pool.terminate()
        raise


def init_making_bed(_cns, _pos, _totcov, _rundir, _noextending):
    """Pool initializer: publish shared read-only state as worker globals."""
    global cns, pos, totcov, rundir, noextending
    cns = _cns
    pos = _pos
    totcov = _totcov
    rundir = _rundir
    noextending = _noextending
- start[1] 104 | start = g 105 | precn = ecns[g] 106 | end = g 107 | out(start, end) 108 | tot += end[2] - start[1] 109 | 110 | assert end == pos[-1], 'Error for the last bin' 111 | assert (not noextending or tot == totcov) and (noextending or tot >= totcov), 'Error in total length: {} written vs. {} expected'.format(tot, totcov) 112 | return e 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_calling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os 4 | import argparse 5 | import subprocess as sp 6 | import multiprocessing as mp 7 | import shlex 8 | import datetime 9 | import re 10 | 11 | import chisel 12 | 13 | src = os.path.dirname(chisel.__file__) 14 | from ..Utils import * 15 | 16 | 17 | def parse_args(): 18 | description = "CHISEL command to re-run the inference of allele- and haplotype-specific copy numbers, cell clustering, and plotting. This steps starts from estimated RDRs and BAFs." 19 | parser = argparse.ArgumentParser(description=description) 20 | parser.add_argument("INPUT", nargs='?', default='combo/combo.tsv', type=str, help="Input file with combined RDR and BAF per bin and per cell (default: combo/combo.tsv)") 21 | parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)") 22 | parser.add_argument("-A","--sensitivity", required=False, type=float, default=1.0, help="Sensitivity of model selection for ploidy (default: 1, increase this parameter to lower sensitivity to noisy data, adjust this value (e.g. 2, 4, ..., 10, ...) to better deal with high-variance data (e.g. 
low coverage, small number of cells, low number of phased SNPs, etc...)") 23 | parser.add_argument("-P","--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)") 24 | parser.add_argument("-K","--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)") 25 | parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)") 26 | parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)") 27 | args = parser.parse_args() 28 | 29 | if not os.path.isfile(args.INPUT): 30 | raise ValueError("Input file does not exist: {}".format(args.INPUT)) 31 | if not os.path.isdir(args.rundir): 32 | raise ValueError("Running directory does not exists: {}".format(args.rundir)) 33 | if args.seed and args.seed < 1: 34 | raise ValueError("The random seed must be positive!") 35 | if args.maxploidy < 3: 36 | raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 2!") 37 | if args.upperk < 1: 38 | raise ValueError("The maximum number of clusters must be positive!") 39 | if not args.jobs: 40 | args.jobs = mp.cpu_count() 41 | if args.jobs < 1: 42 | raise ValueError("The number of jobs must be positive!") 43 | 44 | return { 45 | "INPUT" : args.INPUT, 46 | "rundir" : args.rundir, 47 | "sensitivity" : args.sensitivity, 48 | "maxploidy" : args.maxploidy, 49 | "upperk" : args.upperk, 50 | "seed" : args.seed, 51 | "jobs" : args.jobs 52 | } 53 | 54 | 55 | def main(): 56 | log('Parsing and checking arguments', level='PROGRESS') 57 | args = parse_args() 58 | log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO') 59 | 60 | log('Setting directories', level='PROGRESS') 61 | dcal, 
dclo, dplo = setup(args, force=False) 62 | def get_comp(name): 63 | comp = os.path.join(src, name) 64 | if not os.path.isfile(comp): 65 | raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src)) 66 | return comp 67 | 68 | log('Calling', level='PROGRESS') 69 | cmd = 'python2.7 {} {} -A {} -P {} -K {} -j {}' 70 | cmd = cmd.format(get_comp('Caller.py'), args['INPUT'], args['sensitivity'], args['maxploidy'], args['upperk'], args['jobs']) 71 | if args['seed'] is not None: 72 | cmd += " --seed {}".format(args['seed']) 73 | runcmd(cmd, dcal, out='calls.tsv') 74 | calls = os.path.join(dcal, 'calls.tsv') 75 | 76 | log('Cloning', level='PROGRESS') 77 | cmd = 'python2.7 {} {}' 78 | cmd = cmd.format(get_comp('Cloner.py'), calls) 79 | if args['seed'] is not None: 80 | cmd += " --seed {}".format(args['seed']) 81 | runcmd(cmd, dclo, out='mapping.tsv') 82 | mapping = os.path.join(dclo, 'mapping.tsv') 83 | 84 | log('Plotting', level='PROGRESS') 85 | os.chdir(dplo) 86 | up = (lambda f : os.path.join(os.pardir, f)) 87 | cmd = 'python2.7 {} {} -m {}' 88 | cmd = cmd.format(os.path.join(src, 'Plotter.py'), up(calls), up(mapping)) 89 | runcmd(cmd, './') 90 | os.chdir(os.pardir) 91 | 92 | 93 | def setup(args, force=True): 94 | dcal = os.path.join(args['rundir'], 'calls') 95 | if os.path.isdir(dcal): 96 | log("The calls sub-directory in the running directory already exists, results will be overwritten!", level='WARN') 97 | else: 98 | os.mkdir(dcal) 99 | 100 | dclo = os.path.join(args['rundir'], 'clones') 101 | if os.path.isdir(dclo): 102 | log("The clones sub-directory in the running directory already exists, results will be overwritten!", level='WARN') 103 | else: 104 | os.mkdir(dclo) 105 | 106 | dplo = os.path.join(args['rundir'], 'plots') 107 | if os.path.isdir(dplo): 108 | log("The plots sub-directory in the running directory already exists, results will be overwritten!\n", level='WARN') 109 | else: 110 | os.mkdir(dplo) 111 | 112 | 
return dcal, dclo, dplo 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_cloning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os 4 | import argparse 5 | import subprocess as sp 6 | import multiprocessing as mp 7 | import shlex 8 | import datetime 9 | import re 10 | 11 | import chisel 12 | 13 | src = os.path.dirname(chisel.__file__) 14 | from ..Utils import * 15 | 16 | 17 | def parse_args(): 18 | description = "CHISEL command to run the pipeline starting from inferred copy numbers." 19 | parser = argparse.ArgumentParser(description=description) 20 | parser.add_argument("INPUT", nargs='?', default='calls/calls.tsv', type=str, help="Input file with combined RDR and BAF per bin and per cell") 21 | parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)") 22 | parser.add_argument("-f", "--maxdiff", required=False, type=float, default=0.06, help="Maximum haplotype-specific distance between the genome of cells in the same clone (default: 0.06, when -1 is chosen the maximum cluster method of SciPy is used)") 23 | parser.add_argument("-s", "--minsize", required=False, type=int, default=14, help="Minimum number of cells in a subpopulation to define a clone (default: 14)") 24 | parser.add_argument("-r", "--refinement", required=False, type=float, default=None, help="Maximum difference to assign noisy cells to the closest clone (default: 0.0, note that 1.0 can be used to force the assigment of all cells)") 25 | parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)") 26 | args = parser.parse_args() 27 | 28 | if not os.path.isfile(args.INPUT): 29 | raise ValueError("Input file does not exist: {}".format(args.INPUT)) 30 | if not 
os.path.isdir(args.rundir): 31 | raise ValueError("Running directory does not exists: {}".format(args.rundir)) 32 | if args.seed and args.seed < 1: 33 | raise ValueError("The random seed must be positive!") 34 | if (args.maxdiff < 0.0 and args.maxdiff != 1.0) or args.maxdiff > 1.0: 35 | raise ValueError("Maximum distance must be in [0, 1] or equal to -1!") 36 | if args.minsize < 0: 37 | raise ValueError("Minimum number of cells in a clone must be positive!") 38 | 39 | return { 40 | "INPUT" : args.INPUT, 41 | "rundir" : args.rundir, 42 | "maxdiff" : args.maxdiff, 43 | "minsize" : args.minsize, 44 | "refinement" : args.refinement, 45 | "seed" : args.seed 46 | } 47 | 48 | 49 | def main(): 50 | log('Parsing and checking arguments', level='PROGRESS') 51 | args = parse_args() 52 | log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO') 53 | 54 | log('Setting directories', level='PROGRESS') 55 | dclo, dplo = setup(args, force=False) 56 | def get_comp(name): 57 | comp = os.path.join(src, name) 58 | if not os.path.isfile(comp): 59 | raise ValueError("{} not found in src directory of bin i.e. 
{}, is anything been moved?".format(name, src)) 60 | return comp 61 | 62 | log('Cloning', level='PROGRESS') 63 | cmd = 'python2.7 {} {} -f {} -s {}' 64 | cmd = cmd.format(get_comp('Cloner.py'), args['INPUT'], args['maxdiff'], args['minsize']) 65 | if args['refinement'] is not None: 66 | cmd += " -r {}".format(args['refinement']) 67 | if args['seed'] is not None: 68 | cmd += " --seed {}".format(args['seed']) 69 | runcmd(cmd, dclo, out='mapping.tsv') 70 | mapping = os.path.join(dclo, 'mapping.tsv') 71 | 72 | log('Plotting', level='PROGRESS') 73 | os.chdir(dplo) 74 | up = (lambda f : os.path.join(os.pardir, f)) 75 | cmd = 'python2.7 {} {} -m {}' 76 | cmd = cmd.format(os.path.join(src, 'Plotter.py'), up(args['INPUT']), up(mapping)) 77 | runcmd(cmd, './') 78 | os.chdir(os.pardir) 79 | 80 | 81 | def setup(args, force=True): 82 | dclo = os.path.join(args['rundir'], 'clones') 83 | if os.path.isdir(dclo): 84 | log("The clones sub-directory in the running directory already exists, results will be overwritten!", level='WARN') 85 | else: 86 | os.mkdir(dclo) 87 | 88 | dplo = os.path.join(args['rundir'], 'plots') 89 | if os.path.isdir(dplo): 90 | log("The plots sub-directory in the running directory already exists, results will be overwritten!", level='WARN') 91 | else: 92 | os.mkdir(dplo) 93 | 94 | return dclo, dplo 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_combocall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | os.environ["OMP_NUM_THREADS"] = "1" 5 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 6 | os.environ["MKL_NUM_THREADS"] = "1" 7 | os.environ["VECLIB_MAXIMUM_THREADS"] = "1" 8 | os.environ["NUMEXPR_NUM_THREADS"] = "1" 9 | import argparse 10 | from subprocess import Popen 11 | import chisel 12 | 13 | src = os.path.dirname(chisel.__file__) 14 | from ..Utils import * 
def parse_args():
    """Parse and validate command-line arguments for the combo-call pipeline.

    Returns:
        dict: validated options keyed by canonical option names.

    Raises:
        ValueError: when an input directory, a required file, or a numeric
            option fails validation.
    """
    description = "CHISEL command to run the complete pipeline starting from RDRs and BAFs for one or multiple samples from previously executions of CHISEL or CHISEl preprocess."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("INPUT", type=str, nargs='+', help="One or multiple CHISEL directory runs for different samples from which to combine RDRs and BAFs")
    parser.add_argument("-r", "--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("--names", required=False, default=None, type=str, nargs='+', help="Sample names when combining multiple samples (default: idx used)")
    parser.add_argument("-x", "--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-p", "--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)")
    parser.add_argument("-K", "--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)")
    parser.add_argument("--addgccorr", required=False, default=False, action='store_true', help="Add additional custome correction for GC bias (default: disabled)")
    parser.add_argument("--nophasecorr", required=False, default=False, action='store_true', help="Disable correction for given phasing bias (default: enabled)")
    parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
    parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
    parser.add_argument("-j", "--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)")
    args = parser.parse_args()

    # Every input run must provide the previously computed RDR, total-read
    # and BAF tables.
    for indir in args.INPUT:
        if not os.path.isdir(indir):
            raise ValueError("Input directory does not exists: {}".format(indir))
        required = (
            (os.path.join(indir, 'rdr', 'rdr.tsv'), 'RDR'),
            (os.path.join(indir, 'rdr', 'total.tsv'), 'Total read'),
            (os.path.join(indir, 'baf', 'baf.tsv'), 'BAF'),
        )
        for path, label in required:
            if not os.path.isfile(path):
                raise ValueError("Input directory does not contain {} file: {}".format(label, path))
    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exists: {}".format(args.rundir))
    # BUGFIX: explicit None check; the original `if args.seed and ...` let
    # `--seed 0` slip through because 0 is falsy.
    if args.seed is not None and args.seed < 1:
        raise ValueError("The random seed must be positive!")
    # NOTE(review): the check enforces maxploidy >= 3 while the message says
    # "at least 2" -- kept as in the original, confirm the intended bound.
    if args.maxploidy < 3:
        raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 2!")
    if args.upperk < 1:
        raise ValueError("The maximum number of clusters must be positive!")

    if not os.path.isfile(args.reference):
        raise ValueError(error("Reference genome file does not exist: {}".format(args.reference)))
    refidx = ['{}.{}'.format(args.reference, ix) for ix in ['amb', 'ann', 'bwt', 'pac', 'sa']]
    if not all(os.path.isfile(f) for f in refidx):
        raise ValueError(error("Some of the BWA index files are missing, please make sure these are available and generated through the command \n\t``bwa index {}''.\n Expected files are: {}".format(args.reference, '\n'.join(refidx))))

    # Parse the haplotype-block size, accepting an optional "kb"/"Mb" suffix.
    try:
        if args.blocksize[-2:] == "kb":
            blocksize = int(args.blocksize[:-2]) * 1000
        elif args.blocksize[-2:] == "Mb":
            blocksize = int(args.blocksize[:-2]) * 1000000
        else:
            blocksize = int(args.blocksize)
    except ValueError:  # narrowed from a bare `except:` that hid real errors
        raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")

    if not args.jobs:
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    bcftools = args.bcftools or "bcftools"
    if which(bcftools) is None:
        raise ValueError("bcftools has not been found or is not executable!")

    samtools = args.samtools or "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    return {
        "input" : args.INPUT,
        "reference" : os.path.abspath(args.reference),
        "names" : args.names,
        "rundir" : args.rundir,
        "blocksize" : blocksize,
        "chromosomes" : args.chromosomes,
        "addgccorr" : args.addgccorr,
        "phasecorr" : not args.nophasecorr,
        "bcftools" : bcftools,
        "samtools" : samtools,
        "maxploidy" : args.maxploidy,
        "upperk" : args.upperk,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "seed" : args.seed,
        "jobs" : args.jobs
    }


def main():
    """Run the combo-call pipeline: aggregate, combine, call, clone, plot."""
    log('Parsing and checking arguments', level='PROGRESS')
    args = parse_args()
    log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO')

    log('Setting directories', level='PROGRESS')
    dbaf, drdr, dcom, dcal, dclo, dplo = setup(args)

    def get_comp(name):
        # Resolve a pipeline component shipped next to the chisel package.
        comp = os.path.join(src, name)
        if not os.path.isfile(comp):
            raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src))
        return comp

    # Aggregation appends to these tables, so refuse to run on leftovers.
    lcel = os.path.join(drdr, 'total.tsv')
    if os.path.isfile(lcel):
        raise ValueError("Total read file {} already exists, please remove it or it'd get overwritten!".format(lcel))
    rdr = os.path.join(drdr, 'rdr.tsv')
    if os.path.isfile(rdr):
        raise ValueError("RDR file {} already exists, please remove it or it'd get overwritten!".format(rdr))
    baf = os.path.join(dbaf, 'baf.tsv')
    if os.path.isfile(baf):
        raise ValueError("BAF file {} already exists, please remove it or it'd get overwritten!".format(baf))

    log('Aggregating previously-computed RDRs and BAFs', level='PROGRESS')
    aggregate(rdr, lcel, baf, args['input'], args['names'])

    log('Combining RDRs and BAFs', level='PROGRESS')
    cmd = 'python2.7 {} -r {} -b {} -j {} -k {} -l {}'.format(get_comp('Combiner.py'), rdr, baf, args['jobs'], args['blocksize'], lcel)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    if args['addgccorr']:
        cmd += " --gccorr {}".format(args['reference'])
    if not args['phasecorr']:
        cmd += " --nophasecorr"
    runcmd(cmd, dcom, out='combo.tsv')
    com = os.path.join(dcom, 'combo.tsv')

    log('Calling', level='PROGRESS')
    cmd = 'python2.7 {} {} -P {} -K {} -j {}'.format(get_comp('Caller.py'), com, args['maxploidy'], args['upperk'], args['jobs'])
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dcal, out='calls.tsv')
    calls = os.path.join(dcal, 'calls.tsv')

    log('Cloning', level='PROGRESS')
    cmd = 'python2.7 {} {}'.format(get_comp('Cloner.py'), calls)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dclo, out='mapping.tsv')
    mapping = os.path.join(dclo, 'mapping.tsv')

    log('Plotting', level='PROGRESS')
    # Plotter runs from inside the plots directory, so its inputs are
    # addressed through the parent directory.
    os.chdir(dplo)
    up = (lambda f : os.path.join(os.pardir, f))
    cmd = 'python2.7 {} {} -m {}'.format(os.path.join(src, 'Plotter.py'), up(calls), up(mapping))
    runcmd(cmd, './')
    os.chdir(os.pardir)


def aggregate(rdr, lcel, baf, input_dirs, names):
    """Append per-sample RDR, total-read and BAF tables into the combined
    tables, prefixing cell barcodes with the sample name.

    Arguments:
        rdr, lcel, baf: destination paths for the combined tables.
        input_dirs: CHISEL run directories to aggregate from.
        names: optional sample names; positional indices are used when the
            list is missing or has the wrong length.
    """
    if names is None or len(names) != len(input_dirs):
        names = list(range(len(input_dirs)))

    def append_prefixed(src_path, dst_path, name, column, width):
        # Pure-Python re-implementation of the original `awk ... >> file`
        # one-liner (BUGFIX: the shell=True version broke, and was injectable,
        # for sample names containing spaces or quotes): emit the first
        # `width` whitespace-separated fields tab-separated, prefixing field
        # `column` with "<name>_".
        with open(src_path) as fsrc, open(dst_path, 'a') as fdst:
            for line in fsrc:
                fields = (line.split() + [''] * width)[:width]
                fields[column] = '{}_{}'.format(name, fields[column])
                fdst.write('\t'.join(fields) + '\n')

    for indir, name in zip(input_dirs, names):
        log('Aggregating RDRs for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'rdr', 'rdr.tsv'), rdr, name, 3, 7)
    for indir, name in zip(input_dirs, names):
        log('Aggregating Total reads for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'rdr', 'total.tsv'), lcel, name, 0, 2)
    for indir, name in zip(input_dirs, names):
        log('Aggregating BAFs for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'baf', 'baf.tsv'), baf, name, 2, 5)
    return


def setup(args):
    """Create (if needed) and return the six working sub-directories
    (baf, rdr, combo, calls, clones, plots) inside the running directory."""
    subdirs = ['baf', 'rdr', 'combo', 'calls', 'clones', 'plots']
    if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in subdirs):
        log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN')
    paths = []
    for sub in subdirs:
        d = os.path.join(args['rundir'], sub)
        if not os.path.isdir(d):
            os.mkdir(d)
        paths.append(d)
    return tuple(paths)


if __name__ == '__main__':
    main()
# --------------------------------------------------------------------------
# /src/chisel/bin/chisel_main.py
# --------------------------------------------------------------------------
#!/usr/bin/env python2.7

import os, sys
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
import argparse
import chisel

src = os.path.dirname(chisel.__file__)
from ..Utils import *
def parse_args():
    """Parse and validate command-line arguments for the complete pipeline.

    Returns:
        dict: validated options keyed by canonical option names.

    Raises:
        ValueError: when any path, executable, or numeric option fails
            validation.
    """
    description = "CHISEL command to run the complete pipeline starting from the 4 required data: (1) Barcoded single-cell BAM; (2) Matched-normal BAM; (3) Reference genome; (4) Phased VCF."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-x", "--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-t", "--tumor", required=True, type=str, help="Barcoded single-cell BAM file")
    parser.add_argument("-n", "--normal", required=True, type=str, help="Matched-normal BAM file")
    parser.add_argument("-r", "--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("-l", "--listphased", type=str, required=True, help="Phased SNPs file (lines of heterozygous germline SNPs must contain either 0|1 or 1|0)")
    parser.add_argument("-b", "--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"")
    parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-m", "--minreads", type=int, required=False, default=300000, help="Minimum number total reads to select cells (default: 300000)")
    parser.add_argument("-p", "--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)")
    parser.add_argument("-K", "--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)")
    parser.add_argument("--addgccorr", required=False, default=False, action='store_true', help="Add additional custome correction for GC bias (default: disabled)")
    parser.add_argument("--nophasecorr", required=False, default=False, action='store_true', help="Disable correction for given phasing bias (default: enabled)")
    parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
    parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
    parser.add_argument("-j", "--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)")
    args = parser.parse_args()

    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exists: {}".format(args.rundir))
    if not os.path.isfile(args.tumor):
        raise ValueError("Barcoded single-cell BAM file does not exist: {}".format(args.tumor))
    if not os.path.isfile(args.normal):
        raise ValueError("Matched-normal BAM file does not exist: {}".format(args.normal))
    if not os.path.isfile(args.reference):
        raise ValueError("Reference genome file does not exist: {}".format(args.reference))
    if not os.path.isfile(args.listphased):
        raise ValueError("Phased SNPs file does not exist: {}".format(args.listphased))
    # BUGFIX: explicit None check; the original `if args.seed and ...` let
    # `--seed 0` slip through because 0 is falsy.
    if args.seed is not None and args.seed < 1:
        raise ValueError("The random seed must be positive!")
    if args.minreads < 1:
        raise ValueError("The minimum number of reads must be positive!")
    # NOTE(review): the check enforces maxploidy >= 3 while the message says
    # "at least 2" -- kept as in the original, confirm the intended bound.
    if args.maxploidy < 3:
        raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 2!")
    if args.upperk < 1:
        raise ValueError("The maximum number of clusters must be positive!")

    # Both size options share the same "<int>[kb|Mb]" grammar.
    size = _parse_size(args.size)
    blocksize = _parse_size(args.blocksize)

    if not args.jobs:
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    bcftools = args.bcftools or "bcftools"
    if which(bcftools) is None:
        raise ValueError("bcftools has not been found or is not executable!")

    samtools = args.samtools or "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    return {
        "rundir" : args.rundir,
        "tumor" : args.tumor,
        "normal" : args.normal,
        "reference" : args.reference,
        "listphased" : args.listphased,
        "binsize" : size,
        "blocksize" : blocksize,
        "chromosomes" : args.chromosomes,
        "minreads" : args.minreads,
        "addgccorr" : args.addgccorr,
        "phasecorr" : not args.nophasecorr,
        "bcftools" : bcftools,
        "samtools" : samtools,
        "maxploidy" : args.maxploidy,
        "upperk" : args.upperk,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "seed" : args.seed,
        "jobs" : args.jobs
    }


def _parse_size(text):
    """Convert a size such as "5Mb", "50kb" or "1000" to an integer of bp.

    Shared by --size and --blocksize, which previously duplicated this
    parsing with a bare `except:` each.
    """
    try:
        if text[-2:] == "kb":
            return int(text[:-2]) * 1000
        if text[-2:] == "Mb":
            return int(text[:-2]) * 1000000
        return int(text)
    except ValueError:
        raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")


def main():
    """Run the full pipeline: RDRs, BAFs, combine, call, clone, plot."""
    log('Parsing and checking arguments', level='PROGRESS')
    args = parse_args()
    log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO')

    log('Setting directories', level='PROGRESS')
    dbaf, drdr, dcom, dcal, dclo, dplo = setup(args)

    def get_comp(name):
        # Resolve a pipeline component shipped next to the chisel package.
        comp = os.path.join(src, name)
        if not os.path.isfile(comp):
            raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src))
        return comp

    log('Computing RDRs', level='PROGRESS')
    cmd = 'python2.7 {} -n {} -t {} -r {} -b {} -m {} -j {} -c \"{}\" --outdir {}'.format(get_comp('RDREstimator.py'), args['normal'], args['tumor'], args['reference'], args['binsize'], args['minreads'], args['jobs'], args['chromosomes'], drdr)
    if args['samtools'] is not None:
        cmd += " -s {}".format(args['samtools'])
    cmd += " --cellprefix {}".format(args['cellprefix'])
    if args['cellsuffix'] is not None:
        cmd += " --cellsuffix {}".format(args['cellsuffix'])
    runcmd(cmd, drdr, out='rdr.tsv')
    lcel = os.path.join(drdr, 'total.tsv')
    rdr = os.path.join(drdr, 'rdr.tsv')

    log('Computing BAFs', level='PROGRESS')
    cmd = 'python2.7 {} -n {} -t {} -r {} -j {} -c {} -l {}'.format(get_comp('BAFEstimator.py'), args['normal'], args['tumor'], args['reference'], args['jobs'], lcel, args['listphased'])
    if args['samtools'] is not None:
        cmd += " -s {}".format(args['samtools'])
    if args['bcftools'] is not None:
        cmd += " -b {}".format(args['bcftools'])
    cmd += " --cellprefix {}".format(args['cellprefix'])
    if args['cellsuffix'] is not None:
        cmd += " --cellsuffix {}".format(args['cellsuffix'])
    runcmd(cmd, dbaf, out='baf.tsv')
    baf = os.path.join(dbaf, 'baf.tsv')

    log('Combining RDRs and BAFs', level='PROGRESS')
    cmd = 'python2.7 {} -r {} -b {} -j {} -k {} -l {}'.format(get_comp('Combiner.py'), rdr, baf, args['jobs'], args['blocksize'], lcel)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    if args['addgccorr']:
        cmd += " --gccorr {}".format(args['reference'])
    if not args['phasecorr']:
        cmd += " --nophasecorr"
    runcmd(cmd, dcom, out='combo.tsv')
    com = os.path.join(dcom, 'combo.tsv')

    log('Calling', level='PROGRESS')
    cmd = 'python2.7 {} {} -P {} -K {} -j {}'.format(get_comp('Caller.py'), com, args['maxploidy'], args['upperk'], args['jobs'])
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dcal, out='calls.tsv')
    calls = os.path.join(dcal, 'calls.tsv')

    log('Cloning', level='PROGRESS')
    cmd = 'python2.7 {} {}'.format(get_comp('Cloner.py'), calls)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dclo, out='mapping.tsv')
    mapping = os.path.join(dclo, 'mapping.tsv')

    log('Plotting', level='PROGRESS')
    # Plotter runs from inside the plots directory, so its inputs are
    # addressed through the parent directory.
    os.chdir(dplo)
    up = (lambda f : os.path.join(os.pardir, f))
    cmd = 'python2.7 {} {} -m {}'.format(os.path.join(src, 'Plotter.py'), up(calls), up(mapping))
    runcmd(cmd, './')
    os.chdir(os.pardir)


def setup(args):
    """Create (if needed) and return the six working sub-directories
    (baf, rdr, combo, calls, clones, plots) inside the running directory."""
    subdirs = ['baf', 'rdr', 'combo', 'calls', 'clones', 'plots']
    if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in subdirs):
        log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN')
    paths = []
    for sub in subdirs:
        d = os.path.join(args['rundir'], sub)
        if not os.path.isdir(d):
            os.mkdir(d)
        paths.append(d)
    return tuple(paths)


if __name__ == '__main__':
    main()
# --------------------------------------------------------------------------
# /src/chisel/bin/chisel_nonormal_combocall.py
# --------------------------------------------------------------------------
#!/usr/bin/env python2.7

import os, sys
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
import argparse
from subprocess import Popen
import chisel

src = os.path.dirname(chisel.__file__)
from ..Utils import *


def parse_args():
    """Parse and validate command-line arguments for the no-normal
    combo-call pipeline.

    Returns:
        dict: validated options keyed by canonical option names.

    Raises:
        ValueError: when an input directory, a required file, or a numeric
            option fails validation.
    """
    description = "CHISEL command to run the complete pipeline starting from RDRs and BAFs for one or multiple samples from previously executions of CHISEL or CHISEl preprocess."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("INPUT", type=str, nargs='+', help="One or multiple CHISEL directory runs for different samples from which to combine RDRs and BAFs")
    parser.add_argument("-r", "--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("--names", required=False, default=None, type=str, nargs='+', help="Sample names when combining multiple samples (default: idx used)")
    parser.add_argument("-x", "--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-p", "--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)")
    parser.add_argument("-K", "--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)")
    parser.add_argument("--minimumsnps", required=False, type=float, default=0.08, help="Minimum SNP density per kb (default: 0.08)")
    parser.add_argument("--missingsnps", required=False, type=str, default="10,0", help="A,B counts for genomic bins without minimum minimum SNP density (default: 10,0 i.e. BAF=0)")
    parser.add_argument("--nogccorr", required=False, default=False, action='store_true', help="Disable correction for GC bias (default: enabled)")
    parser.add_argument("--nophasecorr", required=False, default=False, action='store_true', help="Disable correction for given phasing bias (default: enabled)")
    parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
    parser.add_argument("--simcov", required=False, type=float, default=2, help="Sequencing fold coverage of simulated normal BAM file (default: 2)")
    parser.add_argument("--binstats", required=False, type=int, default=None, help="Number of bins to sample per chromosome to estimate sequencing stats (default: all are used, fix a number for improving speed)")
    parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
    parser.add_argument("-j", "--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)")
    args = parser.parse_args()

    # Every input run must provide the previously computed RDR, total-read
    # and BAF tables.
    for indir in args.INPUT:
        if not os.path.isdir(indir):
            raise ValueError("Input directory does not exists: {}".format(indir))
        required = (
            (os.path.join(indir, 'rdr', 'rdr.tsv'), 'RDR'),
            (os.path.join(indir, 'rdr', 'total.tsv'), 'Total read'),
            (os.path.join(indir, 'baf', 'baf.tsv'), 'BAF'),
        )
        for path, label in required:
            if not os.path.isfile(path):
                raise ValueError("Input directory does not contain {} file: {}".format(label, path))
    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exists: {}".format(args.rundir))
    # BUGFIX: explicit None check; the original `if args.seed and ...` let
    # `--seed 0` slip through because 0 is falsy.
    if args.seed is not None and args.seed < 1:
        raise ValueError("The random seed must be positive!")
    # NOTE(review): the check enforces maxploidy >= 3 while the message says
    # "at least 2" -- kept as in the original, confirm the intended bound.
    if args.maxploidy < 3:
        raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 2!")
    if args.upperk < 1:
        raise ValueError("The maximum number of clusters must be positive!")
    if args.minimumsnps < 0.0:
        raise ValueError("The minimum SNP density must be >= 0.0!")
    # NOTE(review): these two checks require strictly positive values while
    # their messages say ">= 0.0"; kept as in the original, confirm wording.
    if args.simcov <= 0.0:
        raise ValueError("The sequencing coverage of simulated normal must be >= 0.0!")
    if args.binstats is not None and args.binstats <= 0:
        raise ValueError("The number of bins for sequencing stats must be >= 0.0!")

    if not os.path.isfile(args.reference):
        raise ValueError(error("Reference genome file does not exist: {}".format(args.reference)))
    refidx = ['{}.{}'.format(args.reference, ix) for ix in ['amb', 'ann', 'bwt', 'pac', 'sa']]
    if not all(os.path.isfile(f) for f in refidx):
        raise ValueError(error("Some of the BWA index files are missing, please make sure these are available and generated through the command \n\t``bwa index {}''.\n Expected files are: {}".format(args.reference, '\n'.join(refidx))))

    # Parse the haplotype-block size, accepting an optional "kb"/"Mb" suffix.
    try:
        if args.blocksize[-2:] == "kb":
            blocksize = int(args.blocksize[:-2]) * 1000
        elif args.blocksize[-2:] == "Mb":
            blocksize = int(args.blocksize[:-2]) * 1000000
        else:
            blocksize = int(args.blocksize)
    except ValueError:  # narrowed from a bare `except:` that hid real errors
        raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")

    if not args.jobs:
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    bcftools = args.bcftools or "bcftools"
    if which(bcftools) is None:
        raise ValueError("bcftools has not been found or is not executable!")

    samtools = args.samtools or "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    # BUGFIX: the original dict literal listed "phasecorr" twice (same value
    # both times); the duplicate has been removed.
    return {
        "input" : args.INPUT,
        "reference" : os.path.abspath(args.reference),
        "names" : args.names,
        "rundir" : args.rundir,
        "blocksize" : blocksize,
        "chromosomes" : args.chromosomes,
        "bcftools" : bcftools,
        "samtools" : samtools,
        "maxploidy" : args.maxploidy,
        "upperk" : args.upperk,
        'minimumsnps' : args.minimumsnps,
        'missingsnps' : args.missingsnps,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "gccorr" : not args.nogccorr,
        "phasecorr" : not args.nophasecorr,
        "simcov" : args.simcov,
        "binstats" : args.binstats,
        "seed" : args.seed,
        "jobs" : args.jobs
    }


def main():
    """Run the no-normal combo-call pipeline: aggregate, combine, call,
    clone, plot."""
    log('Parsing and checking arguments', level='PROGRESS')
    args = parse_args()
    log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO')

    log('Setting directories', level='PROGRESS')
    dbaf, drdr, dcom, dcal, dclo, dplo = setup(args)

    def get_comp(name):
        # Resolve a pipeline component shipped next to the chisel package.
        comp = os.path.join(src, name)
        if not os.path.isfile(comp):
            raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src))
        return comp

    # Aggregation appends to these tables, so refuse to run on leftovers.
    lcel = os.path.join(drdr, 'total.tsv')
    if os.path.isfile(lcel):
        raise ValueError("Total read file {} already exists, please remove it or it'd get overwritten!".format(lcel))
    rdr = os.path.join(drdr, 'rdr.tsv')
    if os.path.isfile(rdr):
        raise ValueError("RDR file {} already exists, please remove it or it'd get overwritten!".format(rdr))
    baf = os.path.join(dbaf, 'baf.tsv')
    if os.path.isfile(baf):
        raise ValueError("BAF file {} already exists, please remove it or it'd get overwritten!".format(baf))

    log('Aggregating previously-computed RDRs and BAFs', level='PROGRESS')
    aggregate(rdr, lcel, baf, args['input'], args['names'])

    log('Combining RDRs and BAFs', level='PROGRESS')
    cmd = 'python2.7 {} -r {} -b {} -j {} -k {} -l {} --minimumsnps {} --missingsnps {}'.format(get_comp('Combiner.py'), rdr, baf, args['jobs'], args['blocksize'], lcel, args['minimumsnps'], args['missingsnps'])
    if args['gccorr']:
        cmd += " --gccorr {}".format(args['reference'])
    if not args['phasecorr']:
        cmd += " --nophasecorr"
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dcom, out='combo.tsv')
    com = os.path.join(dcom, 'combo.tsv')

    log('Calling', level='PROGRESS')
    cmd = 'python2.7 {} {} -P {} -K {} -j {}'.format(get_comp('Caller.py'), com, args['maxploidy'], args['upperk'], args['jobs'])
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dcal, out='calls.tsv')
    calls = os.path.join(dcal, 'calls.tsv')

    log('Cloning', level='PROGRESS')
    cmd = 'python2.7 {} {}'.format(get_comp('Cloner.py'), calls)
    if args['seed'] is not None:
        cmd += " --seed {}".format(args['seed'])
    runcmd(cmd, dclo, out='mapping.tsv')
    mapping = os.path.join(dclo, 'mapping.tsv')

    log('Plotting', level='PROGRESS')
    # Plotter runs from inside the plots directory, so its inputs are
    # addressed through the parent directory.
    os.chdir(dplo)
    up = (lambda f : os.path.join(os.pardir, f))
    cmd = 'python2.7 {} {} -m {}'.format(os.path.join(src, 'Plotter.py'), up(calls), up(mapping))
    runcmd(cmd, './')
    os.chdir(os.pardir)


def aggregate(rdr, lcel, baf, input_dirs, names):
    """Append per-sample RDR, total-read and BAF tables into the combined
    tables, prefixing cell barcodes with the sample name.

    Arguments:
        rdr, lcel, baf: destination paths for the combined tables.
        input_dirs: CHISEL run directories to aggregate from.
        names: optional sample names; positional indices are used when the
            list is missing or has the wrong length.
    """
    if names is None or len(names) != len(input_dirs):
        names = list(range(len(input_dirs)))

    def append_prefixed(src_path, dst_path, name, column, width):
        # Pure-Python re-implementation of the original `awk ... >> file`
        # one-liner (BUGFIX: the shell=True version broke, and was injectable,
        # for sample names containing spaces or quotes): emit the first
        # `width` whitespace-separated fields tab-separated, prefixing field
        # `column` with "<name>_".
        with open(src_path) as fsrc, open(dst_path, 'a') as fdst:
            for line in fsrc:
                fields = (line.split() + [''] * width)[:width]
                fields[column] = '{}_{}'.format(name, fields[column])
                fdst.write('\t'.join(fields) + '\n')

    for indir, name in zip(input_dirs, names):
        log('Aggregating RDRs for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'rdr', 'rdr.tsv'), rdr, name, 3, 7)
    for indir, name in zip(input_dirs, names):
        log('Aggregating Total reads for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'rdr', 'total.tsv'), lcel, name, 0, 2)
    for indir, name in zip(input_dirs, names):
        log('Aggregating BAFs for {} with name {}'.format(indir, name), level='INFO')
        append_prefixed(os.path.join(indir, 'baf', 'baf.tsv'), baf, name, 2, 5)
    return


def setup(args):
    """Create (if needed) and return the six working sub-directories
    (baf, rdr, combo, calls, clones, plots) inside the running directory."""
    subdirs = ['baf', 'rdr', 'combo', 'calls', 'clones', 'plots']
    if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in subdirs):
        log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN')
    paths = []
    for sub in subdirs:
        d = os.path.join(args['rundir'], sub)
        if not os.path.isdir(d):
            os.mkdir(d)
        paths.append(d)
    return tuple(paths)


if __name__ == '__main__':
    main()
# --------------------------------------------------------------------------
# /src/chisel/bin/chisel_plotting.py
# --------------------------------------------------------------------------
#!/usr/bin/env python2.7

import os
import argparse
import subprocess as sp
import multiprocessing as mp
import shlex
import datetime
import re

import chisel

src = os.path.dirname(chisel.__file__)
from ..Utils import *
def parse_args():
    """Parse and validate the command-line arguments of chisel-plotting.

    Returns:
        dict of validated values keyed by the names used in main().

    Raises:
        ValueError -- when the input file or clone map is missing, the figure
                      format is unknown, or the sample size is not positive.
    """
    description = "CHISEL command to re-create the plots."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("INPUT", nargs='?', default='calls/calls.tsv', type=str, help="Input file with inferred copy numbers (default: calls/calls.tsv)")
    # FIX: the help previously claimed "default: not used" although the
    # declared default is clones/mapping.tsv.
    parser.add_argument("-m", "--clonemap", required=False, type=str, default='clones/mapping.tsv', help="Clone map (default: clones/mapping.tsv)")
    parser.add_argument("-f", "--figformat", required=False, type=str, default='png', help="Format of output figures (default: png, the only other option is pdf)")
    parser.add_argument("-s", "--sample", required=False, type=int, default=20, help="Number of cells to sample (default: 20)")
    parser.add_argument("--excludenoisy", required=False, default=False, action='store_true', help="Exclude noisy cells from plots (default: False)")
    parser.add_argument("--gridsize", required=False, type=str, default='12,6', help="Grid dimensions specified as comma-separated numbers (default: 12,6)")
    parser.add_argument("--plotsize", required=False, type=str, default='5,1.5', help="Plot dimensions for RDR-BAF plots, specified as comma-separated numbers (default: 5,1.5)")
    parser.add_argument("--clussize", required=False, type=str, default='5,3', help="Grid dimensions for clustered plots, specified as comma-separated numbers (default: 5,3)")
    parser.add_argument("--xmax", required=False, type=float, default=None, help="Maximum x-axis value (default: None)")
    parser.add_argument("--xmin", required=False, type=float, default=None, help="Minimum x-axis value (default: None)")
    # BUGFIX: the two y-axis help strings previously said "x-axis" (copy-paste).
    parser.add_argument("--ymax", required=False, type=float, default=None, help="Maximum y-axis value (default: None)")
    parser.add_argument("--ymin", required=False, type=float, default=None, help="Minimum y-axis value (default: None)")
    args = parser.parse_args()

    if not os.path.isfile(args.INPUT):
        raise ValueError('ERROR: input file {} does not exist!'.format(args.INPUT))
    if args.clonemap and not os.path.isfile(args.clonemap):
        raise ValueError('ERROR: the provided clone map does not exist!')
    if args.figformat not in ['pdf', 'png']:
        raise ValueError('ERROR: figure format must be either pdf or png!')
    if args.sample < 1:
        raise ValueError('ERROR: number of sampled cells must be positive!')

    return {
        'input' : args.INPUT,
        'clonemap' : args.clonemap,
        'format' : args.figformat,
        'sample' : args.sample,
        'nonoisy' : args.excludenoisy,
        'gridsize' : args.gridsize,
        'plotsize' : args.plotsize,
        'clussize' : args.clussize,
        'xmax' : args.xmax,
        'xmin' : args.xmin,
        'ymax' : args.ymax,
        'ymin' : args.ymin
    }
{}'.format(args['ymin']) 89 | 90 | runcmd(cmd, './') 91 | os.chdir(os.pardir) 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os, sys 4 | os.environ["OMP_NUM_THREADS"] = "1" 5 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 6 | os.environ["MKL_NUM_THREADS"] = "1" 7 | os.environ["VECLIB_MAXIMUM_THREADS"] = "1" 8 | os.environ["NUMEXPR_NUM_THREADS"] = "1" 9 | import argparse 10 | import chisel 11 | 12 | src = os.path.dirname(chisel.__file__) 13 | from ..Utils import * 14 | 15 | 16 | def parse_args(): 17 | description = "Preprocess CHISEL command to compute RDRs and BAFs preprocess data from standard CHISEL inputs." 18 | parser = argparse.ArgumentParser(description=description) 19 | parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)") 20 | parser.add_argument("-t","--tumor", required=True, type=str, help="Barcoded single-cell BAM file") 21 | parser.add_argument("-n","--normal", required=True, type=str, help="Matched-normal BAM file") 22 | parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome") 23 | parser.add_argument("-l","--listphased", type=str, required=True, help="Phased SNPs file (lines of heterozygous germline SNPs must contain either 0|1 or 1|0)") 24 | parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"") 25 | parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)") 26 | parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of 
def _parse_genomic_size(text, label):
    """Convert a size string, optionally suffixed with 'kb' or 'Mb', to bp.

    Arguments:
        text  -- the user-provided size string
        label -- option name used in the error message

    Raises:
        ValueError -- when text is not a number with an optional suffix.
    """
    try:
        if text[-2:] == "kb":
            return int(text[:-2]) * 1000
        elif text[-2:] == "Mb":
            return int(text[:-2]) * 1000000
        else:
            return int(text)
    except (ValueError, TypeError):
        raise ValueError("{} must be a number, optionally ending with either \"kb\" or \"Mb\"!".format(label))


def parse_args():
    """Parse and validate the command-line arguments of chisel-preprocess.

    Returns:
        dict of validated values keyed by the names used in main().

    Raises:
        ValueError -- on missing files/directories, out-of-range numeric
                      options, malformed sizes, or missing executables.
    """
    description = "Preprocess CHISEL command to compute RDRs and BAFs from standard CHISEL inputs."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-t","--tumor", required=True, type=str, help="Barcoded single-cell BAM file")
    parser.add_argument("-n","--normal", required=True, type=str, help="Matched-normal BAM file")
    parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("-l","--listphased", type=str, required=True, help="Phased SNPs file (lines of heterozygous germline SNPs must contain either 0|1 or 1|0)")
    parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"")
    parser.add_argument("-k", "--blocksize", required=False, type=str, default="50kb", help="Size of the haplotype blocks (default: 50kb, use 0 to disable)")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separated list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-m","--minreads", type=int, required=False, default=300000, help="Minimum total number of reads to select cells (default: 300000)")
    parser.add_argument("-p","--maxploidy", required=False, type=int, default=4, help="Maximum total copy number to consider for balanced cluster (default: 4, corresponding to a WGD)")
    parser.add_argument("-K","--upperk", required=False, type=int, default=100, help="Maximum number of bin clusters (default: 100, use 0 to consider maximum number of clusters)")
    parser.add_argument("--addgccorr", required=False, default=False, action='store_true', help="Add additional custom correction for GC bias (default: disabled)")
    parser.add_argument("--nophasecorr", required=False, default=False, action='store_true', help="Disable correction for given phasing bias (default: enabled)")
    parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
    parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
    parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallel jobs to use (default: equal to number of available processors)")
    args = parser.parse_args()

    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exist: {}".format(args.rundir))
    if not os.path.isfile(args.tumor):
        raise ValueError("Barcoded single-cell BAM file does not exist: {}".format(args.tumor))
    if not os.path.isfile(args.normal):
        raise ValueError("Matched-normal BAM file does not exist: {}".format(args.normal))
    if not os.path.isfile(args.reference):
        raise ValueError("Reference genome file does not exist: {}".format(args.reference))
    if not os.path.isfile(args.listphased):
        raise ValueError("Phased SNPs file does not exist: {}".format(args.listphased))
    if args.seed and args.seed < 1:
        raise ValueError("The random seed must be positive!")
    if args.minreads < 1:
        raise ValueError("The minimum number of reads must be positive!")
    # BUGFIX: the guard enforces maxploidy >= 3 but the message claimed
    # "at least 2"; the message now matches the enforced bound.
    if args.maxploidy < 3:
        raise ValueError("The maximum total copy number to consider for balanced cluster must be at least 3!")
    if args.upperk < 1:
        raise ValueError("The maximum number of clusters must be positive!")

    # Both -b/--size and -k/--blocksize share the same suffix grammar; the
    # blocksize error message previously said "Size" for both (copy-paste).
    size = _parse_genomic_size(args.size, "Size")
    blocksize = _parse_genomic_size(args.blocksize, "Block size")

    if not args.jobs:
        # NOTE(review): `mp` is not imported by this module directly; it is
        # presumably re-exported by the star import from Utils -- confirm.
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    bcftools = args.bcftools
    if not bcftools:
        bcftools = "bcftools"
    if which(bcftools) is None:
        raise ValueError("bcftools has not been found or is not executable!")

    samtools = args.samtools
    if not samtools:
        samtools = "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    return {
        "rundir" : args.rundir,
        "tumor" : args.tumor,
        "normal" : args.normal,
        "reference" : args.reference,
        "listphased" : args.listphased,
        "binsize" : size,
        "blocksize" : blocksize,
        "chromosomes" : args.chromosomes,
        "minreads" : args.minreads,
        "addgccorr" : args.addgccorr,
        "phasecorr" : not args.nophasecorr,
        "bcftools" : bcftools,
        "samtools" : samtools,
        "maxploidy" : args.maxploidy,
        "upperk" : args.upperk,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "seed" : args.seed,
        "jobs" : args.jobs
    }
{}, is anything been moved?".format(name, src)) 132 | return comp 133 | 134 | log('Computing RDRs', level='PROGRESS') 135 | cmd = 'python2.7 {} -n {} -t {} -r {} -b {} -m {} -j {} -c \"{}\" --outdir {}' 136 | cmd = cmd.format(get_comp('RDREstimator.py'), args['normal'], args['tumor'], args['reference'], args['binsize'], args['minreads'], args['jobs'], args['chromosomes'], drdr) 137 | if args['samtools'] is not None: 138 | cmd += " -s {}".format(args['samtools']) 139 | cmd += " --cellprefix {}".format(args['cellprefix']) 140 | if args['cellsuffix'] is not None: 141 | cmd += " --cellsuffix {}".format(args['cellsuffix']) 142 | runcmd(cmd, drdr, out='rdr.tsv') 143 | lcel = os.path.join(drdr, 'total.tsv') 144 | rdr = os.path.join(drdr, 'rdr.tsv') 145 | 146 | log('Computing BAFs', level='PROGRESS') 147 | cmd = 'python2.7 {} -n {} -t {} -r {} -j {} -c {} -l {}' 148 | cmd = cmd.format(get_comp('BAFEstimator.py'), args['normal'], args['tumor'], args['reference'], args['jobs'], lcel, args['listphased']) 149 | if args['samtools'] is not None: 150 | cmd += " -s {}".format(args['samtools']) 151 | if args['bcftools'] is not None: 152 | cmd += " -b {}".format(args['bcftools']) 153 | cmd += " --cellprefix {}".format(args['cellprefix']) 154 | if args['cellsuffix'] is not None: 155 | cmd += " --cellsuffix {}".format(args['cellsuffix']) 156 | runcmd(cmd, dbaf, out='baf.tsv') 157 | baf = os.path.join(dbaf, 'baf.tsv') 158 | 159 | 160 | def setup(args): 161 | if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in ['baf', 'rdr', 'combo', 'calls', 'clones', 'plots']): 162 | log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN') 163 | 164 | dbaf = os.path.join(args['rundir'], 'baf') 165 | if not os.path.isdir(dbaf): 166 | os.mkdir(dbaf) 167 | 168 | drdr = os.path.join(args['rundir'], 'rdr') 169 | if not os.path.isdir(drdr): 170 | os.mkdir(drdr) 171 | 172 | return 
dbaf, drdr 173 | 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /src/chisel/bin/chisel_pseudonormal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os 4 | import argparse 5 | import shlex, shutil 6 | import multiprocessing as mp 7 | 8 | from multiprocessing import Lock, Value, Pool 9 | from collections import defaultdict 10 | from collections import Counter 11 | 12 | import numpy as np 13 | 14 | import chisel 15 | 16 | src = os.path.dirname(chisel.__file__) 17 | from ..Utils import * 18 | from ..RDREstimator import * 19 | 20 | 21 | def parse_args(): 22 | description = "CHISEL command to generate a pseudo-matched normal sample by extracting diploid cells from a barcoded single-cell BAM file." 23 | parser = argparse.ArgumentParser(description=description) 24 | parser.add_argument("INPUT", type=str, help="Barcoded single-cell BAM file") 25 | parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome") 26 | parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)") 27 | parser.add_argument("-e","--threshold", type=float, required=False, default=0.9, help="Minimum fraction of diploid genome to select diploid cells (default: 0.9)") 28 | parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"") 29 | parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")") 30 | parser.add_argument("-m","--minreads", type=int, required=False, default=100000, help="Minimum number total 
def parse_args():
    """Parse and validate the command-line arguments of chisel-pseudonormal.

    Returns:
        dict of validated values keyed by the names used in main().

    Raises:
        ValueError -- on missing files/directories, a pre-existing temporary
                      directory, out-of-range options, a malformed size, or a
                      missing samtools executable.
    """
    description = "CHISEL command to generate a pseudo-matched normal sample by extracting diploid cells from a barcoded single-cell BAM file."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("INPUT", type=str, help="Barcoded single-cell BAM file")
    parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome")
    parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
    parser.add_argument("-e","--threshold", type=float, required=False, default=0.9, help="Minimum fraction of diploid genome to select diploid cells (default: 0.9)")
    parser.add_argument("-b","--size", type=str, required=False, default="5Mb", help="Bin size, with or without \"kb\" or \"Mb\"")
    parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separated list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
    parser.add_argument("-m","--minreads", type=int, required=False, default=100000, help="Minimum total number of reads to select cells (default: 100000)")
    parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
    parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallel jobs to use (default: equal to number of available processors)")
    parser.add_argument("--tmpdir", required=False, default='_TMP', type=str, help="Temporary directory in running directory (default: _TMP)")
    parser.add_argument("-n","--normal", required=False, type=str, default="pseudonormal.bam", help="Name of the generated pseudo matched-normal BAM file (default: pseudonormal.bam)")
    parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
    parser.add_argument("--cellsuffix", type=str, required=False, default='', help="Suffix of cell barcode field in SAM format (default: none)")
    args = parser.parse_args()

    if not os.path.isdir(args.rundir):
        raise ValueError("Running directory does not exist: {}".format(args.rundir))
    tmpdir = os.path.join(args.rundir, args.tmpdir)
    if os.path.isdir(tmpdir):
        raise ValueError("Temporary directory already exists within specified running directory: {}".format(tmpdir))
    if not os.path.isfile(args.INPUT):
        raise ValueError("Barcoded single-cell BAM file does not exist: {}".format(args.INPUT))
    # BUGFIX: the original test `not args.normal[:-4] != ".bam"` sliced off
    # the extension and double-negated the comparison, so it never rejected
    # any name in practice; check the suffix directly instead.
    if not args.normal.endswith(".bam"):
        raise ValueError("The provided output name does not end in .bam: {}".format(args.normal))
    if not (0.0 <= args.threshold <= 1.0):
        raise ValueError("The provided threshold is not in [0, 1]: {}".format(args.threshold))
    if not os.path.isfile(args.reference):
        raise ValueError("Reference genome file does not exist: {}".format(args.reference))
    if args.minreads < 1:
        raise ValueError("The minimum number of reads must be positive!")

    size = 0
    try:
        if args.size[-2:] == "kb":
            size = int(args.size[:-2]) * 1000
        elif args.size[-2:] == "Mb":
            size = int(args.size[:-2]) * 1000000
        else:
            size = int(args.size)
    except (ValueError, TypeError):
        raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")

    if not args.jobs:
        args.jobs = mp.cpu_count()
    if args.jobs < 1:
        raise ValueError("The number of jobs must be positive!")

    samtools = args.samtools
    if not samtools:
        samtools = "samtools"
    if which(samtools) is None:
        raise ValueError("samtools has not been found or is not executable!")

    return {
        "rundir" : args.rundir,
        "tmpdir" : tmpdir,
        "tumor" : args.INPUT,
        "thres" : args.threshold,
        "normal" : args.normal,
        "reference" : args.reference,
        "binsize" : size,
        "chromosomes" : args.chromosomes,
        "minreads" : args.minreads,
        "samtools" : samtools,
        "cellprefix" : args.cellprefix,
        "cellsuffix" : args.cellsuffix,
        "jobs" : args.jobs
    }
level='INFO') 110 | 111 | log('Computing total numbers of sequenced reads') 112 | total = reduce(inupdate, (Counter(counts[c][b]) for c in counts for b in counts[c])) 113 | 114 | log('Selecting all cells to consider for the analysis') 115 | if args['minreads']: 116 | cells = set(e for e in total if total[e] >= args['minreads']) 117 | log('Number of selected cells: {}'.format(len(cells)), level='INFO') 118 | 119 | log('Selecting diploid cells') 120 | diploid = sorted(set(filter((lambda e : isdiploid(counts, e, args['thres'])), cells))) 121 | dlist = os.path.join(args['rundir'], 'diploid.tsv') 122 | with open(dlist, 'w') as o: 123 | o.write('\n'.join(diploid) + '\n') 124 | log('Number of identified diploid cells: {}'.format(len(diploid)), level='INFO') 125 | 126 | if len(diploid) > 0: 127 | cov = (float(sum(total[e] for e in diploid)) * 100.0) / float(sum(b[1] - b[0] for c in counts for b in counts[c])) 128 | log('Approximate sequencing coverage of pseudo matched-normal sample: {}'.format(cov), level='INFO') 129 | 130 | log('Extracting sequencing reads from selected diploid cells') 131 | extracting_diploid(args['tumor'], args['samtools'], chrs, args['tmpdir'], dlist, args['jobs']) 132 | 133 | log('Merging and extracted sequencing reads and indexing the output pseduo matched-normal sample') 134 | merging_diploid(chrs, args['tmpdir'], args['samtools'], os.path.join(args['rundir'], args['normal'])) 135 | 136 | log('Removing temporary files') 137 | shutil.rmtree(args['tmpdir']) 138 | 139 | log('KTHXBYE') 140 | 141 | 142 | def isdiploid(counts, cell, THRES): 143 | rdr = np.array([counts[c][b][cell] for c in counts for b in counts[c] if cell in counts[c][b] and counts[c][b][cell] > 0]) 144 | base = np.sum(rdr) / float(rdr.shape[0]) 145 | assert base > 0, "Found a cell with no sequencing reads" 146 | rdr = rdr / base 147 | avg = 2.0 / (np.sum(rdr) / float(rdr.shape[0])) 148 | dip = (lambda t : np.sum(np.rint(t * rdr) == 2)) 149 | scale = max((avg + (x/100.0)*d for x in 
xrange(0, 100+1, 1) for d in {-1, 1}), key=dip) 150 | return (dip(scale) / float(rdr.shape[0])) >= THRES 151 | 152 | 153 | def extracting_diploid(bam, samt, chrs, tmpdir, dlist, J): 154 | lock = Lock() 155 | counter = Value('i', 0) 156 | assert not os.path.isdir(tmpdir) 157 | os.mkdir(tmpdir) 158 | initargs = (lock, counter, len(chrs), bam, samt, tmpdir, dlist) 159 | pool = Pool(processes=min(J, len(chrs)), initializer=init_extracting_diploid, initargs=initargs) 160 | res = {o for o in pool.imap_unordered(extract, chrs)} 161 | #if o.strip() != '': 162 | #raise ValueError("SAMtools raised the following error during extraction ofsequencing reads: {}".format(o)) 163 | return 164 | 165 | 166 | def init_extracting_diploid(lock, counter, l, bam, samt, _tmpdir, dlist): 167 | global bar, cmd_sam, cmd_gre, cmd_com, tmpdir 168 | bar = ProgressBar(total=l, length=min(l, 40), lock=lock, counter=counter, verbose=False) 169 | cmd_sam = "{} view -h -F 1796 -q 13 {} {}".format(samt, bam, "{}") 170 | cmd_gre = "grep -F -f {} -e \"@HD\" -e \"@SQ\" -e \"@RG\" -e \"@PG\" -e \"@CO\"".format(dlist) 171 | cmd_com = "{} sort -O bam -o {} -T {}".format(samt, "{}", "{}") 172 | tmpdir = _tmpdir 173 | 174 | 175 | def extract(c): 176 | cmd = cmd_sam.format(c) 177 | out = os.path.join(tmpdir, '{}.bam'.format(c)) 178 | tmp = os.path.join(tmpdir, '_TMP_{}'.format(c)) 179 | os.mkdir(tmp) 180 | sam = sp.Popen(shlex.split(cmd_sam.format(c)), stdout=sp.PIPE, stderr=sp.PIPE) 181 | gre = sp.Popen(shlex.split(cmd_gre), stdin=sam.stdout, stdout=sp.PIPE, stderr=sp.PIPE) 182 | stdout, stderr = sp.Popen(shlex.split(cmd_com.format(out, tmp)), stdin=gre.stdout, stdout=sp.PIPE, stderr=sp.PIPE).communicate() 183 | return stderr.strip() 184 | 185 | 186 | def merging_diploid(chrs, tmpdir, samt, out): 187 | cfiles = map((lambda c : os.path.join(tmpdir, '{}.bam'.format(c))), sorted(chrs, key=orderchrs)) 188 | assert all(os.path.isfile(f) for f in cfiles), "Extracted reads are missing for some files!" 
189 | cmd = "{} merge -f {} {}".format(samt, out, ' '.join(cfiles)) 190 | stdout, stderr = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE).communicate() 191 | #if stderr.strip() != '': 192 | # raise ValueError("SAMtools merging terminated with the following error: {}".format(stderr)) 193 | cmd = "{} index {}".format(samt, out) 194 | stdout, stderr = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE).communicate() 195 | #if stderr.strip() != '': 196 | # raise ValueError("SAMtools indexing terminated with the following error: {}".format(stderr)) 197 | return 198 | 199 | 200 | if __name__ == '__main__': 201 | main() 202 | -------------------------------------------------------------------------------- /src/chisel/bin/count.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk 2 | 3 | BEGIN{} 4 | { 5 | if ( match($0, /CB:Z:[ACGT]+/) ) 6 | { 7 | REF = $4 - 1; 8 | QUE = 0; 9 | CIG = $6; 10 | CEL = substr($0, RSTART+5, RLENGTH-5); 11 | while( match(CIG, /^[[:digit:]]+/) ) 12 | { 13 | N = substr(CIG, RSTART, RLENGTH); 14 | CIG = substr(CIG, RSTART+RLENGTH); 15 | if( match(CIG, /^[MIDNSHP=X]/) ) 16 | { 17 | C = substr(CIG, RSTART, RLENGTH); 18 | CIG = substr(CIG, RSTART+RLENGTH); 19 | if (C == "M" || C == "=" || C == "X") 20 | { 21 | REF += N; 22 | QUE += N; 23 | if (TAG <= REF) 24 | { 25 | X[CEL, substr($10, QUE - REF + TAG, 1)]++; 26 | next; 27 | }; 28 | } else if (C == "D" || C == "N") 29 | { 30 | REF += N; 31 | if (TAG <= REF) 32 | { 33 | X[CEL, "N"]++; 34 | next; 35 | } 36 | } else if (C == "I" || C == "S") 37 | { 38 | QUE += N; 39 | }; 40 | }; 41 | }; 42 | }; 43 | } 44 | END{ for (p in X) { split(p, x, SUBSEP); print x[1], x[2], X[x[1], x[2]] } } 45 | -------------------------------------------------------------------------------- /src/chisel/count.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk 2 | 3 | BEGIN{} 4 | { 5 | if ( match($0, 
/CB:Z:[ACGT]+/) ) 6 | { 7 | REF = $4 - 1; 8 | QUE = 0; 9 | CIG = $6; 10 | CEL = substr($0, RSTART+5, RLENGTH-5); 11 | while( match(CIG, /^[[:digit:]]+/) ) 12 | { 13 | N = substr(CIG, RSTART, RLENGTH); 14 | CIG = substr(CIG, RSTART+RLENGTH); 15 | if( match(CIG, /^[MIDNSHP=X]/) ) 16 | { 17 | C = substr(CIG, RSTART, RLENGTH); 18 | CIG = substr(CIG, RSTART+RLENGTH); 19 | if (C == "M" || C == "=" || C == "X") 20 | { 21 | REF += N; 22 | QUE += N; 23 | if (TAG <= REF) 24 | { 25 | X[CEL, substr($10, QUE - REF + TAG, 1)]++; 26 | next; 27 | }; 28 | } else if (C == "D" || C == "N") 29 | { 30 | REF += N; 31 | if (TAG <= REF) 32 | { 33 | X[CEL, "N"]++; 34 | next; 35 | } 36 | } else if (C == "I" || C == "S") 37 | { 38 | QUE += N; 39 | }; 40 | }; 41 | }; 42 | }; 43 | } 44 | END{ for (p in X) { split(p, x, SUBSEP); print x[1], x[2], X[x[1], x[2]] } } 45 | 46 | -------------------------------------------------------------------------------- /tests/allchecks.sh: -------------------------------------------------------------------------------- 1 | # Checks 2 | : ex: set ft=markdown ;:<<'```shell' # 3 | 4 | This script runs all the tests to check that the current CHISEL implementation is correct and behaves as expected. 5 | 6 | ## Set up 7 | 8 | ```shell 9 | set -e 10 | set -o xtrace 11 | PS4='[\t]' 12 | cd $( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) 13 | rm -rf X/ complete/ callingE/ cloningE/ plottingE/ pseudonormal/ 14 | :<<'```shell' # Ignore this line 15 | ``` 16 | 17 | ## Check function 18 | 19 | ```shell 20 | check () { 21 | if cmp $1 $2 22 | then 23 | echo "CHECK $3: TEST $1 SUCCESS!" 24 | else 25 | echo "CHECK $3: TEST $1 FAILED!" 
26 | exit 1 27 | fi 28 | } 29 | :<<'```shell' # Ignore this line 30 | ``` 31 | 32 | ## Check complete 33 | 34 | ```shell 35 | mkdir X/ 36 | cp ../demos/complete/demo-complete.sh X/demo-complete.sh 37 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/complete.tar.gz | tar -xvz 38 | check complete.chk <(bash X/demo-complete.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e "gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "complete" 39 | for F in complete/*.png; do check ${F} X/plots/$(basename ${F}) "complete"; done 40 | check complete/calls.tsv X/calls/calls.tsv "complete" 41 | check complete/mapping.tsv X/clones/mapping.tsv "complete" 42 | rm -rf X/ complete/ 43 | :<<'```shell' # Ignore this line 44 | ``` 45 | 46 | ## Check callingE 47 | 48 | ```shell 49 | mkdir X/ 50 | cp ../demos/callingE/demo-callingE.sh X/demo-callingE.sh 51 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/callingE.tar.gz | tar -xvz 52 | check callingE.chk <(bash X/demo-callingE.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e "gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "callingE" 53 | for F in callingE/*.png; do check ${F} X/plots/$(basename ${F}) "callingE"; done 54 | check callingE/calls.tsv X/calls/calls.tsv "callingE" 55 | check callingE/mapping.tsv X/clones/mapping.tsv "callingE" 56 | rm -rf X/ callingE/ 57 | :<<'```shell' # Ignore this line 58 | ``` 59 | 60 | ## Check cloningE 61 | 62 | ```shell 63 | mkdir X/ 64 | cp ../demos/cloningE/demo-cloningE.sh X/demo-cloningE.sh 65 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/cloningE.tar.gz | tar -xvz 66 | check cloningE.chk <(bash X/demo-cloningE.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e 
"gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "cloningE" 67 | for F in cloningE/*.png; do check ${F} X/plots/$(basename ${F}) "cloningE"; done 68 | check cloningE/mapping.tsv X/clones/mapping.tsv "cloningE" 69 | rm -rf X/ cloningE/ 70 | :<<'```shell' # Ignore this line 71 | ``` 72 | 73 | ## Check plottingE 74 | 75 | ```shell 76 | mkdir X/ 77 | cp ../demos/plottingE/demo-plottingE.sh X/demo-plottingE.sh 78 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/plottingE.tar.gz | tar -xvz 79 | check plottingE.chk <(bash X/demo-plottingE.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e "gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "plottingE" 80 | for F in plottingE/*.png; do check ${F} X/plots/$(basename ${F}) "plottingE"; done 81 | rm -rf X/ plottingE/ 82 | :<<'```shell' # Ignore this line 83 | ``` 84 | 85 | ## Checking pseudonormal 86 | 87 | ```shell 88 | mkdir X/ 89 | cp ../demos/pseudonormal/demo-pseudonormal.sh X/demo-pseudonormal.sh 90 | curl -L https://github.com/raphael-group/chisel-data/raw/master/tests/pseudonormal.tar.gz | tar -xvz 91 | check pseudonormal.chk <(bash X/demo-pseudonormal.sh |& grep -v -e "Progress:" -e "UserWarning" -e "--:--:--" -e "chisel" -e "curl" -e "Speed" -e "gzip" -e "samtools" -e "rundir" -e "j " -e "J " -e "jobs" |& sed 's/\x1b\[[0-9;]*m//g' |& sed -u 's/\[[^]]*\]//g') "pseudonormal" 92 | check pseudonormal/diploid.tsv X/diploid.tsv "pseudonormal" 93 | rm -rf X/ pseudonormal/ 94 | :<<'```shell' # Ignore this line 95 | ``` 96 | 97 | ## Successful checks 98 | 99 | ```shell 100 | echo "ALL CHECKS PASSED SUCCESSFULLY!" 101 | exit $? 
102 | ``` 103 | -------------------------------------------------------------------------------- /tests/callingE.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | export INPUT=data/combo.tsv 5 | INPUT=data/combo.tsv 6 | : 7 | Parsing and checking arguments 8 | Arguments: 9 | upperk : 100 10 | sensitivity : 1.0 11 | seed : 25 12 | INPUT : data/combo.tsv 13 | maxploidy : 4 14 | 15 | Setting directories 16 | Calling 17 | Parsing and checking arguments 18 | Arguments: 19 | significativity : 0.02 20 | sensitivity : 1.0 21 | seed : 25 22 | input : data/combo.tsv 23 | LB : 0 24 | scoring : False 25 | e : 0.05 26 | shift : 0.05 27 | maxploidy : 4 28 | fastscaling : False 29 | restarts : 200 30 | lord : 1 31 | UB : 100 32 | Reading combined RDRs and BAFs of barcoded cells 33 | Formatting RDRs and BAFs 34 | Clustering globally 35 | Computing for 100: 36 | Objective value for 100: 0.328880907635 37 | Computing for 50: 38 | Objective value for 50: 0.380881364939 39 | Computing for 75: 40 | Objective value for 75: 0.354660110754 41 | Computing for 62: 42 | Objective value for 62: 0.369479243234 43 | Computing for 56: 44 | Objective value for 56: 0.375508283812 45 | Computing for 53: 46 | Objective value for 53: 0.382415700949 47 | Computing for 54: 48 | Objective value for 54: 0.377394006258 49 | Computing for 53: 50 | Objective value for 53: 0.382415700949 51 | Computing for 54: 52 | Objective value for 54: 0.377394006258 53 | Estimating RDR and BAF of every cluster 54 | Selecting ploidies 55 | Number of cells for every ploidy' level: 56 | Cells with base ploidy 2: 401 57 | Cells with base ploidy 4: 1674 58 | Inferring copy numbers 59 | Phasing copy-number states along the genome 60 | Writing results 61 | Cloning 62 | Parsing and checking arguments 63 | Arguments: 64 | minsize : 14 65 | refinement : 0.0 66 | seed : 25 67 | maxdiff : 0.06 68 | input : ./calls/calls.tsv 69 | linkage : single 70 | 
Reading input 71 | Clustering cells in clones 72 | Selecting clones 73 | Number of identified clones: 6 74 | Refining clustering 75 | Number of discarded cells: 784 over 2075 in total 76 | Profiling clones 77 | Writing clone map 78 | Writing clone-corrected copy numbers in provided input 79 | Plotting 80 | Parsing and checking arguments 81 | Arguments: 82 | format : png 83 | plotsize : (5.0, 1.5) 84 | sample : 20 85 | xmin : None 86 | clonemap : .././clones/mapping.tsv 87 | nonoisy : False 88 | gridsize : (12.0, 6.0) 89 | ymax : None 90 | clussize : (5.0, 3.0) 91 | xmax : None 92 | ymin : None 93 | input : .././calls/calls.tsv 94 | Reading input 95 | Number of cells: 2075 96 | Number of bins: 570 97 | Setting style 98 | Reading clonemap 99 | Plotting RDR and mirrored BAF plots for 20 random cells in rbplot_mirrored.png 100 | Plotting clustered RDR plots for 20 random cells in crdr.png 101 | Plotting clustered-mirrored BAF plots for 20 random cells in cbaf.png 102 | Plotting read-depth ratios in rdrs.png 103 | Plotting B-allele frequencies in bafs.png 104 | Plotting total copy numbers in totalcn.png 105 | Plotting total copy numbers corrected by clones in totalcn-corrected.png 106 | Plotting LOH in loh.png 107 | Plotting LOH corrected by clones in loh-corrected.png 108 | Plotting A-specific copy numbers in Aspecificcn.png 109 | Plotting A-specific copy numbers corrected by clones in Aspecificcn-corrected.png 110 | Plotting B-specific copy numbers in Bspecificcn.png 111 | Plotting B-specific copy numbers corrected by clones in Bspecificcn-corrected.png 112 | Plotting allele-specific copy numbers in allelecn.png 113 | Plotting allele-specific copy numbers corrected by clones in allelecn-corrected.png 114 | Plotting haplotype-specific copy numbers in haplotypecn.png 115 | Plotting haplotype-specific copy numbers corrected by clones in haplotypecn-corrected.png 116 | KTHKBYE! 
117 | exit 0 118 | -------------------------------------------------------------------------------- /tests/cloningE.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | export INPUT=data/calls.tsv 5 | INPUT=data/calls.tsv 6 | : 7 | Parsing and checking arguments 8 | Arguments: 9 | minsize : 14 10 | refinement : None 11 | seed : 25 12 | maxdiff : 0.06 13 | INPUT : data/calls.tsv 14 | 15 | Setting directories 16 | Cloning 17 | Parsing and checking arguments 18 | Arguments: 19 | minsize : 14 20 | refinement : 0.0 21 | seed : 25 22 | maxdiff : 0.06 23 | input : data/calls.tsv 24 | linkage : single 25 | Reading input 26 | Clustering cells in clones 27 | Selecting clones 28 | Number of identified clones: 6 29 | Refining clustering 30 | Number of discarded cells: 689 over 2075 in total 31 | Profiling clones 32 | Writing clone map 33 | Writing clone-corrected copy numbers in provided input 34 | Plotting 35 | Parsing and checking arguments 36 | Arguments: 37 | format : png 38 | plotsize : (5.0, 1.5) 39 | sample : 20 40 | xmin : None 41 | clonemap : .././clones/mapping.tsv 42 | nonoisy : False 43 | gridsize : (12.0, 6.0) 44 | ymax : None 45 | clussize : (5.0, 3.0) 46 | xmax : None 47 | ymin : None 48 | input : ../data/calls.tsv 49 | Reading input 50 | Number of cells: 2075 51 | Number of bins: 570 52 | Setting style 53 | Reading clonemap 54 | Plotting RDR and mirrored BAF plots for 20 random cells in rbplot_mirrored.png 55 | Plotting clustered RDR plots for 20 random cells in crdr.png 56 | Plotting clustered-mirrored BAF plots for 20 random cells in cbaf.png 57 | Plotting read-depth ratios in rdrs.png 58 | Plotting B-allele frequencies in bafs.png 59 | Plotting total copy numbers in totalcn.png 60 | Plotting total copy numbers corrected by clones in totalcn-corrected.png 61 | Plotting LOH in loh.png 62 | Plotting LOH corrected by clones in loh-corrected.png 63 | Plotting A-specific copy numbers in 
Aspecificcn.png 64 | Plotting A-specific copy numbers corrected by clones in Aspecificcn-corrected.png 65 | Plotting B-specific copy numbers in Bspecificcn.png 66 | Plotting B-specific copy numbers corrected by clones in Bspecificcn-corrected.png 67 | Plotting allele-specific copy numbers in allelecn.png 68 | Plotting allele-specific copy numbers corrected by clones in allelecn-corrected.png 69 | Plotting haplotype-specific copy numbers in haplotypecn.png 70 | Plotting haplotype-specific copy numbers corrected by clones in haplotypecn-corrected.png 71 | KTHKBYE! 72 | exit 0 73 | -------------------------------------------------------------------------------- /tests/complete.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | echo 'Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary.' 5 | Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary. 6 | export TUM=data/cells.bam 7 | TUM=data/cells.bam 8 | echo 'Downloading matched-normal BAM file from Zenodo, please be patient as downloading time may vary.' 9 | Downloading matched-normal BAM file from Zenodo, please be patient as downloading time may vary. 10 | export NOR=data/normal.bam 11 | NOR=data/normal.bam 12 | : 13 | echo 'Downloading human reference genome, please be patient as downloading time may vary.' 14 | Downloading human reference genome, please be patient as downloading time may vary. 
15 | export REF=data/hg19.fa 16 | REF=data/hg19.fa 17 | export DIC=data/hg19.dict 18 | DIC=data/hg19.dict 19 | : 20 | export PHA=data/phases.tsv 21 | PHA=data/phases.tsv 22 | : 23 | Parsing and checking arguments 24 | Arguments: 25 | chromosomes : chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 26 | reference : data/hg19.fa 27 | normal : data/normal.bam 28 | blocksize : 50000 29 | seed : 12 30 | listphased : data/phases.tsv 31 | maxploidy : 4 32 | minreads : 100000 33 | binsize : 5000000 34 | tumor : data/cells.bam 35 | upperk : 100 36 | bcftools : None 37 | 38 | Setting directories 39 | Computing RDRs 40 | Parsing and checking arguments 41 | Arguments: 42 | tumor : data/cells.bam 43 | minreads : 100000 44 | chrs : 45 | normal : data/normal.bam 46 | list : None 47 | ref : data/hg19.fa 48 | bins : 5000000 49 | outdir : ./rdr 50 | Computing bins 51 | Counting reads on normal 52 | Counting reads on barcoded cells 53 | Evaluating set of found cells 54 | Computing total numbers of sequenced reads 55 | Selecting cells 56 | Number of selected cells: 100 57 | Writing the totals in ./rdr/total.tsv 58 | Estimating RDR 59 | KTHXBYE 60 | Computing BAFs 61 | Parsing and checking arguments 62 | Arguments: 63 | tumor : data/cells.bam 64 | gamma : 0.01 65 | phased : data/phases.tsv 66 | normal : data/normal.bam 67 | list : ./rdr/total.tsv 68 | ref : data/hg19.fa 69 | bcftools : bcftools 70 | Extracting chromosomes 71 | Chromosomes analyzed: chr6 72 | Total number of given phased positions: 95659 73 | Counting phased SNPs in matched normal 74 | Number of selected heterozygous SNPs: 93728 75 | Extracting SNP counts for all cells 76 | Reading cell list 77 | Writing A/B counts for selected phased SNPs across selected cells 78 | KTHXBYE 79 | Combining RDRs and BAFs 80 | Parsing and checking arguments 81 | Arguments: 82 | blocksize : 50000 83 | seed : 12 84 | maxerror : None 85 | restarts : 100 86 | minerror : 
0.001 87 | rdr : ./rdr/rdr.tsv 88 | bootstrap : 100 89 | baf : ./baf/baf.tsv 90 | listofcells : ./rdr/total.tsv 91 | significance : 0.05 92 | Read list of cells 93 | Reading RDR 94 | Reading BAF 95 | Combining 96 | Printing combined RDR and BAF 97 | Calling 98 | Parsing and checking arguments 99 | Arguments: 100 | significativity : 0.02 101 | sensitivity : 1.0 102 | seed : 12 103 | input : ./combo/combo.tsv 104 | LB : 0 105 | scoring : False 106 | e : 0.05 107 | shift : 0.05 108 | maxploidy : 4 109 | fastscaling : False 110 | restarts : 200 111 | lord : 1 112 | UB : 100 113 | Reading combined RDRs and BAFs of barcoded cells 114 | Formatting RDRs and BAFs 115 | Clustering globally 116 | Computing for 35: 117 | Objective value for 35: 0.0 118 | Computing for 17: 119 | Objective value for 17: 0.00597601109068 120 | Computing for 8: 121 | Objective value for 8: 0.0131765568746 122 | Computing for 4: 123 | Objective value for 4: 0.0184374208269 124 | Computing for 2: 125 | Objective value for 2: 0.10191549988 126 | Computing for 3: 127 | Objective value for 3: 0.0256275628979 128 | Computing for 2: 129 | Objective value for 2: 0.10191549988 130 | Computing for 3: 131 | Objective value for 3: 0.0256275628979 132 | Estimating RDR and BAF of every cluster 133 | Selecting ploidies 134 | Number of cells for every ploidy' level: 135 | Cells with base ploidy 2: 100 136 | Inferring copy numbers 137 | Phasing copy-number states along the genome 138 | Writing results 139 | Cloning 140 | Parsing and checking arguments 141 | Arguments: 142 | minsize : 14 143 | refinement : 0.0 144 | seed : 12 145 | maxdiff : 0.06 146 | input : ./calls/calls.tsv 147 | linkage : single 148 | Reading input 149 | Clustering cells in clones 150 | Selecting clones 151 | Number of identified clones: 2 152 | Refining clustering 153 | Number of discarded cells: 0 over 100 in total 154 | Profiling clones 155 | Writing clone map 156 | Writing clone-corrected copy numbers in provided input 157 | Plotting 158 | 
Parsing and checking arguments 159 | Arguments: 160 | format : png 161 | plotsize : (5.0, 1.5) 162 | sample : 20 163 | xmin : None 164 | clonemap : .././clones/mapping.tsv 165 | nonoisy : False 166 | gridsize : (12.0, 6.0) 167 | ymax : None 168 | clussize : (5.0, 3.0) 169 | xmax : None 170 | ymin : None 171 | input : .././calls/calls.tsv 172 | Reading input 173 | Number of cells: 100 174 | Number of bins: 35 175 | Setting style 176 | Reading clonemap 177 | Plotting RDR and mirrored BAF plots for 20 random cells in rbplot_mirrored.png 178 | Plotting clustered RDR plots for 20 random cells in crdr.png 179 | Plotting clustered-mirrored BAF plots for 20 random cells in cbaf.png 180 | Plotting read-depth ratios in rdrs.png 181 | Plotting B-allele frequencies in bafs.png 182 | Plotting total copy numbers in totalcn.png 183 | Plotting total copy numbers corrected by clones in totalcn-corrected.png 184 | Plotting LOH in loh.png 185 | Plotting LOH corrected by clones in loh-corrected.png 186 | Plotting A-specific copy numbers in Aspecificcn.png 187 | Plotting A-specific copy numbers corrected by clones in Aspecificcn-corrected.png 188 | Plotting B-specific copy numbers in Bspecificcn.png 189 | Plotting B-specific copy numbers corrected by clones in Bspecificcn-corrected.png 190 | Plotting allele-specific copy numbers in allelecn.png 191 | Plotting allele-specific copy numbers corrected by clones in allelecn-corrected.png 192 | Plotting haplotype-specific copy numbers in haplotypecn.png 193 | Plotting haplotype-specific copy numbers corrected by clones in haplotypecn-corrected.png 194 | KTHKBYE! 
195 | exit 0 196 | -------------------------------------------------------------------------------- /tests/plottingE.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | export INPUT=data/calls.tsv 5 | INPUT=data/calls.tsv 6 | export MAPP=data/mapping.tsv 7 | MAPP=data/mapping.tsv 8 | : 9 | Parsing and checking arguments 10 | Arguments: 11 | format : png 12 | plotsize : 5,1.5 13 | sample : 20 14 | xmin : None 15 | clonemap : data/mapping.tsv 16 | nonoisy : False 17 | gridsize : 12,6 18 | ymax : None 19 | clussize : 5,3 20 | xmax : None 21 | ymin : None 22 | input : data/calls.tsv 23 | 24 | Setting directories 25 | Plotting 26 | Parsing and checking arguments 27 | Arguments: 28 | format : png 29 | plotsize : (5.0, 1.5) 30 | sample : 20 31 | xmin : None 32 | clonemap : ../data/mapping.tsv 33 | nonoisy : False 34 | gridsize : (12.0, 6.0) 35 | ymax : None 36 | clussize : (5.0, 3.0) 37 | xmax : None 38 | ymin : None 39 | input : ../data/calls.tsv 40 | Reading input 41 | Number of cells: 2075 42 | Number of bins: 570 43 | Setting style 44 | Reading clonemap 45 | Plotting RDR and mirrored BAF plots for 20 random cells in rbplot_mirrored.png 46 | Plotting clustered RDR plots for 20 random cells in crdr.png 47 | Plotting clustered-mirrored BAF plots for 20 random cells in cbaf.png 48 | Plotting read-depth ratios in rdrs.png 49 | Plotting B-allele frequencies in bafs.png 50 | Plotting total copy numbers in totalcn.png 51 | Plotting LOH in loh.png 52 | Plotting A-specific copy numbers in Aspecificcn.png 53 | Plotting B-specific copy numbers in Bspecificcn.png 54 | Plotting allele-specific copy numbers in allelecn.png 55 | Plotting haplotype-specific copy numbers in haplotypecn.png 56 | KTHKBYE! 
57 | exit 0 58 | -------------------------------------------------------------------------------- /tests/pseudonormal.chk: -------------------------------------------------------------------------------- 1 | + PS4='' 2 | : 3 | mkdir -p data 4 | echo 'Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary.' 5 | Downloading tumor barcoded BAM file from Zenodo, please be patient as downloading time may vary. 6 | export BAM=data/cells.bam 7 | BAM=data/cells.bam 8 | : 9 | echo 'Downloading human reference genome, please be patient as downloading time may vary.' 10 | Downloading human reference genome, please be patient as downloading time may vary. 11 | export REF=data/hg19.fa 12 | REF=data/hg19.fa 13 | export DIC=data/hg19.dict 14 | DIC=data/hg19.dict 15 | : 16 | Parsing and checking arguments 17 | Arguments: 18 | chromosomes : chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 19 | reference : data/hg19.fa 20 | normal : pseudonormal.bam 21 | tmpdir : ./_TMP 22 | minreads : 100000 23 | binsize : 5000000 24 | tumor : data/cells.bam 25 | thres : 0.9 26 | 27 | Computing bins 28 | Counting reads on barcoded cells 29 | Computing total numbers of sequenced reads 30 | Selecting all cells to consider for the analysis 31 | Number of selected cells: 30 32 | Selecting diploid cells 33 | Number of identified diploid cells: 10 34 | Approximate sequencing coverage of pseudo matched-normal sample: 2.09471330866 35 | Extracting sequencing reads from selected diploid cells 36 | Merging and extracted sequencing reads and indexing the output pseduo matched-normal sample 37 | Removing temporary files 38 | KTHXBYE 39 | exit 0 40 | -------------------------------------------------------------------------------- /tests/pytests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | 5 | 
@pytest.fixture(scope='module')
def input_folder():
    """Directory holding the downloaded test input data.

    Resolution order: the $TEST_DIRECTORY environment variable when set
    (as in CI), otherwise the bundled tests/pytests/data/input folder
    next to this file.
    """
    bundled = os.path.join(os.path.dirname(__file__), 'data', 'input')
    return os.getenv('TEST_DIRECTORY', bundled)
"""Regression tests for the individual CHISEL pipeline steps.

Each test runs one pipeline entry point (RDR/BAF estimation, combining,
calling, cloning) on the downloaded input data or the checked-in
intermediate outputs, then compares the MD5 digest of the produced
table against a known-good value.
"""
import hashlib
import itertools
import os
import shutil
import tempfile

from chisel.BAFEstimator import main as baf_main
from chisel.Caller import main as caller_main
from chisel.Cloner import main as cloner_main
from chisel.Combiner import main as combiner_main
from chisel.RDREstimator import main as rdr_main

this_dir = os.path.dirname(__file__)
DATA_FOLDER = os.path.join(this_dir, 'data')


def _md5(path):
    """Return the hex MD5 digest of *path*.

    Reads through a context manager so the file descriptor is closed
    promptly (the previous inline open() calls leaked it until GC).
    """
    with open(path, 'rb') as fh:
        return hashlib.md5(fh.read()).hexdigest()


def test_baf(input_folder):
    with tempfile.NamedTemporaryFile('w') as f:
        # The complete chr6 phase data takes quite a while to process.
        # Here we test with just the first 1k rows of the phase data.
        with tempfile.NamedTemporaryFile('w') as phases_f:
            with open(os.path.join(input_folder, 'phases.tsv'), 'r') as phases_full:
                # islice stops cleanly if the file has fewer than 1000
                # lines; a next()-based comprehension would raise.
                phases_f.writelines(itertools.islice(phases_full, 1000))
            phases_f.flush()
            baf_main(args=[
                '-n', os.path.join(input_folder, 'normal.bam'),
                '-t', os.path.join(input_folder, 'cells.bam'),
                '-r', os.path.join(input_folder, 'hg19.fa'),
                '-j', '1',
                '-c', os.path.join(DATA_FOLDER, 'output', 'total.tsv'),
                '-l', phases_f.name
            ], stdout_file=f.name)

        assert _md5(f.name) == 'bb41cbc96c1f76020bb965f41a961884'


def test_call():
    with tempfile.NamedTemporaryFile('w') as f:
        caller_main(args=[
            os.path.join(DATA_FOLDER, 'output', 'combo.tsv'),
            '-j', '1',
            '--seed', '12'
        ], stdout_file=f.name)

        assert _md5(f.name) == 'be0b7472a1c8d6f8f96b9aa0fb8df3c9'


def test_clone():
    with tempfile.NamedTemporaryFile('w') as f:
        # The Cloner.main function overwrites the input file!
        # Make a temporary copy for the purpose of testing the call.
        with tempfile.NamedTemporaryFile('w') as f_calls:
            input_file = os.path.join(DATA_FOLDER, 'output', 'calls.tsv')

            shutil.copy(input_file, f_calls.name)
            cloner_main(args=[
                f_calls.name,
                '--seed', '12'
            ], stdout_file=f.name)

        assert _md5(f.name) == '2b94dde27bc7f5e2c9215dc1f890f5f7'


def test_combine():
    with tempfile.NamedTemporaryFile('w') as f:
        combiner_main(args=[
            '-r', os.path.join(DATA_FOLDER, 'output', 'rdr.tsv'),
            '-b', os.path.join(DATA_FOLDER, 'output', 'baf.tsv'),
            '-j', '1',
            '-s', '12'
        ], stdout_file=f.name)

        assert _md5(f.name) == '569c320780744a704a629989cb6d4a88'


def test_rdr(input_folder):
    with tempfile.NamedTemporaryFile('w') as f:
        rdr_main(args=[
            '-n', os.path.join(input_folder, 'normal.bam'),
            '-t', os.path.join(input_folder, 'cells.bam'),
            '-r', os.path.join(input_folder, 'hg19.fa'),
            '-b', '5Mb',
            '-m', '100000',
            '-c', 'chr6',
            '-j', '1'
        ], stdout_file=f.name)

        assert _md5(f.name) == 'eaf470105df0dcd8be7a995f1d6a8525'
-------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27 3 | 4 | [gh-actions] 5 | python = 6 | 2.7: py27 7 | 8 | [testenv] 9 | passenv = CI TRAVIS TRAVIS_* TEST_DIRECTORY 10 | 11 | deps = 12 | pytest 13 | 14 | commands = 15 | pytest tests/pytests --------------------------------------------------------------------------------