├── .gitattributes ├── Data └── resource │ ├── BCF_tools_dbs │ ├── merged.vcf.gz │ └── merged.vcf.gz.tbi │ ├── pon │ └── .gitkeep │ └── reference │ └── .gitkeep ├── LICENSE.txt ├── README.md ├── RNA-Mutect-WMN.py ├── collect_features.py ├── config.py ├── create_features.py ├── feature_example.csv ├── old_repo ├── LICENSE.txt ├── ML_pipeline.py ├── README.md ├── README.txt └── input_folder │ ├── test.txt │ └── train_1.txt ├── preprocess.py └── train.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.psd filter=lfs diff=lfs merge=lfs -text 2 | *.gz filter=lfs diff=lfs merge=lfs -text 3 | *.vcf.gz filter=lfs diff=lfs merge=lfs -text 4 | *.tar.gz filter=lfs diff=lfs merge=lfs -text 5 | -------------------------------------------------------------------------------- /Data/resource/BCF_tools_dbs/merged.vcf.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:31c7f55cb413146e0dbb64aef22bea7e735995e12ef30ba31c2e6b662cca0e02 3 | size 142624031 4 | -------------------------------------------------------------------------------- /Data/resource/BCF_tools_dbs/merged.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhak-lab-ccg/RNA_MUTECt_WMN/18e177e6f09ab60f7b3564eaf91f638bdb0f16c6/Data/resource/BCF_tools_dbs/merged.vcf.gz.tbi -------------------------------------------------------------------------------- /Data/resource/pon/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhak-lab-ccg/RNA_MUTECt_WMN/18e177e6f09ab60f7b3564eaf91f638bdb0f16c6/Data/resource/pon/.gitkeep -------------------------------------------------------------------------------- /Data/resource/reference/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yizhak-lab-ccg/RNA_MUTECt_WMN/18e177e6f09ab60f7b3564eaf91f638bdb0f16c6/Data/resource/reference/.gitkeep -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Technion, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name Broad Institute, Inc. nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RNA-Mutect-WMN 2 | This pipeline implements the method described in 3 | [Estimating tumor mutational burden from RNA-sequencing without a matched-normal sample](https://www.nature.com/articles/s41467-022-30753-2), 4 | and **should be used after running [RNA_MuTect](https://github.com/broadinstitute/RNA_MUTECT_1.0-1).** 5 | This pipeline runs on a Linux machine only. 6 | 7 | ## Requirements 8 | 1. python3 packages: 9 | * pandas (1.1.5+) 10 | * NumPy (1.19.4+) 11 | * scikit-learn (0.23.2+) 12 | * matplotlib (3.3.3+) 13 | 2. [CAPY](https://github.com/getzlab/CApy/tree/master/capy) python package (0.1+) 14 | 3. [Funcotator](https://gatk.broadinstitute.org/hc/en-us/articles/360035889931-Funcotator-Information-and-Tutorial) as part of the [gatk](https://gatk.broadinstitute.org/hc/en-us/articles/360036194592-Getting-started-with-GATK4) package (4.2.6.1+) 15 | 4. Samtools: 16 | * [bgzip](http://www.htslib.org/doc/bgzip.html) (1.11+) 17 | * [bcftools](https://samtools.github.io/bcftools/bcftools.html) (1.8+) 18 | * [tabix](http://www.htslib.org/doc/tabix.html) (1.11+) 19 | 5. ~300 GB space: the 'resource' folder will be around 230 GB, and more space will be required (depending on the number of samples). 20 | 21 | ## Input files and directory tree 22 | Directory names can be changed in the [configuration](#configuration) file 23 | ``` 24 | Data/ 25 | 'cancer_dir'/ #project-specific 26 | input/ 27 | call_stats/ 28 | maf/ 29 | resource 30 | BCF_tools_dbs/ 31 | merged.vcf.gz #ESP db 32 | pon/ 33 | 'RNA_binary' 34 | 'DNA_binray' 35 | reference/ 36 | 'reference.fasta' 37 | 'reference.fasta.fai' 38 | 'reference.dict' 39 | ``` 40 | 41 | ## Configuration 42 | The `config.py` file should be configured by the user. 43 | 1. [**Directory configuration**](#input-files-and-directory-tree): 44 | 1. 
'cancer_type' is the name of the project-specific directory. 45 | 2. other directories and file names can be changed using this file if desired. 46 | 2. **Learning configuration**: in this section, you can play with the learning parameters and features. 47 | 3. **Environment configuration** is used to configure some tools' locations. 48 | 1. `tools` is the location of the [samtools and GATK](#requirements) binaries 49 | 50 | ## Running instructions 51 | 52 | ### Inputs preparation 53 | 1. As mentioned before, the input of this tool is the output of [RNA-MuTect](https://github.com/broadinstitute/RNA_MUTECT_1.0-1). 54 | A cloud implementation can be found in [Terra](https://app.terra.bio/#workspaces/broad-firecloud-gtex/RNA_MuTect). 55 | * In order to run RNA-MuTect any normal sample can be used and it does not require the matched-normal sample. 56 | 2. Details for location of PoN files are in the [manuscript](https://www.nature.com/articles/s41467-022-30753-2) under 'Data Availability'. 57 | 3. The human reference genome hg19 reference files should be used. 58 | 4. After downloading the repo, [directory configuration](#Input-files-and-directory-tree) should be done, using the `config.py` file: 59 | * Under the 'Data' folder: 60 | * create a 'cancer_dir' folder and configure its name in `config.py`. 61 | * Under the 'cancer_dir' folder: 62 | * Create an 'input' folder, and under it a 'maf' and 'call_stats' folders. 63 | * Download 'call_stats_capture_paper_v1_3' files (RNA-MuTect output) into 'call_stats' folder. 64 | * Download 'maf_file_rna_final_paper_v1_3' files (RNA-MuTect output) into 'maf folder'. 65 | * Under the 'resource' folder: 66 | * download the pon binary files (DNA & RNA) into the 'pon' folder 67 | * download the reference files (including .fasta.fai and .dict files) into the 'reference' folder 68 | * configure downloaded file names in `config.py`. 
69 | 70 | 71 | ### Run pipeline 72 | Run `RNA-Mutect-WMN.py` 73 | 74 | ## Results 75 | When the tool is finished successfully, a 'results' directory will be created under the specified 'cancer_dir'. inside 'results' directory: 76 | 1. Train results 77 | 1. mean recall and precision scores 78 | 2. mean recall and precision scores **per sample** + boxplot 79 | 2. 'somatics.maf': MAF file of all the variants classified as somatic by the tool. This should be further filtered using RNA-MuTect filtering steps as described in the [paper](https://www.nature.com/articles/s41467-022-30753-2) 80 | -------------------------------------------------------------------------------- /RNA-Mutect-WMN.py: -------------------------------------------------------------------------------- 1 | from collect_features import * 2 | from train import * 3 | from preprocess import * 4 | from create_features import * 5 | 6 | 7 | def create_somatics_maf(somatics): 8 | # create vcf from somatics 9 | tmp = somatics[ 10 | ['Chromosome', 'position', 'ref_allele', 'alt_allele']] 11 | tmp = tmp.drop_duplicates(subset=['Chromosome', 'position', 'alt_allele']) 12 | vcf = pd.DataFrame(columns=['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']) 13 | vcf['#CHROM'] = tmp['Chromosome'] 14 | vcf['POS'] = tmp['position'] 15 | vcf['ID'] = '.' 16 | vcf['REF'] = tmp['ref_allele'] 17 | vcf['ALT'] = tmp['alt_allele'] 18 | vcf['QUAL'] = '.' 19 | vcf['FILTER'] = 'PASS' 20 | vcf['INFO'] = '.' 
21 | # create the .vcf, add vcf headers, and save the vars in it 22 | vcf_to_maf = os.path.abspath(cancer_dir + '/somatics.vcf') 23 | with open(vcf_to_maf, 'w') as file: 24 | file.write('##fileformat=VCFv4.0\n') 25 | for i in range(1, 23): 26 | file.write(f"##contig=\n") 27 | file.write("##contig=\n") 28 | file.write("##contig=\n") 29 | vcf.to_csv(vcf_to_maf, mode='a', index=False, sep='\t') 30 | # run funcotator to get MAF file 31 | funco_datasource = os.path.abspath(f"{funcotator_dir}funcotator_dataSources.v1.7.20200521g") 32 | funco_output = os.path.abspath(f"{results_path}/somatics.maf") 33 | # run funcotator 34 | os.system(f"{tools}gatk Funcotator --variant {vcf_to_maf} --reference {reference_hg19} --ref-version hg19 " 35 | f"--data-sources-path {funco_datasource} --output {funco_output} --output-file-format MAF " 36 | f"--java-options '-Xmx6G' --force-b37-to-hg19-reference-contig-conversion --QUIET true") 37 | 38 | 39 | def test_models(test_set): 40 | os.system(f"rm -f {results_path}/test*") # clear old results 41 | X = test_set[features] 42 | y = test_set['is_real_keep'] 43 | sum_pred = np.zeros(len(y)) 44 | for model_file in os.listdir(models_path): 45 | model = pickle.load(open(os.path.abspath(models_path + model_file), 'rb')) 46 | pred = model.predict(X) 47 | sum_pred += pred 48 | 49 | test_set['pred'] = np.where(sum_pred > num_folds / 2, 1, 0) # choose label by majority vote 50 | 51 | # creating a MAF file (Funcotator output) from variants classified as somatic 52 | somatics = test_set[test_set['pred'] == 1] 53 | create_somatics_maf(somatics) 54 | 55 | 56 | def build_and_train(): 57 | preprocess() 58 | create_features() 59 | collect_features() 60 | train_set, test_set = prepare_train_test() 61 | print("Data is ready for training") 62 | train_models(train_set) 63 | print("finished training") 64 | return test_set 65 | 66 | 67 | def main(): 68 | test_set = build_and_train() 69 | test_models(test_set) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 
| -------------------------------------------------------------------------------- /collect_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | from os import path 5 | from pathlib import Path 6 | from capy import mut 7 | from config import * 8 | 9 | 10 | def extract_af(all_vars): 11 | print("extracting af from bcftools output...") 12 | for file_name in os.listdir(cancer_dir + "/BCF_TOOLS/final_outputs"): 13 | if "parsed" in file_name: 14 | continue 15 | cur_output_file_path = os.path.abspath(cancer_dir + "/BCF_TOOLS/final_outputs/" + file_name) 16 | db_type = os.path.splitext(file_name)[0].split("_")[0] # assumption: file name = _0002.txt 17 | col_names = ['#CHROM', 'POS', 'ALT', 'INFO'] 18 | with open(cur_output_file_path, "r") as output_file: 19 | lines = output_file.readlines() 20 | if len(lines) == 0: # output file is empty, move to the next file 21 | continue 22 | for index, line in enumerate(lines): 23 | if line.strip("\n").startswith("#CHROM"): 24 | break # now index = row number of the real header 25 | df = pd.read_table(cur_output_file_path, usecols=col_names, dtype={'#CHROM': 'str'}, skiprows=index) 26 | df = df.rename(columns={'#CHROM': 'Chromosome', 'POS': 'position', 'ALT': 'alt_allele'}) 27 | AF_symbol = 'AF' # if db_type in ['gnomad', 'genome1000']: 28 | if db_type == 'dbsnp': 29 | AF_symbol = 'CAF' 30 | if db_type == 'esp': 31 | AF_symbol = 'MAF' 32 | info_params = ( 33 | df["INFO"].str.split(";", expand=True).stack().str.split("=", expand=True).reset_index(level=1, drop=True)) 34 | add_params = pd.concat([df, info_params.groupby([info_params.index, info_params[0]])[1].sum().unstack()], 35 | axis=1).drop("INFO", axis=1) 36 | add_params = add_params[['Chromosome', 'position', 'alt_allele', AF_symbol]] 37 | if db_type == "dbsnp": 38 | add_params[AF_symbol] = add_params[AF_symbol].str.split(',').str[1] 39 | elif db_type == 'esp': 40 | 
add_params[AF_symbol] = add_params[AF_symbol].str.split(',').str[2] 41 | af_col = f'{db_type}_af' 42 | add_params = add_params.rename(columns={AF_symbol: af_col}) 43 | all_vars = pd.merge(all_vars, add_params, how='left') 44 | all_vars[db_type] = np.where(all_vars[af_col].isna(), 0, 1) # binary db column 45 | # replacing nan af values with means 46 | all_vars[af_col] = pd.to_numeric(all_vars[af_col]) 47 | af_mean = np.nanmean(all_vars[af_col], axis=0) 48 | all_vars[af_col] = np.where(all_vars[af_col].isna(), af_mean, all_vars[af_col]) 49 | print(f"finished extracting af from {db_type}") 50 | return all_vars 51 | 52 | 53 | def calc_pon(capy_input, sample_type, binary, ref, all_vars, output_path): 54 | capy_input[f"log_like_{sample_type}"] = mut.filter_mutations_against_token_PoN(M=capy_input, ponfile=binary, 55 | ref=ref) 56 | pons_input = capy_input[['chr', 'pos']].drop_duplicates() 57 | pons = pd.DataFrame(mut.get_pon(M=pons_input, ponfile=binary, ref=ref)) 58 | assert (pons.shape[0] == pons_input.shape[0]) 59 | assert (pons.shape[1] == 8) 60 | for index in range(8): 61 | pons_input[f'pon_{sample_type}_{index + 1}'] = pons[index] 62 | full_pon_output = pd.merge(capy_input, pons_input) 63 | assert (full_pon_output.shape[0] == capy_input.shape[0]) 64 | 65 | full_pon_output = full_pon_output.astype({"chr": str}) 66 | full_pon_output['chr'] = np.where(full_pon_output['chr'] == '23', 'X', full_pon_output['chr']) 67 | full_pon_output['chr'] = np.where(full_pon_output['chr'] == '24', 'Y', full_pon_output['chr']) 68 | full_pon_output = full_pon_output.rename( 69 | columns={"chr": "Chromosome", "pos": "position", "n_ref": "n_ref_count", "n_alt": "n_alt_count"}) 70 | 71 | chr_vars = all_vars[all_vars['Chromosome'] == '1'] 72 | chr_pons = full_pon_output[full_pon_output['Chromosome'] == '1'] 73 | chr_vars_with_pon = pd.merge(chr_vars, chr_pons, how='left') 74 | chr_vars_with_pon.to_csv(output_path, index=False) 75 | 76 | chrs = [str(x) for x in range(2, 23)] 77 | chrs += 
['X', 'Y'] 78 | for cur_chr in chrs: 79 | chr_vars = all_vars[all_vars['Chromosome'] == cur_chr] 80 | rows_count = chr_vars.shape[0] 81 | chr_pons = full_pon_output[full_pon_output['Chromosome'] == cur_chr] 82 | chr_vars_with_pon = pd.merge(chr_vars, chr_pons, how='left') 83 | new_rows_count = chr_vars_with_pon.shape[0] 84 | if new_rows_count != rows_count: 85 | print(f"wrong rows in chr {cur_chr}!") 86 | chr_vars_with_pon.to_csv(output_path, mode='a', index=False, header=False) 87 | 88 | 89 | def extract_pon(all_vars): 90 | print("calculating pon") 91 | Path(pon_dir).mkdir(parents=True, exist_ok=True) 92 | # need to rename columns to use CApy methods 93 | capy_input = all_vars[['Chromosome', 'position', 'n_ref_count', 'n_alt_count']] 94 | capy_input = capy_input.drop_duplicates() 95 | capy_input = capy_input.rename( 96 | columns={"n_ref_count": "n_ref", "n_alt_count": "n_alt", "Chromosome": "chr", "position": "pos"}) 97 | 98 | # converting chromosomes names to numbers 99 | capy_input['chr'] = np.where(capy_input['chr'] == 'X', '23', capy_input['chr']) 100 | capy_input['chr'] = np.where(capy_input['chr'] == 'Y', '24', capy_input['chr']) 101 | capy_input = capy_input.astype({"chr": int}) 102 | all_vars_RNA_output_path = os.path.abspath(cancer_dir + "/all_vars_after_RNA_pon.csv") 103 | all_vars_DNA_output_path = os.path.abspath(cancer_dir + "/all_vars_after_DNA_pon.csv") 104 | calc_pon(capy_input, 'RNA', RNA_pon_binary, reference_hg19, all_vars, all_vars_RNA_output_path) 105 | all_vars_RNA_output = pd.read_csv(all_vars_RNA_output_path, dtype={'Chromosome': 'str'}) 106 | calc_pon(capy_input.drop(columns=['log_like_RNA']), 'DNA', DNA_pon_binary, reference_hg19, all_vars_RNA_output, 107 | all_vars_DNA_output_path) 108 | 109 | 110 | def extract_funco_data(all_vars): 111 | funco_output_path = os.path.abspath(cancer_dir + "/funco_output.vcf") 112 | with open(funco_output_path, "r", encoding='latin-1') as output_file: 113 | lines = output_file.readlines() 114 | for 
index,line in enumerate(lines): 115 | if line.strip("\n").startswith("##INFO"): 116 | funco_features = line.split("|") 117 | elif line.strip("\n").startswith("#CHROM"): 118 | break 119 | funco_features[0] = funco_features[0].split(": ")[1] 120 | funco_features[-1] = funco_features[-1].replace('">', '') 121 | funco_output = pd.read_table(funco_output_path, encoding='latin-1', dtype={'#CHROM': 'str'}, skiprows=index) 122 | funco_output = funco_output.rename(columns={'#CHROM': 'Chromosome', 'POS': 'position', 'ALT': 'alt_allele'}) 123 | funco_output["Chromosome"] = funco_output["Chromosome"].str.replace("chr", "") 124 | funco_output["INFO"] = funco_output["INFO"].str.replace("FUNCOTATION=\[", "") 125 | funco_output["INFO"] = funco_output["INFO"].str.replace("\]", "") 126 | funco_output[funco_features] = funco_output['INFO'].str.split('|', expand=True) 127 | funco_output = funco_output[ 128 | ["Chromosome", "position", "alt_allele", 'Gencode_34_hugoSymbol', 'Gencode_34_variantClassification', 129 | 'Gencode_34_variantType', 'Gencode_34_referenceContext', 'Gencode_34_gcContent']] 130 | funco_output = funco_output.rename(columns={'Gencode_34_hugoSymbol': 'Hugo_Symbol', 131 | 'Gencode_34_variantClassification': 'Variant_Classification', 132 | 'Gencode_34_variantType': 'Variant_Type', 133 | "Gencode_34_referenceContext": "ref_context", 134 | "Gencode_34_gcContent": "gc_content"}) 135 | all_vars = pd.merge(all_vars, funco_output, 136 | how='left') 137 | classification_to_remove = ['IGR', 'Intron', 'RNA', 'lincRNA'] 138 | all_vars["classification_to_remove"] = np.where(all_vars['Variant_Classification'].isin(classification_to_remove), 139 | 1, 0) 140 | return all_vars 141 | 142 | 143 | def collect_features(): 144 | print("running feature collection...") 145 | 146 | input_all_vars_path = os.path.abspath(cancer_dir + '/all_vars_after_preprocess.csv') 147 | if not path.isfile(input_all_vars_path): 148 | print('Missing output of preprocess phase, please run again.') 149 | return 
150 | all_vars = pd.read_csv(input_all_vars_path, dtype={'Chromosome': 'str'}) 151 | 152 | all_vars_with_af = extract_af(all_vars) 153 | print("done af") 154 | all_vars_with_funco = extract_funco_data(all_vars_with_af) 155 | print("done funcotator") 156 | extract_pon(all_vars_with_funco) 157 | print("done pon") 158 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # dirs and files configuration # 2 | data_dir = "./Data" 3 | cancer_type = "" # must configure! 4 | cancer_dir = f"{data_dir}/{cancer_type}/" 5 | models_path = cancer_dir + "/models/" 6 | results_path = cancer_dir + "/results/" 7 | input_path = cancer_dir + "/input/" 8 | call_stats_files_dir = input_path + "call_stats/" 9 | maf_files_dir = input_path + "maf/" 10 | resource_dir = data_dir + "/resource/" 11 | pon_dir = resource_dir + "pon/" 12 | ref_dir = resource_dir + "reference/" 13 | bcftools_dir = resource_dir + "BCF_tools_dbs/" 14 | funcotator_dir = resource_dir + "funcotator/" 15 | # pon and ref file names must be configured! # 16 | RNA_pon_binary = pon_dir + "" 17 | DNA_pon_binary = pon_dir + "" 18 | reference_hg19 = ref_dir + "" 19 | 20 | assert(cancer_type != "") 21 | assert (RNA_pon_binary != pon_dir and DNA_pon_binary != pon_dir) 22 | assert (reference_hg19 != ref_dir) 23 | 24 | # learning configuration # 25 | num_train_samples = 100 26 | num_folds = 5 27 | # make sure num_train_samples divides by num folds! 
28 | assert (num_train_samples % num_folds == 0) 29 | features = ['t_ref_count', 't_alt_count', 't_lod_fstar', 'tumor_f', 'dbsnp_af', 'dbsnp', 'esp_af', 30 | 'esp', 'thousand_af', 'thousand', 'gnomad_af', 'gnomad', 31 | 'classification_to_remove', 'log_like_RNA', 'log_like_DNA', 'pon_RNA_1', 'pon_RNA_2', 32 | 'pon_RNA_3', 'pon_RNA_4', 'pon_RNA_5', 'pon_RNA_6', 'pon_RNA_7', 'pon_RNA_8', 33 | 'pon_DNA_1', 'pon_DNA_2', 'pon_DNA_3', 'pon_DNA_4', 'pon_DNA_5', 'pon_DNA_6', 34 | 'pon_DNA_7', 'pon_DNA_8'] 35 | 36 | # environment configuration # 37 | tools = "/Local/md_keren/anaconda3/bin/" 38 | -------------------------------------------------------------------------------- /create_features.py: -------------------------------------------------------------------------------- 1 | from config import * 2 | from pathlib import Path 3 | import os 4 | 5 | 6 | def prepare_bcftools_input(bcftools_cancer_dir): 7 | raw_input_path = os.path.abspath(bcftools_cancer_dir + '/unique_variants.vcf') 8 | if not os.path.isfile(raw_input_path): 9 | if not os.path.isfile(f"{raw_input_path}.gz"): 10 | raise Exception("bcftools input is missing!") # if there's also no vcf file, can't continue! 
11 | else: 12 | return 13 | else: # vcf exists, delete old versions and re-create them 14 | os.system(f"rm -f {raw_input_path}.gz") 15 | os.system(f"rm -f {raw_input_path}.gz.tbi") 16 | os.system(f"{tools}bgzip {raw_input_path}") # create the .gz file 17 | os.system(f"{tools}tabix -p vcf {raw_input_path}.gz") # create the index file 18 | 19 | 20 | def download_bcftools_dbs(): 21 | dbsnp_path = os.path.abspath(bcftools_dir + '/All_20180423.vcf.gz') 22 | dbsnp_link = "https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/All_20180423.vcf.gz" 23 | gnomad_path = os.path.abspath(bcftools_dir + '/gnomad.exomes.r2.1.1.sites.vcf.bgz') 24 | gnomad_link = "https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.vcf.bgz" 25 | thousand_genome_path = os.path.abspath(bcftools_dir + '/ALL.2of4intersection.20100804.genotypes.vcf.gz') 26 | thousand_genome_link = "https://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz" 27 | esp_path = os.path.abspath(bcftools_dir + '/merged.vcf.gz') # esp file is a required input 28 | dbs_files = {"dbsnp": [dbsnp_path, dbsnp_link], "gnomad": [gnomad_path, gnomad_link], 29 | "thousand_genome": [thousand_genome_path, thousand_genome_link], 30 | "esp": [esp_path]} 31 | for db, values in dbs_files.items(): 32 | if db == "esp": 33 | continue 34 | if not os.path.isfile(values[0]): 35 | os.system(f"wget -P {bcftools_dir} {values[1]}") 36 | if not os.path.isfile(values[0] + ".tbi"): 37 | os.system(f"wget -P {bcftools_dir} {values[1]}.tbi") 38 | print("all dbs are downloaded and ready") 39 | return dbs_files 40 | 41 | 42 | def run_bcftools_on_db(db_path, db_name, bcftools_cancer_dir): 43 | print(f"starting bcftools on {db_name}. 
This will take a while...") 44 | output_dir = os.path.abspath(bcftools_cancer_dir + "/bcftools_output") 45 | input_path = os.path.abspath(bcftools_cancer_dir + '/unique_variants.vcf.gz') 46 | final_outputs = os.path.abspath(bcftools_cancer_dir + "/final_outputs") 47 | # use bcftools to intersect variants with current db 48 | os.system(f"{tools}bcftools isec -p {output_dir} -w1 -Oz {db_path} {input_path}") 49 | # unzip intersection output 50 | os.system(f"{tools}bgzip -d -c {output_dir}/0002.vcf.gz > {output_dir}/0002.vcf") 51 | os.system(f"mv {output_dir}/0002.vcf {final_outputs}/{db_name}_0002.vcf") 52 | os.system(f"rm -f {output_dir}/*") 53 | 54 | 55 | def run_bcftools(): 56 | print("running bcftools...") 57 | # creating directories if needed 58 | bcftools_cancer_dir = os.path.abspath(cancer_dir + "/BCF_TOOLS/") 59 | Path(resource_dir + "/BCF_tools_dbs").mkdir(parents=True, exist_ok=True) 60 | Path(bcftools_cancer_dir + "/bcftools_output").mkdir(parents=True, exist_ok=True) 61 | Path(bcftools_cancer_dir + "/final_outputs").mkdir(parents=True, exist_ok=True) 62 | try: 63 | prepare_bcftools_input(bcftools_cancer_dir) 64 | print("bcftools input file is ready") 65 | except Exception as e: 66 | print(e) 67 | return 68 | dbs_files = download_bcftools_dbs() 69 | for db in dbs_files: 70 | run_bcftools_on_db(dbs_files[db][0], db, bcftools_cancer_dir) 71 | 72 | 73 | def run_funcotator(): 74 | print("running Funcotator...") 75 | # preparation for running 76 | Path(resource_dir + "/funcotator").mkdir(parents=True, exist_ok=True) 77 | if not os.path.isfile(f"{funcotator_dir}dataSources.v1.7.20200521g.tar.gz"): 78 | os.system(f"{tools}gatk FuncotatorDataSourceDownloader --germline --validate-integrity " 79 | f"--extract-after-download --output {funcotator_dir}dataSources.v1.7.20200521g.tar.gz") 80 | vcf_path = os.path.abspath(f"{cancer_dir}unique_variants.vcf") 81 | funco_datasource = os.path.abspath(f"{funcotator_dir}funcotator_dataSources.v1.7.20200521g") 82 | funco_output = 
os.path.abspath(f"{cancer_dir}funco_output.vcf") 83 | # run funcotator 84 | os.system(f"{tools}gatk Funcotator --variant {vcf_path} --reference {reference_hg19} --ref-version hg19 " 85 | f"--data-sources-path {funco_datasource} --output {funco_output} --output-file-format VCF " 86 | f"--java-options '-Xmx6G' --force-b37-to-hg19-reference-contig-conversion --QUIET true") 87 | 88 | 89 | def create_features(): 90 | run_bcftools() 91 | run_funcotator() 92 | -------------------------------------------------------------------------------- /feature_example.csv: -------------------------------------------------------------------------------- 1 | ,t_lod_fstar,tumor_f,t_ref_count,t_alt_count,dbsnp_af,dbsnp,esp_af,esp,thousand_af,thousand,gnomad_af,gnomad,classification_to_remove,log_like_RNA,pon_RNA_1,pon_RNA_2,pon_RNA_3,pon_RNA_4,pon_RNA_5,pon_RNA_6,pon_RNA_7,pon_RNA_8,log_like_DNA,pon_DNA_1,pon_DNA_2,pon_DNA_3,pon_DNA_4,pon_DNA_5,pon_DNA_6,pon_DNA_7,pon_DNA_8,is_somatic 2 | 0,131.857119,0.074543,658,53,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-2.207968705,1161,5277,72,1,0,1,0,0,-2.317618956,8206,120,8,0,0,0,0,0,1 3 | 1,411.001401,0.483051,122,114,0.430879502,0,17.7235512,0,0.461383628,0,4.71E-06,1,0,-2.677045727,1178,5284,46,4,0,0,0,0,-3.147952342,8105,228,1,0,0,0,0,0,1 4 | 2,463.726851,0.422951,176,129,0.430879502,0,0.0618,1,0.461383628,0,0.000155558,1,0,-1.590805612,1254,5198,56,4,0,0,0,0,-2.337229403,7532,781,21,0,0,0,0,0,1 5 | 3,669.096734,0.545181,151,181,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-4.191775037,1021,5442,45,4,0,0,0,0,-3.626277957,6619,1686,29,0,0,0,0,0,1 6 | 4,14.025322,0.307692,9,4,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-3.160141715,1209,5237,64,2,0,0,0,0,-3.03696518,46,8176,74,35,3,0,0,0,1 7 | 5,6.322359,0.041667,69,3,0.430879502,0,17.7235512,0,0.461383628,0,1.20E-05,1,0,-2.753416212,1419,5012,79,2,0,0,0,0,-2.182619488,49,8070,180,30,3,2,0,0,1 8 | 
6,26.847231,0.307692,18,8,0.430879502,0,17.7235512,0,0.461383628,0,5.03E-05,1,0,-3.36323363,1873,4562,74,1,0,2,0,0,-3.202709458,48,8135,132,16,3,0,0,0,1 9 | 7,41.44039,0.647059,6,11,0.430879502,0,17.7235512,0,0.461383628,0,1.88E-05,1,0,-2.753623644,2148,4278,79,6,0,1,0,0,-3.706788746,45,8142,58,63,22,1,0,3,1 10 | 8,37.573675,0.625,6,10,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-4.361768512,36,6308,156,11,0,1,0,0,-3.253222887,3922,4324,86,2,0,0,0,0,1 11 | 9,114.833309,0.532258,29,33,0.0001997,1,17.7235512,0,0.002,1,0.000105122,1,0,-3.504433345,4928,1539,40,5,0,0,0,0,-2.757144826,7053,1257,22,2,0,0,0,0,1 12 | 10,114.354578,0.731707,11,30,0.430879502,0,17.7235512,0,0.461383628,0,0,1,0,-3.064491229,6502,9,1,0,0,0,0,0,-2.899452236,8327,5,2,0,0,0,0,0,1 13 | 11,15.961257,0.034826,194,7,0.430879502,0,17.7235512,0,0.461383628,0,7.98E-06,1,0,-2.710035593,3411,3021,76,4,0,0,0,0,-2.785499723,65,7837,382,48,1,1,0,0,1 14 | 12,96.208021,0.119863,257,35,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-2.157807571,6465,1,44,2,0,0,0,0,-2.591003679,48,7995,247,42,1,1,0,0,1 15 | 13,429.849054,0.344086,244,128,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-2.877333375,3628,2843,40,1,0,0,0,0,-3.048706884,236,7876,214,6,1,1,0,0,1 16 | 14,179.306036,0.363636,91,52,0.000599,1,0.0077,1,0.461383628,0,0.000115134,1,0,-2.039495317,3768,2695,48,1,0,0,0,0,-2.38625168,639,7441,246,8,0,0,0,0,1 17 | 15,66.111645,0.322581,42,20,0.430879502,0,17.7235512,0,0.461383628,0,1.19E-05,1,0,-4.236498101,6454,55,3,0,0,0,0,0,-4.84557243,8318,12,3,1,0,0,0,0,1 18 | 16,41.332245,0.48,13,12,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-3.438229183,6344,107,58,3,0,0,0,0,-3.226453328,8132,6,129,55,0,0,12,0,1 19 | 17,27.410985,0.32,17,8,0.430879502,0,17.7235512,0,0.461383628,0,2.51E-05,1,0,-2.008984181,6371,139,2,0,0,0,0,0,-1.663238455,8293,36,5,0,0,0,0,0,1 20 | 
18,81.10276,0.19084,106,25,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-3.460971641,6301,205,5,1,0,0,0,0,-2.774749794,8269,62,3,0,0,0,0,0,1 21 | 19,23.530552,0.258065,23,8,0.0001997,1,17.7235512,0,0.461383628,0,1.22E-05,1,0,-2.875789093,6331,174,6,0,0,0,1,0,-3.134102282,8316,12,6,0,0,0,0,0,1 22 | 20,25.447626,0.186047,35,8,0.430879502,0,17.7235512,0,0.461383628,0,0,1,0,-2.387368192,1214,5210,82,6,0,0,0,0,-3.244840511,6274,2034,24,2,0,0,0,0,1 23 | 21,16.255942,0.217391,18,5,0.430879502,0,17.7235512,0,0.461383628,0,0.000217484,1,0,-2.780763421,24,6401,71,16,0,0,0,0,-2.63635645,50,8149,121,12,1,1,0,0,1 24 | 22,71.883987,0.37037,34,20,0.430879502,0,17.7235512,0,0.461383628,0,4.00E-06,1,0,-2.744245221,904,5544,57,7,0,0,0,0,-2.319997068,8317,12,4,1,0,0,0,0,1 25 | 23,187.187427,0.442623,68,54,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-2.510083132,1019,5445,44,4,0,0,0,0,-2.342398423,8325,9,0,0,0,0,0,0,1 26 | 24,238.456182,0.358696,118,66,0.430879502,0,17.7235512,0,0.461383628,0,7.98E-06,1,0,-2.911993804,1592,4436,414,57,0,10,3,0,-2.513497709,8318,11,5,0,0,0,0,0,1 27 | 25,436.290948,0.49187,125,121,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-3.20802177,2206,4264,41,1,0,0,0,0,-2.544789355,7690,637,7,0,0,0,0,0,1 28 | 26,230.852073,0.440559,80,63,0.430879502,0,17.7235512,0,0.461383628,0,3.98E-06,1,0,-2.799217169,1489,4973,48,2,0,0,0,0,-2.789292321,7632,679,22,1,0,0,0,0,1 29 | 27,17.566047,0.048276,138,7,0.430879502,0,17.7235512,0,0.461383628,0,1.99E-05,1,0,-2.690722804,1020,5416,71,4,0,1,0,0,-2.788792846,7589,738,6,1,0,0,0,0,1 30 | 28,41.294392,0.106061,118,14,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-3.546370859,597,5840,70,5,0,0,0,0,-3.943628963,7634,696,4,0,0,0,0,0,1 31 | 29,141.458242,0.371681,71,42,0.430879502,0,17.7235512,0,0.461383628,0,4.00E-06,1,0,-2.540154835,457,5959,91,5,0,0,0,0,-2.274214519,7635,688,10,1,0,0,0,0,1 32 | 
30,316.940571,0.373984,154,92,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-2.454224275,783,5629,97,2,0,1,0,0,-2.637419504,8309,19,5,1,0,0,0,0,1 33 | 31,54.897586,0.102703,166,19,0.430879502,0,17.7235512,0,0.461383628,0,4.90E-06,1,0,-2.907958587,840,5578,86,8,0,0,0,0,-3.177023796,7641,681,11,1,0,0,0,0,1 34 | 32,13.342614,0.03871,149,6,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-2.874770174,1424,4694,346,38,0,6,4,0,-3.217620543,48,8216,56,10,4,0,0,0,1 35 | 33,24.671218,0.02963,393,12,0.430879502,0,17.7235512,0,0.461383628,0,1.50E-05,1,0,-3.120525486,1193,5196,115,6,0,1,1,0,-3.490673076,48,8029,211,38,7,1,0,0,1 36 | 34,263.095742,0.365639,144,83,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-3.062806366,1252,5186,68,5,0,1,0,0,-3.472017039,46,7920,190,131,26,21,0,0,1 37 | 35,155.134907,0.4,66,44,0.430879502,0,17.7235512,0,0.461383628,0,8.79E-06,1,0,-2.709594142,2062,4414,35,1,0,0,0,0,-3.219274283,47,8187,74,26,0,0,0,0,1 38 | 36,412.335105,0.746479,36,106,0.430879502,0,0.0154,1,0.461383628,0,2.05E-05,1,0,-2.377929509,5895,114,335,91,0,0,77,0,-2.377260574,7940,9,257,85,0,0,43,0,1 39 | 37,70.011109,0.109312,220,27,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-3.105247873,6374,132,6,0,0,0,0,0,-3.112828605,8317,11,6,0,0,0,0,0,1 40 | 38,258.044512,0.40884,107,74,0.430879502,0,17.7235512,0,0.461383628,0,1.31E-05,1,0,-2.815360758,6364,146,2,0,0,0,0,0,-3.196142951,8320,11,3,0,0,0,0,0,1 41 | 39,17.274972,0.5,5,5,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-3.724635729,4507,1983,17,5,0,0,0,0,-5.000997625,8308,23,3,0,0,0,0,0,1 42 | 40,98.381891,0.459016,33,28,0.430879502,0,17.7235512,0,0.461383628,0,0,1,0,-2.517357048,559,5879,62,11,1,0,0,0,-2.791716056,8317,14,3,0,0,0,0,0,1 43 | 41,10.432896,0.375,5,3,0.0001997,1,17.7235512,0,0.461383628,0,2.23E-05,1,0,-3.749532891,608,5840,56,7,1,0,0,0,-2.830345433,8249,85,0,0,0,0,0,0,1 44 | 
42,15.749699,0.666667,2,4,0.430879502,0,17.7235512,0,0.461383628,0,4.56E-05,1,0,-3.145628236,544,5825,111,29,3,0,0,0,-2.354255254,7847,476,11,0,0,0,0,0,1 45 | 43,31.929975,0.409091,13,9,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,0,-2.86686887,546,5870,71,24,0,1,0,0,-2.586039696,7676,648,9,1,0,0,0,0,1 46 | 44,48.822562,0.441176,19,15,0.0007987,1,17.7235512,0,0.461383628,0,6.61E-05,1,0,-3.443316469,2083,4356,66,6,0,1,0,0,-2.936371111,7694,630,9,1,0,0,0,0,1 47 | 45,55.796675,0.410256,23,16,0.430879502,0,17.7235512,0,0.461383628,0,1.97E-05,1,0,-3.809009704,8,6372,102,23,4,3,0,0,-3.361106981,62,8039,215,18,0,0,0,0,1 48 | 46,32.475222,0.5,9,9,0.430879502,0,17.7235512,0,0.461383628,0,5.35E-06,1,0,-3.390604829,5673,832,7,0,0,0,0,0,-3.522098937,47,8205,65,16,0,1,0,0,1 49 | 47,400.06075,0.365979,246,142,0.430879502,0,17.7235512,0,0.461383628,0,4.03E-06,1,0,-3.042467814,866,5409,214,18,0,5,0,0,-2.925470743,7635,667,27,4,0,1,0,0,1 50 | 48,688.197433,0.475973,229,208,0.430879502,0,17.7235512,0,0.461383628,0,7.99E-06,1,0,-2.275896352,271,6162,79,0,0,0,0,0,-2.141424579,7640,669,22,2,1,0,0,0,1 51 | 49,34.306321,0.15942,58,11,0.430879502,0,17.7235512,0,0.461383628,0,0.273351271,0,1,-4.00202271,3581,2852,72,7,0,0,0,0,-5.378088118,7863,466,5,0,0,0,0,0,0 52 | 50,21.716662,0.315789,13,6,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-20,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 53 | 51,32.652926,0.153846,55,10,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-9.317488137,2533,3915,63,1,0,0,0,0,-10.53299069,7842,485,7,0,0,0,0,0,0 54 | 52,9.800797,0.25,9,3,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-4.693377413,2533,3915,63,1,0,0,0,0,-5.774942662,7842,485,7,0,0,0,0,0,0 55 | 53,9.208005,0.09375,29,3,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-6.219273,2533,3915,63,1,0,0,0,0,-7.337998532,7842,485,7,0,0,0,0,0,0 56 | 
54,38.355455,0.423077,15,11,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-20,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 57 | 55,34.195013,0.27027,27,10,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-19.99998312,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 58 | 56,8.847642,0.083333,33,3,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-11.71733502,2533,3915,63,1,0,0,0,0,-13.45420325,7842,485,7,0,0,0,0,0,0 59 | 57,26.790405,0.173913,38,8,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-14.49525904,2533,3915,63,1,0,0,0,0,-16.64692182,7842,485,7,0,0,0,0,0,0 60 | 58,22.702142,0.170732,34,7,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-19.99256655,2533,3915,63,1,0,0,0,0,-19.99999928,7842,485,7,0,0,0,0,0,0 61 | 59,8.851796,0.12,22,3,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-16.33770621,2533,3915,63,1,0,0,0,0,-19.30107803,7842,485,7,0,0,0,0,0,0 62 | 60,14.321512,0.104167,43,5,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-8.938317481,2533,3915,63,1,0,0,0,0,-10.15343098,7842,485,7,0,0,0,0,0,0 63 | 61,11.861314,0.153846,22,4,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-15.550764,2533,3915,63,1,0,0,0,0,-17.29710464,7842,485,7,0,0,0,0,0,0 64 | 62,52.183334,0.722222,5,13,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-12.95712078,2533,3915,63,1,0,0,0,0,-14.37713199,7842,485,7,0,0,0,0,0,0 65 | 63,16.123972,0.142857,36,6,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-17.81671303,2533,3915,63,1,0,0,0,0,-19.94725334,7842,485,7,0,0,0,0,0,0 66 | 64,30.893567,0.310345,20,9,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-19.99999248,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 67 | 65,20.186738,0.333333,12,6,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-4.296147853,2533,3915,63,1,0,0,0,0,-5.377654041,7842,485,7,0,0,0,0,0,0 68 | 66,38.259719,0.261905,31,11,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-20,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 69 | 
67,14.870966,0.3125,11,5,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-17.65995304,2533,3915,63,1,0,0,0,0,-19.81340383,7842,485,7,0,0,0,0,0,0 70 | 68,14.140672,0.266667,11,4,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-19.99510219,2533,3915,63,1,0,0,0,0,-19.99999953,7842,485,7,0,0,0,0,0,0 71 | 69,14.677164,0.363636,7,4,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-20,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 72 | 70,55.226246,0.347826,30,16,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-18.9106246,2533,3915,63,1,0,0,0,0,-19.98816707,7842,485,7,0,0,0,0,0,0 73 | 71,36.677978,0.174603,52,11,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-9.113888577,2533,3915,63,1,0,0,0,0,-10.32919648,7842,485,7,0,0,0,0,0,0 74 | 72,6.427488,0.111111,16,2,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-7.479622704,2533,3915,63,1,0,0,0,0,-8.598923828,7842,485,7,0,0,0,0,0,0 75 | 73,127.077076,0.589286,23,33,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-15.44588815,2533,3915,63,1,0,0,0,0,-17.59944903,7842,485,7,0,0,0,0,0,0 76 | 74,12.761067,0.153846,22,4,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-4.392821118,2533,3915,63,1,0,0,0,0,-5.474346961,7842,485,7,0,0,0,0,0,0 77 | 75,16.052457,0.142857,30,5,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-10.80768404,2533,3915,63,1,0,0,0,0,-12.22457716,7842,485,7,0,0,0,0,0,0 78 | 76,26.325164,0.5,7,7,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-6.635971588,2533,3915,63,1,0,0,0,0,-7.754984051,7842,485,7,0,0,0,0,0,0 79 | 77,19.163159,0.230769,20,6,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-4.693377413,2533,3915,63,1,0,0,0,0,-5.774942662,7842,485,7,0,0,0,0,0,0 80 | 78,19.191502,0.153846,33,6,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-10.66229188,2533,3915,63,1,0,0,0,0,-12.0787966,7842,485,7,0,0,0,0,0,0 81 | 
79,22.021069,0.152174,39,7,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-6.957467686,2533,3915,63,1,0,0,0,0,-8.076624262,7842,485,7,0,0,0,0,0,0 82 | 80,17.923853,0.384615,8,5,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-10.25793336,2533,3915,63,1,0,0,0,0,-11.47402131,7842,485,7,0,0,0,0,0,0 83 | 81,9.65996,0.15,17,3,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-20,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 84 | 82,6.496371,0.133333,13,2,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-4.392821118,2533,3915,63,1,0,0,0,0,-5.474346961,7842,485,7,0,0,0,0,0,0 85 | 83,16.09998,0.083333,66,6,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-4.392821118,2533,3915,63,1,0,0,0,0,-5.474346961,7842,485,7,0,0,0,0,0,0 86 | 84,16.764928,0.227273,17,5,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-14.83404476,2533,3915,63,1,0,0,0,0,-16.98684978,7842,485,7,0,0,0,0,0,0 87 | 85,41.95957,0.26087,34,12,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-11.3375611,2533,3915,63,1,0,0,0,0,-12.75562141,7842,485,7,0,0,0,0,0,0 88 | 86,32.309865,0.196078,41,10,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-12.69801087,2533,3915,63,1,0,0,0,0,-14.43945385,7842,485,7,0,0,0,0,0,0 89 | 87,11.862698,0.2,16,4,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-11.71733502,2533,3915,63,1,0,0,0,0,-13.45420325,7842,485,7,0,0,0,0,0,0 90 | 88,31.524295,0.080292,126,11,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-13.8049926,2533,3915,63,1,0,0,0,0,-15.95338669,7842,485,7,0,0,0,0,0,0 91 | 89,11.762143,0.210526,15,4,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-19.97888523,2533,3915,63,1,0,0,0,0,-19.99998201,7842,485,7,0,0,0,0,0,0 92 | 90,11.416838,0.074074,50,4,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-6.957467686,2533,3915,63,1,0,0,0,0,-8.076624262,7842,485,7,0,0,0,0,0,0 93 | 91,55.431491,0.290909,39,16,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-19.99999997,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 94 
| 92,105.562643,0.246269,101,33,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-13.93127806,2533,3915,63,1,0,0,0,0,-15.67614454,7842,485,7,0,0,0,0,0,0 95 | 93,8.841808,0.111111,24,3,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-9.236624318,2533,3915,63,1,0,0,0,0,-10.64695881,7842,485,7,0,0,0,0,0,0 96 | 94,33.005816,0.2,40,10,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-8.938317481,2533,3915,63,1,0,0,0,0,-10.15343098,7842,485,7,0,0,0,0,0,0 97 | 95,41.586876,0.333333,24,12,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-19.99999991,2533,3915,63,1,0,0,0,0,-20,7842,485,7,0,0,0,0,0,0 98 | 96,8.176024,0.3,7,3,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-4.693377413,2533,3915,63,1,0,0,0,0,-5.774942662,7842,485,7,0,0,0,0,0,0 99 | 97,10.586357,0.230769,10,3,0.09585,1,17.7235512,0,0.461383628,0,0.273351271,0,1,-12.48073815,2533,3915,63,1,0,0,0,0,-13.90035899,7842,485,7,0,0,0,0,0,0 100 | -------------------------------------------------------------------------------- /old_repo/LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Technion, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name Broad Institute, Inc. nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /old_repo/ML_pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn import metrics 6 | from sklearn.metrics import precision_score 7 | from sklearn.metrics import recall_score 8 | 9 | folderName = 'input_folder/' 10 | 11 | ### Generating and training 5 random forest models ### 12 | for i in range(1,6): 13 | modelName = folderName + 'model_' + str(i) +'.json' 14 | 15 | #loading the data for the current training group 16 | df = pd.read_table(folderName + 'train_' + str(i) + '.txt') 17 | 18 | #setting the data and the labels for the training group 19 | y_train = df.is_true_somatic 20 | X_train = df 21 | X_train = X_train.iloc[:, :-1] 22 | 23 | #generating and training the random forest model on the training group 24 | clf = RandomForestClassifier(n_estimators=50, random_state=42) 25 | clf.fit(X_train,y_train) 26 | pickle.dump(clf,open(modelName,'wb')) 27 | 28 | # loading the data for the current validation group 29 | 
df_val = pd.read_table(folderName + 'val_' + str(i) + '.txt') 30 | 31 | # setting the data and the labels for the validation group 32 | y_val = df_val.is_true_somatic 33 | X_val = df_val 34 | X_val = X_val.iloc[:, :-1] 35 | 36 | #predicting classification for validation group 37 | y_pred=clf.predict(X_val) 38 | 39 | # estimating models' performance on validation set 40 | print(metrics.confusion_matrix(y_val, y_pred)) 41 | print('first precision ', precision_score(y_val, y_pred)) 42 | print('first recall ' , recall_score(y_val, y_pred)) 43 | 44 | 45 | ### testing models' performance on test set ### 46 | test = pd.read_table(folderName + 'test.txt') 47 | test_data = test.iloc[:, :-1] 48 | real = test.is_true_somatic 49 | 50 | #for majority voting 51 | sum_pred = np.zeros(len(real)) 52 | 53 | # using all 5 trained models for prediction # 54 | for i in range(1,6): 55 | 56 | #loading the trained model 57 | modelName1 = folderName + 'model_' + str(i) + '.json' 58 | model = pickle.load(open(modelName1,'rb')) 59 | 60 | #making the predictions on the test group 61 | pred1 = model.predict(test_data) 62 | sum_pred += pred1 63 | 64 | #actual prediction is based on majority voting 65 | i=np.where(sum_pred>2) 66 | i=pd.DataFrame(i) 67 | i=i.T 68 | final_pred = np.zeros(len(sum_pred)) 69 | np.put(final_pred,i,1) 70 | 71 | # All predicted somatic mutations are tested via RNA-MuTect filtering steps. 72 | # Mutations that are filtered out will not be considered as somatic mutations. 
73 | 74 | # estimating the performance on the test set after RNA-MuTect filtering steps 75 | print('final precision ', precision_score(real, final_pred)) 76 | print('final recall ' , recall_score(real, final_pred)) -------------------------------------------------------------------------------- /old_repo/README.md: -------------------------------------------------------------------------------- 1 | # RNA_MUTECt_WMN -------------------------------------------------------------------------------- /old_repo/README.txt: -------------------------------------------------------------------------------- 1 | --------------- 2 | RNA-MuTect-WMN 3 | --------------- 4 | 5 | ***************************************** 6 | The pipeline uses Python v3.9.1 7 | ***************************************** 8 | 9 | No installation is required. 10 | 11 | INPUT: 12 | The pipeline includes the following input files that should be placed in a folder named 'input_folder' (see example input files in 'input_folder'): 13 | 1. 5 text files for train sets 1-5, containing variants in the rows and all features in the column. Last column is the label (somatic=1; no somatic=0). 14 | 2. A similar text file containing all variants in the test set. 15 | 16 | OUTPUT: 17 | 1. 5 trained models 18 | 2. 
Precision and recall values for train and test groups 19 | * Running time is dependent on the number of variants in the dataset 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /old_repo/input_folder/test.txt: -------------------------------------------------------------------------------- 1 | t_ref_count t_alt_count t_lod_fstar tumor_f dbsnp_AF dbsnp esp_AF esp genome1000_AF genome1000 gnomad_AF gnomad variant_classification log_like_RNA log_like_DNA pon_RNA_1 pon_RNA_2 pon_RNA_3 pon_RNA_4 pon_RNA_5 pon_RNA_6 pon_RNA_7 pon_RNA_8 pon_DNA_1 pon_DNA_2 pon_DNA_3 pon_DNA_4 pon_DNA_5 pon_DNA_6 pon_DNA_7 pon_DNA_8 is_somatic 2 | 0 2 9.1542 1 0.454129178 0 19.50448813 0 0.252 1 0.303829019 0 0 -0.415080076 -1.363346303 331 3157 361 114 21 24 239 2265 7266 321 271 111 0 4 117 244 0 3 | 0 3 13.5312 1 0.454129178 0 19.50448813 0 0.996 1 0.303829019 0 1 -0.205160973 -1.907308455 709 0 832 878 0 0 3394 699 7379 0 601 250 0 0 104 0 0 4 | 0 3 12.231 1 0.454129178 0 19.50448813 0 0.480189101 0 0.303829019 0 0 -13.35734826 -13.4943421 3117 3346 46 3 0 0 0 0 2853 5372 107 2 0 0 0 0 0 5 | 0 27 109.976 1 0.9866 1 19.50448813 0 0.99 1 0.303829019 0 0 -0.149636802 -0.886225039 467 0 658 773 0 0 3623 991 5946 1 872 432 0 0 550 533 0 6 | 0 2 8.5541 1 0.454129178 0 19.50448813 0 0.480189101 0 0.303829019 0 1 -0.858241389 -15.22187625 2958 0 1708 942 0 0 899 5 8321 8 5 0 0 0 0 0 0 7 | -------------------------------------------------------------------------------- /old_repo/input_folder/train_1.txt: -------------------------------------------------------------------------------- 1 | t_ref_count t_alt_count t_lod_fstar tumor_f dbsnp_AF dbsnp esp_AF esp genome1000_AF genome1000 gnomad_AF gnomad variant_classification log_like_RNA log_like_DNA pon_RNA_1 pon_RNA_2 pon_RNA_3 pon_RNA_4 pon_RNA_5 pon_RNA_6 pon_RNA_7 pon_RNA_8 pon_DNA_1 pon_DNA_2 pon_DNA_3 pon_DNA_4 pon_DNA_5 pon_DNA_6 pon_DNA_7 pon_DNA_8 is_somatic 2 | 0 16 65.6315 1 0.6214 1 
19.50448813 0 0.59 1 0.303829019 0 0 -0.103361404 -12.97097575 512 274 287 292 0 6 2518 2623 8154 0 171 9 0 0 0 0 0 3 | 0 36 159.5744 1 0.454129178 0 19.50448813 0 0.984 1 0.303829019 0 0 -0.015754971 -1.261888876 83 7 70 72 0 0 417 5863 6438 0 919 521 0 0 451 5 0 4 | 0 9 38.3934 1 0.454129178 0 19.50448813 0 0.861 1 0.303829019 0 1 -0.228140873 -0.081815623 1031 38 814 778 0 0 2787 1064 539 152 448 288 0 4 543 6360 0 5 | 0 5 22.4854 1 0.477 1 19.50448813 0 0.468 1 0.303829019 0 0 -0.233954897 -1.349283661 901 887 455 451 0 17 2275 1526 6840 42 789 290 0 0 348 25 0 6 | 0 49 207.265 1 0.9972 1 19.50448813 0 0.996 1 0.303829019 0 1 -0.001335881 -1.002298966 9 0 5 6 0 0 104 6388 6311 0 772 422 0 0 791 38 0 7 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from config import * 5 | from pathlib import Path 6 | 7 | 8 | def create_df(path, file_type): 9 | assert (file_type in ["call_stats", "maf"]) 10 | if file_type == "call_stats": 11 | col_names = ['contig', 'position', 'ref_allele', 'alt_allele', 'tumor_name', 12 | 'total_reads', 'tumor_power', 'contaminant_lod', 't_ref_count', 't_alt_count', 13 | 't_ref_sum', 't_alt_sum', 'n_ref_count', 'n_alt_count', 'n_ref_sum', 'n_alt_sum', 14 | 't_lod_fstar', 'tumor_f', 'judgement', 'failure_reasons', 'context'] 15 | # read relevant columns from call_stats file 16 | df = pd.read_table(path, usecols=col_names, header=1, dtype={'contig': 'str'}) 17 | df = df.rename(columns={"tumor_name": "Tumor_Sample_Barcode", "contig": "Chromosome"}) 18 | 19 | if file_type == "maf": 20 | col_names = ["Chromosome", "Start_position", "Tumor_Sample_Barcode"] 21 | df = pd.read_table(path, usecols=col_names, dtype={'Chromosome': 'str'}) 22 | df = df.rename(columns={"Start_position": "position"}) 23 | 24 | return df 25 | 26 | 27 | def filter_call_stats_df(df): 
28 | df.failure_reasons = df.failure_reasons.fillna('') 29 | # remove results of GL chromosome 30 | df = df[~df.Chromosome.str.contains("GL")] 31 | # remove results of MT chromosome 32 | df = df[df.Chromosome != 'MT'] 33 | # keep only non-fail or valid fail reasons (=germline) 34 | non_germ_failure = ["clustered_read_position", "fstar_tumor_lod", "nearby_gap_events", "seen_in_panel_of_normals", 35 | "poor_mapping_region_alternate_allele_mapq", "poor_mapping_region_mapq0", 36 | "possible_contamination", 37 | "strand_artifact", "triallelic_site"] 38 | df = df[(df.failure_reasons == "") | ~(df.failure_reasons.str.contains('|'.join(non_germ_failure)))] 39 | return df 40 | 41 | 42 | def mark_noise(all_vars): 43 | # only vars that also exist in the MAF files are not noise. 44 | # after this function, the value in the "is_real_keep" columns indicates if the var is real or noise: 45 | # 1 is real and 0 is noise. 46 | dfs = [] 47 | for maf_file in os.listdir(maf_files_dir): 48 | cur_maf_path = os.path.abspath(maf_files_dir + maf_file) 49 | df = create_df(cur_maf_path, file_type="maf") 50 | dfs.append(df) 51 | all_mafs = pd.concat(dfs, ignore_index=True) 52 | all_mafs = all_mafs.drop_duplicates() 53 | merged = pd.merge(all_vars, all_mafs, how='left', indicator='Exist') 54 | all_vars['is_real_keep'] = np.where(merged.Exist == 'both', 1, 0) 55 | return all_vars 56 | 57 | 58 | def process_col_stats(): 59 | dfs = [] 60 | for file in os.listdir(call_stats_files_dir): 61 | cur_file_path = os.path.abspath(call_stats_files_dir + file) 62 | df = create_df(cur_file_path, file_type="call_stats") 63 | df = filter_call_stats_df(df) 64 | dfs.append(df) 65 | all_vars = pd.concat(dfs, ignore_index=True) 66 | all_vars = all_vars.drop_duplicates() 67 | # converting chromosomes names to ints for sorting 68 | all_vars['Chromosome'] = np.where(all_vars['Chromosome'] == 'X', '23', all_vars['Chromosome']) 69 | all_vars['Chromosome'] = np.where(all_vars['Chromosome'] == 'Y', '24', 
all_vars['Chromosome']) 70 | all_vars = all_vars.astype({"Chromosome": int}) 71 | # sorting 72 | all_vars = all_vars.sort_values(by=['Chromosome', 'position'], ignore_index=True) 73 | 74 | # converting back to strings 75 | all_vars = all_vars.astype({"Chromosome": str}) 76 | all_vars['Chromosome'] = np.where(all_vars['Chromosome'] == '23', 'X', all_vars['Chromosome']) 77 | all_vars['Chromosome'] = np.where(all_vars['Chromosome'] == '24', 'Y', all_vars['Chromosome']) 78 | all_vars = mark_noise(all_vars) 79 | all_vars.to_csv(os.path.abspath(cancer_dir + "/all_vars_after_preprocess.csv"), index=False) 80 | return all_vars 81 | 82 | 83 | def create_vcf(all_vars): 84 | tmp = all_vars[ 85 | ['Chromosome', 'position', 'ref_allele', 'alt_allele']] 86 | tmp = tmp.drop_duplicates(subset=['Chromosome', 'position', 'alt_allele']) 87 | vcf = pd.DataFrame(columns=['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']) 88 | vcf['#CHROM'] = tmp['Chromosome'] 89 | vcf['POS'] = tmp['position'] 90 | vcf['ID'] = '.' 91 | vcf['REF'] = tmp['ref_allele'] 92 | vcf['ALT'] = tmp['alt_allele'] 93 | vcf['QUAL'] = '.' 94 | vcf['FILTER'] = 'PASS' 95 | vcf['INFO'] = '.' 
96 | 97 | Path(cancer_dir + "/BCF_TOOLS").mkdir(parents=True, exist_ok=True) 98 | # create the unique_variants.vcf, add vcf headers, and save the vars in it 99 | bcf_tools_file_path = os.path.abspath(cancer_dir + '/BCF_TOOLS/unique_variants.vcf') 100 | with open(bcf_tools_file_path, 'w') as file: 101 | file.write('##fileformat=VCFv4.0\n') 102 | for i in range(1, 23): 103 | file.write(f"##contig=\n") 104 | file.write("##contig=\n") 105 | file.write("##contig=\n") 106 | vcf.to_csv(bcf_tools_file_path, mode='a', index=False, sep='\t') 107 | os.system(f"cp {bcf_tools_file_path} {cancer_dir}unique_variants.vcf") 108 | 109 | 110 | def preprocess(): 111 | print("Preprocessing the input...") 112 | all_vars = process_col_stats() 113 | create_vcf(all_vars) 114 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from config import * 4 | from sklearn.model_selection import KFold 5 | from sklearn.ensemble import RandomForestClassifier 6 | import pickle 7 | from sklearn.metrics import precision_score 8 | from sklearn.metrics import recall_score 9 | import os 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def prepare_train_test(): 14 | os.system(f"rm -f {cancer_dir}/*_set.csv") 15 | final_cols_to_use = features + ['Chromosome', 'position', 'Tumor_Sample_Barcode', 'judgement', 'is_real_keep', 16 | 'ref_allele', 'alt_allele'] 17 | full_data = pd.read_csv(os.path.abspath(cancer_dir + "/all_vars_after_DNA_pon.csv"), 18 | usecols=final_cols_to_use,dtype={'Chromosome': 'str'}) 19 | full_data = full_data.drop_duplicates() 20 | 21 | # making sure all features are numeric and not nan 22 | for feature in features: 23 | full_data[feature] = pd.to_numeric(full_data[feature]) 24 | cur_mean = np.nanmean(full_data[feature], axis=0) 25 | full_data[feature] = np.where(full_data[feature].isna(), cur_mean, 
full_data[feature]) 26 | # split to train-test 27 | sample_names = full_data["Tumor_Sample_Barcode"].drop_duplicates() 28 | train_sample_names = set(np.random.choice(sample_names, num_train_samples, replace=False)) 29 | test_sample_names = set(sample_names) - train_sample_names 30 | assert (not any(name in train_sample_names for name in test_sample_names)) 31 | assert (not any(name in test_sample_names for name in train_sample_names)) 32 | test_set = full_data[full_data["Tumor_Sample_Barcode"].isin( 33 | test_sample_names)] 34 | assert (test_set['Tumor_Sample_Barcode'].drop_duplicates().shape[0] == ( 35 | full_data['Tumor_Sample_Barcode'].drop_duplicates().shape[0] - num_train_samples)) 36 | train_set = full_data[full_data["Tumor_Sample_Barcode"].isin(train_sample_names)] 37 | assert (train_set['Tumor_Sample_Barcode'].drop_duplicates().shape[0] == num_train_samples) 38 | # removing noise variants, keeping only germline (REJECT) or somatic (KEEP and is_real_keep=1) 39 | train_set = train_set[~(train_set['judgement'] == 'KEEP') | ~( 40 | train_set['is_real_keep'] == 0)] 41 | return train_set, test_set 42 | 43 | 44 | def train_models(train_set): 45 | if not os.path.isdir(models_path): 46 | os.mkdir(models_path) 47 | else: 48 | os.system(f"rm -f {models_path}/*") # clear old models 49 | if not os.path.isdir(results_path): 50 | os.mkdir(results_path) 51 | else: 52 | os.system(f"rm -f {results_path}/train*") # clear old results 53 | cv = KFold(n_splits=num_folds) 54 | train_samples = train_set['Tumor_Sample_Barcode'].drop_duplicates().tolist() 55 | precision_scores, recall_scores, validation_samples, folds = [], [], [], [] 56 | with open(os.path.abspath(results_path + "/train_results.txt"), 'w') as results: 57 | results.write("Training scores\n\n") 58 | # cross validation of the train samples 59 | for (train, validation), i in zip(cv.split(train_samples), range(1, num_folds + 1)): 60 | folds += [i] * len(validation) 61 | cur_train_samples = np.array(train_samples)[train] 
62 | cur_validation_samples = np.array(train_samples)[validation] 63 | validation_samples += cur_validation_samples.tolist() 64 | assert (not any(name in cur_train_samples for name in cur_validation_samples)) 65 | assert (not any(name in cur_validation_samples for name in cur_train_samples)) 66 | cur_train = train_set.loc[train_set["Tumor_Sample_Barcode"].isin(cur_train_samples), :] 67 | cur_validation = train_set.loc[train_set["Tumor_Sample_Barcode"].isin(cur_validation_samples), :].copy( 68 | deep=True) 69 | 70 | # train current model 71 | clf = RandomForestClassifier(n_estimators=50, random_state=42) 72 | clf.fit(cur_train[features], cur_train['is_real_keep']) 73 | model_path = os.path.abspath(f"{models_path}model_{str(i)}.json") 74 | pickle.dump(clf, open(model_path, 'wb')) 75 | 76 | # validation 77 | y_pred = clf.predict(cur_validation[features]) 78 | y_true = cur_validation['is_real_keep'] 79 | 80 | # calc model scores 81 | precision = precision_score(y_true, y_pred) 82 | recall = recall_score(y_true, y_pred) 83 | results.write(f"Model {i}:\n") 84 | results.write(f'\tPrecision score: {precision}\n') 85 | results.write(f'\tRecall score: {recall}\n\n') 86 | 87 | # calc scores per sample 88 | cur_validation.loc[:, 'pred'] = y_pred 89 | for sample in cur_validation_samples: 90 | cur_pred = cur_validation[cur_validation["Tumor_Sample_Barcode"] == sample]['pred'] 91 | cur_truth = cur_validation[cur_validation["Tumor_Sample_Barcode"] == sample]['is_real_keep'] 92 | precision_scores.append(precision_score(cur_truth, cur_pred)) 93 | recall_scores.append(recall_score(cur_truth, cur_pred)) 94 | del cur_truth 95 | del cur_pred 96 | del cur_train 97 | del cur_validation 98 | del cur_train_samples 99 | del cur_validation_samples 100 | # save scores per sample file 101 | scores = pd.DataFrame( 102 | {'Fold': folds, 'Sample': validation_samples, 'Precision': precision_scores, 'Recall': recall_scores}) 103 | scores.to_csv(os.path.abspath(results_path + 
"/train_scores_per_sample.csv"), index=False) 104 | assert (scores.shape[0] == scores['Sample'].drop_duplicates().shape[0]) 105 | # save results boxplot 106 | fig = plt.figure() 107 | fig.suptitle(f'Validation Set (n={num_train_samples})', fontsize=20) 108 | scores[['Precision', 'Recall']].boxplot(grid=False) 109 | fig.savefig(os.path.abspath(results_path + "/train_boxplot")) 110 | --------------------------------------------------------------------------------