├── garmire_SSrGE
│   ├── __init__.py
│   ├── examples.py
│   ├── config.py
│   ├── generate_refgenome_index.py
│   ├── load_data.py
│   ├── linear_cross_validation.py
│   ├── extract_data.py
│   ├── extract_matrices_from_dataset.py
│   ├── multiprocess_fitting.py
│   └── ssrge.py
├── requirements.txt
├── garmire_SNV_calling
│   ├── __init__.py
│   ├── bash_utils.py
│   ├── make_GSMID_sampleID_csv.py
│   ├── check_star_overall_quality.py
│   ├── check_fastqc_stats.py
│   ├── generate_bsseeker_genome_index.py
│   ├── generate_STAR_genome_index.py
│   ├── deploy_BSseeker_call_methylation.py
│   ├── process_multiple_generic.py
│   ├── process_snv_calling_with_monovar.py
│   ├── compute_frequency_matrix.py
│   ├── deploy_bismark.py
│   ├── parse_10x_bam_file_to_fastq_files.py
│   ├── process_annotate_snv.py
│   ├── deploy_BSseeker.py
│   ├── process_fastqc_report.py
│   ├── process_multiple_snv.py
│   ├── deploy_star.py
│   ├── process_freebayes.py
│   ├── config.py
│   └── process_snv_GATK.py
├── garmire_download_ncbi_sra
│   ├── __init__.py
│   ├── argv.py
│   ├── remove_sra.py
│   ├── extract_data.py
│   ├── config.py
│   ├── download_data.py
│   └── download_soft_file.py
├── img
│   └── workflow.png
├── .bumpversion.cfg
├── .gitignore
├── setup.py
├── test
│   ├── test_snv_optional.py
│   ├── test_download.py
│   ├── test_extract_matrices.py
│   ├── test_snv.py
│   └── test_ssrge.py
├── README_download_ncbi_rsa.md
├── README_snv_calling.md
├── example
│   └── jones_pancreatic_cancer.soft
└── README.md

--------------------------------------------------------------------------------
/garmire_SSrGE/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
-e .

--------------------------------------------------------------------------------
/garmire_SNV_calling/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/img/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lanagarmire/SSrGE/HEAD/img/workflow.png

--------------------------------------------------------------------------------
/.bumpversion.cfg:
--------------------------------------------------------------------------------
[bumpversion]
files = ./setup.py
commit = True
tag = True
current_version = 2.0.2

--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/argv.py:
--------------------------------------------------------------------------------
"""
Instantiate the arguments passed on the command line.
Also a helper to print usage when -H or -h is passed as argument
"""

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.nt
*.err
*.out
*.csv
*egg-info*
build/*
dist/*
*data/*
*~
*tmp/*
*#*
*.cache*
*/slurm/bash_variable.sh
--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/remove_sra.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

"""
remove sra files
"""

from garmire_download_ncbi_sra.config import PATH_DATA
from os import popen


def main():
    rm_sra()

def rm_sra():
    """remove the downloaded .sra files"""
    path_seq = PATH_DATA + '/fastq/'
    popen('rm {0}*.sra'.format(path_seq)).read()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
import sys, os

VERSION = '2.0.2'

setup(name='garmire_SSrGE',
      version=VERSION,
      description="compute SNV from RNA-seq following GATK recommendations",
      long_description="""""",
      classifiers=[],
      keywords='',
      author='Olivier Poirion (PhD)',
      author_email='opoirion@hawaii.edu',
      url='',
      license='MIT',
      packages=find_packages(exclude=['examples', 'tests']),
      include_package_data=True,
      zip_safe=False,
      install_requires=[
          'numpy',
          'scipy',
          'scikit-learn',
          'tabulate'],
      )

--------------------------------------------------------------------------------
/garmire_SNV_calling/bash_utils.py:
--------------------------------------------------------------------------------
from subprocess import call

from sys import stdout as STDOUT


def printf(msg):
    """ """
    print(msg)


def exec_cmd(cmd, stdout=STDOUT):
    """ """
    if stdout is None:
        stdout = STDOUT

    try:
        answer = call(cmd.split(), stdout=stdout)
    except Exception:
        raise Exception('error when launching {0}\ncannot execute the command!'.format(cmd))

    try:
        assert(answer == 0)
    except Exception:
        raise Exception('{0} returned a non-zero code!'.format(cmd))

    call('echo ### cmd: {0} successful ###\n'.format(cmd).split(), stdout=stdout)
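A minimal usage sketch for `exec_cmd` above (the command string is a placeholder; any binary available on the PATH works the same way):

```python
from garmire_SNV_calling.bash_utils import exec_cmd

# output is streamed to stdout by default; an exception is raised if the
# command cannot be launched or exits with a non-zero code
exec_cmd('samtools --version')
```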
--------------------------------------------------------------------------------
/test/test_snv_optional.py:
--------------------------------------------------------------------------------
import unittest

from garmire_SNV_calling.config import JAVA
from garmire_SNV_calling.config import SNPEFF
from garmire_SNV_calling.config import FEATURE_COUNT
from garmire_SNV_calling.config import FASTQC

from commands import getstatusoutput


class TestPackage(unittest.TestCase):
    """ """
    def test_snpeff(self):
        """assert that snpEff is installed"""
        self.assertFalse(getstatusoutput("{0} -jar {1} -version".format(JAVA, SNPEFF))[0])

    def test_featurecount(self):
        """assert that featureCounts is installed"""
        self.assertFalse(getstatusoutput("{0} -v".format(FEATURE_COUNT))[0])

    def test_fastqc(self):
        """assert that fastQC is installed"""
        self.assertFalse(getstatusoutput("{0} -version".format(FASTQC))[0])


if __name__ == "__main__":
    unittest.main()

--------------------------------------------------------------------------------
/garmire_SNV_calling/make_GSMID_sampleID_csv.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

"""
read the soft summary file from the GEO page and map each GSM ID to its cell name
"""

from garmire_SNV_calling.config import GLOBAL_DATA_ROOT
from garmire_SNV_calling.config import PROJECT_NAME
from garmire_SNV_calling.config import SOFT_PATH

from os.path import isfile

import re

def main():
    csv_path = "{0}/{1}/{1}.csv"\
               .format(GLOBAL_DATA_ROOT, PROJECT_NAME)

    if not isfile(SOFT_PATH):
        print "error! no file: {0}".format(SOFT_PATH)
        return 1

    f_soft = open(SOFT_PATH, 'r').read()
    f_csv = open(csv_path, 'w')

    gsm_list = re.findall("(?<=\^SAMPLE \= )\w+", f_soft)
    id_list = re.findall("(?<=!Sample_title \= ).+(?!\n)", f_soft)

    for gsm, ids in zip(gsm_list, id_list):
        ids = ids.replace(' ', '_')
        f_csv.write("{0};{1}\n".format(gsm, ids))

    print "done"

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/garmire_SNV_calling/check_star_overall_quality.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

""" check overall statistics for all log files from the STAR aligner"""

from os import listdir
from os.path import isfile
import re

from garmire_SNV_calling.config import OUTPUT_PATH_STAR
from garmire_SNV_calling.config import PATH_OUTPUT


def main():
    make_aligner_quality_csv()

def make_aligner_quality_csv():
    regex = "(?<=Uniquely mapped reads \% \|\t)[0-9]+\.[0-9]+"
    regex = re.compile(regex)

    stats = {}

    for folder in listdir(OUTPUT_PATH_STAR):
        log_file = "{0}/{1}/Log.final.out"\
                   .format(OUTPUT_PATH_STAR, folder)

        if not isfile(log_file):
            continue

        stats[folder] = regex.findall(
            open(log_file, 'r').read())[0]

    f_csv = open(PATH_OUTPUT + '/aligner_unique_read.csv', 'w')

    for key in stats:
        f_csv.write('{0};{1}\n'.format(key, stats[key]))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/garmire_SNV_calling/check_fastqc_stats.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

""" check overall statistics for all log files from the fastqc reports"""

from os import listdir
from os.path import isfile
import re

from collections import Counter

from garmire_SNV_calling.config import PATH_OUTPUT

PATH_OUTPUT_FASTQC = PATH_OUTPUT + '/fastqc/data/'


def main():
    make_aligner_quality_csv()

def make_aligner_quality_csv():
    """ """
    regex_status = "(?<=Sequence Duplication Levels\t)\w+"
    regex_status = re.compile(regex_status)

    stats_status = {}

    for folder in listdir(PATH_OUTPUT_FASTQC):
        log_file = "{0}/{1}/fastqc_data.txt"\
                   .format(PATH_OUTPUT_FASTQC, folder)

        if not isfile(log_file):
            continue

        read = open(log_file, 'r').read()
        status = regex_status.findall(read)[0]
        sample = folder.rsplit('_fastqc', 1)[0]
        stats_status[sample] = status

    f_csv = open(PATH_OUTPUT + '/deduplicated_check.csv', 'w')

    for key in stats_status:
        f_csv.write('{0};{1}\n'.format(key, stats_status[key]))


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/test/test_download.py:
--------------------------------------------------------------------------------
import unittest

from os import popen

from os.path import isfile
from os.path import isdir

from garmire_download_ncbi_sra.config import FASTQ_DUMP
from garmire_download_ncbi_sra.config import PATH_DATA
from garmire_download_ncbi_sra.config import PATH_SOFT

from garmire_download_ncbi_sra.download_data import get_urls

import urllib2

class TestPackage(unittest.TestCase):
    """ """
    def test_fastq_dump(self):
        """assert that fastq-dump exists"""
        self.assertIsNotNone(popen(FASTQ_DUMP))

    def test_is_path(self):
        """assert that the data folder exists"""
        self.assertTrue(isdir(PATH_DATA))

    def test_is_soft(self):
        """assert that the soft file exists"""
        self.assertTrue(isfile(PATH_SOFT))

    def test_is_urls(self):
        """assert that urls can be extracted from soft files"""
        urls = get_urls()

        self.assertTrue(len(urls))

    def test_connect_to_urls(self):
        """assert that the first url can be reached"""
        urls = get_urls()
        gsm, url = urls[0]

        self.assertTrue(gsm.count('GSM'))
        self.assertTrue(urllib2.urlopen(url))


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/garmire_SNV_calling/generate_bsseeker_genome_index.py:
--------------------------------------------------------------------------------
"""generate BS-Seeker2 GENOME INDEX"""

from sys import stdout as sys_stdout
from os import popen
from os import mkdir
from os.path import isdir
from os.path import split as pathsplit

from distutils.dir_util import mkpath

from garmire_SNV_calling.config import BSSEEKER2_REP
from garmire_SNV_calling.config import BSSEQ_INDEX_PATH
from garmire_SNV_calling.config import REF_GENOME
from garmire_SNV_calling.config import PYTHON
from garmire_SNV_calling.config import BOWTIE_REP


################ VARIABLE ################
REF_GENOME_PATH = pathsplit(REF_GENOME)[0]
##########################################


def main():
    """ """
    bsseq_index_path = BSSEQ_INDEX_PATH
    print "######## computing BS-seq index ########\npath:{0}\n"\
        .format(bsseq_index_path)

    if not isdir(bsseq_index_path):
        mkpath(bsseq_index_path)

    cmd = "{0} {1}/bs_seeker2-build.py -f {2}"\
          " --aligner=bowtie2 -p {3} --db {4} -r"\
          .format(PYTHON,
                  BSSEEKER2_REP,
                  REF_GENOME,
                  BOWTIE_REP,
                  REF_GENOME_PATH
                  )

    stdout = popen(cmd)
    c = stdout.read(1)

    while c:
        sys_stdout.write(c)
        sys_stdout.flush()
        c = stdout.read(1)

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/garmire_SNV_calling/generate_STAR_genome_index.py:
--------------------------------------------------------------------------------
"""generate STAR GENOME INDEX"""

from sys import stdout as sys_stdout
from os import popen

from os.path import isdir
from distutils.dir_util import mkpath

from garmire_SNV_calling.config import PATH_STAR_SOFTWARE
from garmire_SNV_calling.config import STAR_INDEX_PATH
from garmire_SNV_calling.config import ANNOTATION_PATH
from garmire_SNV_calling.config import REF_GENOME
from garmire_SNV_calling.config import STAR_THREADS
from garmire_SNV_calling.config import STAR_INDEX_READ_LENGTH


def main():
    """ """
    star_index_path = "{0}READ{1}/".format(STAR_INDEX_PATH.rstrip('/'),
                                           STAR_INDEX_READ_LENGTH)
    print "######## computing STAR index ########\npath:{0}\n"\
        .format(star_index_path)

    if not isdir(star_index_path):
        mkpath(star_index_path)

    cmd = "{0} --runMode genomeGenerate --runThreadN {1}"\
          " --genomeDir {2} --genomeFastaFiles {3} --sjdbGTFfile {4}"\
          " --sjdbOverhang {5}"\
          .format(
              PATH_STAR_SOFTWARE,
              STAR_THREADS,
              star_index_path,
              REF_GENOME,
              ANNOTATION_PATH,
              STAR_INDEX_READ_LENGTH
          )
    stdout = popen(cmd)
    c = stdout.read(1)

    while c:
        sys_stdout.write(c)
        sys_stdout.flush()
        c = stdout.read(1)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/extract_data.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

"""
extract sra files
"""
from os.path import isdir
from os import mkdir
from os import listdir
from os import popen

from fnmatch import fnmatch

from garmire_download_ncbi_sra.config import PATH_DATA
from garmire_download_ncbi_sra.config import FASTQ_DUMP
from garmire_download_ncbi_sra.config import FASTQ_DUMP_OPTION
from garmire_download_ncbi_sra.config import LIMIT
from garmire_download_ncbi_sra.config import NB_CPU


from multiprocessing import Pool


############ VARIABLE ############
PATH_SEQ = PATH_DATA + '/fastq/'
##################################


def main():
    fastq_dump()

def fastq_dump():
    """extract the sra files"""
    count = 0

    print('extracting .sra files into: {0}'.format(PATH_SEQ))

    file_list = []

    for fil in listdir(PATH_SEQ):
        if not fnmatch(fil, '*.sra'):
            continue

        file_list.append(fil)

        count += 1

        if LIMIT and count > LIMIT:
            break

    pool = Pool(NB_CPU)

    pool.map(_fastq_dump, file_list)

def _fastq_dump(fil):
    """ """
    print('go to extraction for file:', fil)
    fil = fil.rsplit('.', 1)[0]

    if not isdir("{0}/{1}".format(PATH_SEQ, fil)):
        mkdir("{0}/{1}".format(PATH_SEQ, fil))
    popen('{3} {2} -v {0}/{1}.sra -O {0}/{1}/'\
          .format(PATH_SEQ, fil, FASTQ_DUMP_OPTION, FASTQ_DUMP)).read()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/garmire_SSrGE/examples.py:
--------------------------------------------------------------------------------
""" example """

from scipy.sparse import csr_matrix
import numpy as np


def create_example_matrix_v1(nb_cells=100, nb_snvs=6, nb_genes=5):
    """
    create a random feature matrix and infer Y according to the coefs W
    Six sparse coefs are set into W
    """
    X = csr_matrix(np.random.random((nb_cells, nb_snvs)))
    W = np.zeros((nb_snvs, nb_genes))


    W[0][1] = 5
    W[0][0] = 5
    W[1][1] = 5
    W[1][0] = 5
    W[3][3] = 2
    W[5][4] = 6

    Y = (X * W)

    return X, Y, W

def create_example_matrix_v2(nb_cells=100, nb_snvs=6, nb_genes=5):
    """
    create a random feature matrix and infer Y according to the coefs W
    Four sparse coefs are set into W
    create a fake snv list and a fake gene list
    """
    gene_list = ['KRAS',
                 'HLA-A',
                 'HLA-B',
                 'HLA-C',
                 'SPARC',
                 'SARAF',
                 'EIF3K',
                 'ALDH',
                 ]


    X = csr_matrix(np.random.random((nb_cells, nb_snvs)))
    W = np.zeros((nb_snvs, nb_genes))

    gene_id_list = [gene_list[i] if i < len(gene_list) else i
                    for i in range(nb_genes)]

    snv_id_list = [(gene_id_list[i], i)
                   if i < nb_genes else (gene_id_list[0], i)
                   for i in range(nb_snvs)]

    W[0][0] = 5
    W[1][0] = 5
    W[3][3] = 2
    W[5][4] = 6

    Y = (X * W)

    return X, Y, gene_id_list, snv_id_list

def create_example_matrix_v3(nb_cells=100, nb_snvs=6, nb_genes=5):
    """
    create a random feature matrix and infer Y according to the coefs W
    Six sparse coefs are set into W
    Additionally, create a CNV matrix using Y
    """
    X, Y, W = create_example_matrix_v1()
    C = np.random.randint(0, 10, (Y.shape))

    return X, Y, C, W

def create_example_matrix_v4(nb_cells=100, nb_snvs=6, nb_genes=5):
    """
    create a random feature matrix and infer Y according to the coefs W
    Four sparse coefs are set into W
    Additionally, create a CNV matrix, a fake snv list and a fake gene list using Y
    """
    X, Y, gene_id_list, snv_id_list = create_example_matrix_v2()
    C = np.random.randint(0, 10, (Y.shape))

    return X, Y, C, gene_id_list, snv_id_list
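A minimal sketch of how these example matrices feed the SSrGE model, mirroring the calls exercised in ./test/test_ssrge.py:

```python
from garmire_SSrGE.examples import create_example_matrix_v1
from garmire_SSrGE.ssrge import SSrGE

X, Y, W = create_example_matrix_v1()  # X: cells x SNVs (sparse), Y: cells x genes

ssrge = SSrGE(alpha=0.01)
ssrge.fit(X, Y)

Xr = ssrge.transform(X)        # X restricted to the inferred eeSNVs
ranked = ssrge.rank_eeSNVs()   # eeSNVs sorted by their inferred weights
```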
--------------------------------------------------------------------------------
/test/test_extract_matrices.py:
--------------------------------------------------------------------------------
import unittest

from garmire_SSrGE.extract_matrices_from_dataset import ExtractMatrix

from garmire_SSrGE.config import EXPRESSION_MATRIX_FOLDER_PATH
from garmire_SSrGE.config import VCF_FOLDER_PATH
from garmire_SSrGE.config import GTF_PATH
from garmire_SSrGE.config import INDEX_SAVE_PATH

from garmire_SSrGE.generate_refgenome_index import main as make_index

from os.path import isdir
from os.path import isfile

import warnings


class TestPackage(unittest.TestCase):
    """ """
    def test_gtf_path(self):
        """
        test if the GTF path defined in config exists
        """
        self.assertTrue(isfile(GTF_PATH))

    def test_gtf_index(self):
        """
        test the gtf index creation
        """
        self.assertTrue(make_index())

    def test_gtf_index_path(self):
        """
        test the gtf index path
        """
        self.assertTrue(INDEX_SAVE_PATH)

    def test_vcf_dir_exists(self):
        """
        test if the vcf directory defined in config exists
        """
        self.assertTrue(isdir(VCF_FOLDER_PATH))

    def test_snv_matrices(self):
        """
        test the SNV matrix extraction on one sample
        """
        extract_matrix = ExtractMatrix(limit=1)
        matrix = extract_matrix.extract_SNV_mat()

        if isinstance(matrix, type(None)):
            warnings.warn(
                'SNV matrix is None because the vcf folder is not defined!')
            return

        self.assertTrue(matrix.shape)

    def test_expression_matrix_dir_exists(self):
        """
        test if the expression matrix directory defined in config exists
        """
        self.assertTrue(isdir(EXPRESSION_MATRIX_FOLDER_PATH))

    def test_ge_matrices(self):
        """
        test the gene expression matrix extraction on one sample
        """
        extract_matrix = ExtractMatrix(limit=1)
        matrix = extract_matrix.extract_GE_mat()

        if isinstance(matrix, type(None)):
            warnings.warn(
                'gene expression matrix is None because the GE folder is not defined!')
            return

        self.assertTrue(matrix.shape)


if __name__ == "__main__":
    unittest.main()

--------------------------------------------------------------------------------
/garmire_SNV_calling/deploy_BSseeker_call_methylation.py:
--------------------------------------------------------------------------------
""" """

from multiprocessing import Pool

from os import popen

from os.path import isdir
from os.path import isfile
from os.path import split as pathsplit

from glob import glob

from time import sleep
from random import random

from fnmatch import fnmatch

from distutils.dir_util import mkpath

from garmire_SNV_calling.config import PATH_OUTPUT
from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN
from garmire_SNV_calling.config import BSSEEKER2_REP
from garmire_SNV_calling.config import REF_GENOME
from garmire_SNV_calling.config import PYTHON


################ VARIABLE ##################################

BAM_PATH = PATH_OUTPUT + '/BSseeker/'
PROCESS_THREADS = 2
BISMARK_OPTION = ''
REF_GENOME_PATH = pathsplit(REF_GENOME)[0]
REF_GENOME_RRBS_DB = REF_GENOME_PATH + '/genome.fa_rrbs_20_500_bowtie2/'
############################################################


sleep(2 * random())
if not isdir(BAM_PATH):
    mkpath(BAM_PATH)


def main():
    pool = Pool(PROCESS_THREADS)
    # process_one_file(glob(BAM_PATH + '/*')[0])
    pool.map(process_one_file, glob(BAM_PATH + '/*'))

def process_one_file(folder):
    """ """
    print(folder)
    if isfile(folder):
        return False

    if PATTERN and not fnmatch(folder, PATTERN):
        return False

    print("====> folder to be processed:", folder)

    input_bam_file_name = glob(folder + '/*.bam')

    if not input_bam_file_name:
        print('no bam file detected for :{0}\nskipping...'\
              .format(folder))
        return False

    if len(input_bam_file_name) > 1:
        print('multiple bam files detected: {0}. selecting the first'.format(
            input_bam_file_name))

    input_bam = input_bam_file_name[0]
    output_file = input_bam.rsplit('.', 1)[0] + '.CpG.CGmap'

    cmd = "{0} {1}/bs_seeker2-call_methylation.py -i {2} --CGmap {3} --db {4} --txt " \
        .format(PYTHON,
                BSSEEKER2_REP,
                input_bam,
                output_file,
                REF_GENOME_RRBS_DB,
                )

    _run_cmd(cmd)
    _run_cmd('rm {0}/*_sorted*'.format(folder))

    return True


def _run_cmd(cmd, *args):
    """run cmd"""
    popen(cmd).read()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/garmire_SSrGE/config.py:
--------------------------------------------------------------------------------
from garmire_SNV_calling.config import ANNOTATION_PATH
from garmire_SNV_calling.config import PATH_OUTPUT as PROJECT_PATH
from garmire_SNV_calling.config import SOFT_PATH as SNV_CALLING_SOFT_PATH

"""
CONFIG file for SSrGE

Principal default values for SSrGE class parameters

The config file also gives the parameters used to
extract the SNV and GE matrices from a given project

"""

######## SSrGE VARIABLE ##############################
TIME_LIMIT = 5
# time limit for one linear regression model
MIN_OBS_FOR_REGRESS = 10
# Min number of cells having non null gene expression
# to infer a sparse linear model
NB_THREADS = 4
# Number of threads to run in parallel
CROSS_VAL_NFOLD = 5
# Number of folds to perform the cross validation
######################################################


##################################### vcf and expression data #######################
# Paths used to create the SNVs and the gene expression matrices
# The folder architecture used by default is the one from the SNV calling package
# see the file ./garmire_SSrGE/garmire_SNV_calling/config.py
# All the paths defined below can be overwritten with a user defined path instead

# path to save the GTF index
GTF_PATH = ANNOTATION_PATH
# path used in the SNV_calling module
SOFT_PATH = SNV_CALLING_SOFT_PATH # OPTIONAL, path of the .soft file from ncbi
# internal index used to link SNVs and genes
INDEX_SAVE_PATH = "{0}/gtf_index/".format(PROJECT_PATH)

# the path for the folders containing the expression matrix files
# one folder per single cell and each folder contains a unique expression matrix (.txt) file
EXPRESSION_MATRIX_FOLDER_PATH = '/STAR/'
# the name of the gene expression matrix present inside each single-cell folder
GENE_MATRIX_NAME = 'matrix_counts.txt'

# the SNV caller used
USED_CALLER = 'MONOVAR' # {'MONOVAR', 'GATK'}

######################## Monovar caller ###############################################
# The folder containing the .vcf files produced by Monovar and the .txt input files
VCF_MONOVAR_PATH = '/data/monovar/'

######################## GATK caller ##################################################
# the path for the folders containing the .vcf files
# one folder per single cell and each single-cell folder contains a unique .vcf file
VCF_FOLDER_PATH = '/data/'
# the name of the file containing the vcf inside each folder
VCF_NAME = 'snv_filtered.vcf'
######################################################################################
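With the paths above configured, the SNV and gene expression matrices are obtained as in ./test/test_extract_matrices.py (a sketch):

```python
from garmire_SSrGE.extract_matrices_from_dataset import ExtractMatrix

extract_matrix = ExtractMatrix(limit=1)     # limit=1: process a single sample
SNV_mat = extract_matrix.extract_SNV_mat()  # None if the vcf folder is not defined
GE_mat = extract_matrix.extract_GE_mat()    # None if the GE folder is not defined
```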
--------------------------------------------------------------------------------
/garmire_SNV_calling/process_multiple_generic.py:
--------------------------------------------------------------------------------
""" generic class to perform multi-processing """

from multiprocessing import Process
from multiprocessing import Queue

from time import sleep


class MPI():
    """generic multiprocessing class"""
    def __init__(self, input_list,
                 ProcessClass,
                 nb_processes=1,
                 verbose=True):
        """
        input_list: list of inputs to be processed
        ProcessClass: class with a process method and an id attribute
        """
        self.input_queue = Queue()
        self.processes = []
        self.verbose = verbose
        self.nb_processes = nb_processes
        self.ProcessClass = ProcessClass

        for inpt in input_list:
            self.input_queue.put(inpt)

        for i in range(nb_processes):
            self.processes.append(
                MultiprocessingInstance(
                    input_queue=self.input_queue,
                    ProcessClass=ProcessClass,
                    id=i)
            )

    def _run(self):
        for p in self.processes:
            p.start()

        while self.input_queue.qsize():
            for p in self.processes:
                if p.exitcode:
                    raise KeyboardInterrupt
            sleep(1)

    def run(self):
        if self.verbose:
            rep = raw_input(
                'launching {0} processes with class {1} continue? (Y/n)'\
                .format(self.nb_processes, self.ProcessClass))
            if rep != 'Y':
                return

        try:
            self._run()

        except KeyboardInterrupt:
            for p in self.processes:
                p.terminate()

class MultiprocessingInstance(Process):
    """
    generic multiprocessing class
    """
    def __init__(self, input_queue, ProcessClass, id):
        """
        input_queue: Multiprocessing.Queue
        ProcessClass: class with a process method and an id attribute
        """
        self.input_queue = input_queue
        self.id = id
        self.process_instance = ProcessClass(id=id)
        Process.__init__(self)

    def run(self):
        while self.input_queue.qsize():
            try:
                sample = self.input_queue.get(True, 0.2)
            except Exception as e:
                print "exception:{0}".format(e)
                continue
            else:
                print "processing sample {0} with process id {1}"\
                    .format(sample, self.id)
                self.process_instance.process(sample)
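A sketch of driving MPI with a hypothetical ProcessClass; any class exposing an `id` attribute and a `process` method will do:

```python
from garmire_SNV_calling.process_multiple_generic import MPI


class EchoProcess():
    """hypothetical worker class"""
    def __init__(self, id):
        self.id = id

    def process(self, sample):
        print "worker {0} processing {1}".format(self.id, sample)


mpi = MPI(input_list=range(10),
          ProcessClass=EchoProcess,
          nb_processes=2,
          verbose=False)  # verbose=True would ask for confirmation first
mpi.run()
```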
--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/config.py:
--------------------------------------------------------------------------------
"""
NCBI RNAseq dataset downloader

**** CONFIG FILE ****

"""
from argparse import ArgumentParser

ARGPARSER = ArgumentParser(description='Argument for the SRA downloading pipeline',
                           prefix_chars='-')

ARGPARSER.add_argument('-project_name',
                       help='name of the project folder and where to find the fastq files (default: sample_test)',
                       default="sample_test",
                       metavar='str')

ARGPARSER.add_argument('-dl_nb_threads',
                       help='number of threads to be used to download the .sra files (default: 4)',
                       default=4,
                       type=int,
                       metavar='int')

ARGPARSER.add_argument('-nb_cpus',
                       help='number of CPU to be used to extract the .sra files',
                       default=4,
                       type=int,
                       metavar='int')

ARGPARSER.add_argument('-max_nb_samples',
                       help='max number of samples downloaded (default None)',
                       default=0,
                       type=int,
                       metavar='int')

ARGPARSER.add_argument('-soft_id',
                       help='SRA ID used to download the corresponding .soft file (example: "GSE79457")',
                       default="",
                       type=str,
                       metavar='str')

ARGS = ARGPARSER.parse_known_args()[0]

############ Variables #################################################
# The name of the project (defining the name of the folder)
PROJECT_NAME = ARGS.project_name
# The absolute path where the project will be created
# and the SRA files downloaded and extracted
PATH_DATA = "/data/results/{0}".format(PROJECT_NAME)
# path toward the .soft file (with the corresponding ftp addresses for the .sra files)
PATH_SOFT = "{0}/{1}.soft".format(PATH_DATA, PROJECT_NAME)
# number of threads to use for downloading sra files
NB_THREADS = ARGS.dl_nb_threads
# number of CPU to be used to extract the .sra files
NB_CPU = ARGS.nb_cpus
# path to the fastq-dump software
FASTQ_DUMP = "fastq-dump"
# options to use to extract the sra (using fastq-dump)
# "--split-3 -B" is the default and it is strongly recommended to keep it
FASTQ_DUMP_OPTION = "--split-3 -B"
# define the maximum number of sra files to be downloaded
LIMIT = ARGS.max_nb_samples
# soft ID
SOFT_ID = ARGS.soft_id
########################################################################
--------------------------------------------------------------------------------
/garmire_SNV_calling/process_snv_calling_with_monovar.py:
--------------------------------------------------------------------------------
from config import PATH_OUTPUT
from config import OUTPUT_PATH_STAR
from config import MONOVAR_REP
from config import MONOVAR_SAMTOOLS
from config import REF_GENOME
from config import PYTHON
from config import NB_PROCESS_SNV

from os.path import isdir
from os import mkdir

from glob import glob

from random import sample

from os import remove
from os import popen

import re

from multiprocessing import Pool


######## LOCAL VARIABLES ############################
PATH_MONOVAR = '{0}/monovar/'.format(PATH_OUTPUT)
CHUNK_SIZE = 20
THREAD_NB = 3
#####################################################


if not isdir(PATH_MONOVAR):
    mkdir(PATH_MONOVAR)


def main():
    """ """
    create_list_file()
    launch_monovar()

def launch_monovar():
    """
    """
    cmd_list = []

    for fil in glob('{0}/monovar_input*.txt'.format(PATH_MONOVAR)):

        cmd = '{0} mpileup -BQ0 -d10000 -f {1} -b {6} '\
              '| {3} {4}/src/monovar.py -p 0.002 -a 0.2 -t 0.05 -f {1}'\
              ' -b {6} -m {5} -o {7}.vcf'\
              .format(MONOVAR_SAMTOOLS,
                      REF_GENOME,
                      PATH_MONOVAR,
                      PYTHON,
                      MONOVAR_REP,
                      NB_PROCESS_SNV,
                      fil, fil.rsplit('.', 1)[0])

        cmd_list.append(cmd)

    pool = Pool(THREAD_NB)
    pool.map(_multiprocessing_func, cmd_list)

def _multiprocessing_func(cmd):
    """ """
    print('###### command launched:\n{0}\n########'.format(cmd))
    fil = re.findall('-f (?P<file>.+?) -b', cmd)[0]

    popen(cmd).read()

    print('monovar finished for: {0}'.format(fil))

def create_list_file():
    """
    create the input files used by monovar, each containing a chunk of the input bam files
    """
    for fil in glob('{0}/monovar_input*'.format(PATH_MONOVAR)):
        remove(fil)

    file_list = set()

    for folder in glob('{0}/*'.format(OUTPUT_PATH_STAR)):
        if not isdir(folder):
            continue

        file_list.add('{0}/Aligned.sortedByCoord.out.bam'.format(folder))

    nb_file = 0

    chunk_size = CHUNK_SIZE

    while file_list:
        if len(file_list) < chunk_size:
            chunk_size = len(file_list)

        sample_list = sample(file_list, chunk_size)
        file_list = file_list.difference(sample_list)

        f_input = open('{0}/monovar_input_{1}.txt'.format(PATH_MONOVAR, nb_file), 'w')

        for fil in sample_list:
            f_input.write('{0}\n'.format(fil))

        nb_file += 1


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/garmire_SSrGE/generate_refgenome_index.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

""" Generate a homemade reference genome gtf index using python objects """

from collections import defaultdict
import re
from time import time
import cPickle

from os.path import isdir
from distutils.dir_util import mkpath

from garmire_SSrGE.config import INDEX_SAVE_PATH
from garmire_SSrGE.config import GTF_PATH


def main():
    t = time()
    print('loading index...')
    index_start, index_end = load_indexed_gene_annotations()
    position_index = create_position_indexes(index_start, index_end)
    print('done in {0} s'.format(time() - t))
    save_indexes(index_start, index_end, position_index)

    return True

def save_indexes(index_start,
                 index_end,
                 position_index,
                 save_path=INDEX_SAVE_PATH):
    """ """
    if not isdir(save_path):
        r = raw_input("{0} doesn't exist create it? (y/N)".format(save_path))
        if r != 'y':
            return

        mkpath(save_path)

    with open(save_path + 'index_start.pickle', 'w') as f:
        cPickle.dump(index_start, f)
    with open(save_path + 'index_end.pickle', 'w') as f:
        cPickle.dump(index_end, f)
    with open(save_path + 'position_index.pickle', 'w') as f:
        cPickle.dump(position_index, f)
    print('data saved')

def create_position_indexes(index_start, index_end):
    """
    create ordered lists of the gene start and end positions,
    one index per chromosome
    """
    position_index = defaultdict(defaultdict)
    for key in index_start:
        position_index['start'][key] = sorted(index_start[key].keys())
    for key in index_end:
        position_index['end'][key] = sorted(index_end[key].keys())

    return position_index

def load_indexed_gene_annotations(gtf_path=GTF_PATH):
    """
    load index of genes according to chromosomes annotations:
    chr1 unknown exon 11874 12227 . + . gene_id "DDX11L1"; gene_name "DDX11L1"; transcript_id "NR_046018"; tss_id "TSS16932";
    chr1 unknown exon 12613 12721 . + . gene_id "DDX11L1"; gene_name "DDX11L1"; transcript_id "NR_046018"; tss_id "TSS16932";
    """
    regex = re.compile('gene\_id "(?P<gene>.+)"\; gene')
    f = open(gtf_path, "r")
    index_start = defaultdict(dict)
    index_end = defaultdict(dict)

    for line in f:
        line = line.split('\t')

        if int(line[3]) not in index_start[line[0]]:
            index_start[line[0]][int(line[3])] = []

        if int(line[4]) not in index_end[line[0]]:
            index_end[line[0]][int(line[4])] = []


        index_start[line[0]][int(line[3])].append(
            (int(line[4]),
             regex.findall(line[8])[0]))
        index_end[line[0]][int(line[4])].append(
            (int(line[3]),
             regex.findall(line[8])[0]))

    return index_start, index_end


if __name__ == "__main__":
    main()
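Applied to the two GTF lines quoted in the docstring of `load_indexed_gene_annotations`, the resulting indexes look like this (a sketch):

```python
index_start, index_end = load_indexed_gene_annotations()
position_index = create_position_indexes(index_start, index_end)

# index_start['chr1'][11874] == [(12227, 'DDX11L1')]
# index_end['chr1'][12721] == [(12613, 'DDX11L1')]
# position_index['start']['chr1'] == [11874, 12613]
```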
--------------------------------------------------------------------------------
/garmire_SSrGE/load_data.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

"""load data """

from garmire_SSrGE.config import PROJECT_PATH
from garmire_SSrGE.config import SOFT_PATH

from garmire_SSrGE.generate_refgenome_index import INDEX_SAVE_PATH
from garmire_SSrGE.generate_refgenome_index import main as generate_refgenome

import cPickle
import re

from time import time
from os.path import isfile

from collections import defaultdict


ONLY_KNOWN_SNV = False  # if True, keep only SNVs with a known id (assumed default)


def load_indexes(path_indexes=INDEX_SAVE_PATH):
    t = time()

    if not isfile(path_indexes + 'index_start.pickle'):
        print('indexes not found. Creating indexes...')
        generate_refgenome()

    with open(path_indexes + 'index_start.pickle', 'r') as f:
        index_start = cPickle.load(f)
    with open(path_indexes + 'index_end.pickle', 'r') as f:
        index_end = cPickle.load(f)
    with open(path_indexes + 'position_index.pickle', 'r') as f:
        position_index = cPickle.load(f)

    print('gene position indexes loaded in {0} s'.format(time() - t))
    return index_start, index_end, position_index

def process_line_from_vcf_file(line):
    """ process one line from the vcf file"""

    if line[0] == '#':
        return
    line = line.split('\t')

    snv_id = None

    # process only passed SNVs
    if line[6] != 'PASS':
        return

    # take the annotation
    if line[2] != '.':
        snv_id = line[2]

    chrid, start = line[0], int(line[1])
    end = start
    return chrid, start, end, snv_id

def load_gsm_and_sample_names_from_soft(soft_path=SOFT_PATH):
    """
    load GSM and sample names from soft

    return:
        dict(GSM:sample name)
    """
    if not soft_path:
        return defaultdict(str)

    regex_gsm = re.compile("(?<=\^SAMPLE = )GSM[0-9]+")
    regex_name = re.compile("(?<=!Sample_title = ).+(?=\n)")

    if not isfile(soft_path):
        return {}

    read = open(soft_path, 'r').read()

    gsms = regex_gsm.findall(read)
    names = regex_name.findall(read)

    return defaultdict(str, zip(gsms, names))

def process_line_from_annotated_vcf_file(line):
    """ process one line from the annotated vcf file"""

    if line[0] == '#':
        return
    line = line.split('\t')

    # process only passed SNVs
    if line[6] != 'PASS':
        return

    if ONLY_KNOWN_SNV:
        # process only SNVs with a known id
        if line[2] == '.':
            return
    results = []
    annotations = line[7].split(';')[-1].split(',')

    for annotation in annotations:
        annotation = annotation.split('|')

        if len(annotation) < 7:
            continue
        if not annotation[1]:
            continue

        result = {'type': annotation[1],
                  'impact': annotation[2],
                  'gene impacted': annotation[3],
                  'feature type': annotation[5],
                  'biotype': annotation[7]
                  }
        results.append(result)
    chrid, start = line[0], int(line[1])
    return (annotation[3], start), results
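For reference, the two regexes in `load_gsm_and_sample_names_from_soft` behave as follows on a minimal, hypothetical .soft excerpt:

```python
import re

regex_gsm = re.compile("(?<=\^SAMPLE = )GSM[0-9]+")
regex_name = re.compile("(?<=!Sample_title = ).+(?=\n)")

# a two-line .soft excerpt (hypothetical content)
soft = "^SAMPLE = GSM123456\n!Sample_title = cell_A1\n"

print zip(regex_gsm.findall(soft), regex_name.findall(soft))
# -> [('GSM123456', 'cell_A1')]
```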
--------------------------------------------------------------------------------
/garmire_SNV_calling/compute_frequency_matrix.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

from os.path import isfile
from os.path import isdir

from os import listdir

from os.path import getsize
from distutils.dir_util import mkpath

from garmire_SNV_calling.config import FEATURE_COUNT
from garmire_SNV_calling.config import ANNOTATION_PATH
from garmire_SNV_calling.config import MATRIX_OUTPUT_PATH as OUTPUT_PATH
from garmire_SNV_calling.config import OUTPUT_PATH_STAR as STAR_PATH
from garmire_SNV_calling.config import STAR_THREADS

from multiprocessing import Pool

from garmire_SNV_calling.bash_utils import exec_cmd


############ VARIABLE ################
DEFAULT_ALIGNER = 'STAR'

OUTPUT_FILENAME = {
    'STAR': 'Aligned.sortedByCoord.out.bam'
}

PATH_DICT = {
    'STAR': STAR_PATH
}
######################################


def main():
    if DEFAULT_ALIGNER not in OUTPUT_FILENAME.keys():
        raise Exception('{0} not a regular aligner!'\
                        .format(DEFAULT_ALIGNER))

    aligner_path = PATH_DICT[DEFAULT_ALIGNER]
    output_filename = OUTPUT_FILENAME[DEFAULT_ALIGNER]

    do_expression_profile(aligner_path, output_filename)


def do_expression_profile(aligner_path, output_filename):
    """
    compute expression matrix according to aligner path results
    and output_filename (ex: output.bam)
    """
    cmd_list = []

    for folder in listdir(aligner_path):

        if not isdir(aligner_path + folder):
            print('not a folder! continuing', folder)
            continue

        bam_file = "{0}/{1}/{2}"\
                   .format(aligner_path, folder, output_filename)

        if not isfile(bam_file):
            print('no bam file for {0}'.format(bam_file))
            continue

        out_folder = "{0}/{1}/{2}"\
                     .format(OUTPUT_PATH, DEFAULT_ALIGNER, folder)

        out_file = '{0}/{1}'.format(out_folder, "matrix_counts.txt")

        if isfile(out_file) and getsize(out_file):
            print('expression matrix already exists for: {0}'.format(out_folder))
            continue

        cmd_list.append((bam_file, out_folder))

    pool = Pool(STAR_THREADS)
    pool.map(_multiprocess_func, cmd_list)


def _multiprocess_func(inp):
    """ """
    bam_file, out_folder = inp
    bam_file_to_expression_matrix(bam_file, out_folder)


def bam_file_to_expression_matrix(
        bam_file,
        out_folder,
        feature_count=FEATURE_COUNT,
        annotation_path=ANNOTATION_PATH,
        stdout=None,
        matrix_name="matrix_counts.txt"):
    """ """
    if not isdir(out_folder):
        mkpath(out_folder)

    cmd = "{0} -pPBCM --primary -T 1 -a {1} -o {2}/{4}"\
          " {3}".format(feature_count,
                        annotation_path,
                        out_folder,
                        bam_file,
                        matrix_name)
    print('launching cmd: {0}\n'.format(cmd))
    try:
        exec_cmd(cmd, stdout)
    except Exception as e:
        print('exception with featureCount cmd: {0}\n'.format(cmd))
        print('exception: {0}\n'.format(e))

    assert(isfile('{0}/{1}'.format(out_folder, matrix_name)))


if __name__ == "__main__":
    main()
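`bam_file_to_expression_matrix` can also be called directly on a single sample (the paths below are placeholders):

```python
from garmire_SNV_calling.compute_frequency_matrix import bam_file_to_expression_matrix

bam_file_to_expression_matrix(
    bam_file='/data/results/test/star/GSM0000000/Aligned.sortedByCoord.out.bam',
    out_folder='/data/results/test/matrices/STAR/GSM0000000')
```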
--------------------------------------------------------------------------------
/garmire_SNV_calling/deploy_bismark.py:
--------------------------------------------------------------------------------
from multiprocessing import Pool

from os import popen
from subprocess import Popen
from subprocess import PIPE

from os import listdir
from os import mkdir
from os.path import isdir
from os.path import isfile
from os.path import getsize
from os.path import split as pathsplit

import re

from sys import stdout as STDOUT

from glob import glob

from time import sleep
from random import random
from sys import argv
from sys import stdout

from fnmatch import fnmatch

from distutils.dir_util import mkpath

from garmire_SNV_calling.config import FASTQ_PATH
from garmire_SNV_calling.config import PATH_OUTPUT
from garmire_SNV_calling.config import REF_GENOME
from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN


################ VARIABLE ##################################
BISMARK_SOFTWARE = '/home/opoirion/prog/Bismark/bismark'
OUTPUT_PATH = PATH_OUTPUT + '/bismark/'
THREADS = 4
PROCESS_THREADS = 4
BISMARK_OPTION = ''
REF_GENOME_DIR = pathsplit(REF_GENOME)[0]
############################################################


sleep(2 * random())
if not isdir(OUTPUT_PATH):
    mkpath(OUTPUT_PATH)


def main():
    pool = Pool(PROCESS_THREADS)
    # process_one_file(listdir(FASTQ_PATH)[0])
    pool.map(process_one_file, listdir(FASTQ_PATH))

def process_one_file(fil):
    """ """
    if isfile(FASTQ_PATH + fil):
        return

    if PATTERN and not fnmatch(fil, PATTERN):
        return

    print "====> file to be aligned:", fil

    if not isdir(OUTPUT_PATH + fil):
        mkdir(OUTPUT_PATH + fil)

    bam_file_name = glob(OUTPUT_PATH + fil + '/*.bam')

    if bam_file_name \
       and getsize(bam_file_name[0]):
        print 'bam file result already exists for:{0}\nskipping...'\
            .format(bam_file_name[0])
        return

    fastq_str = ""

    fastq_files = list(set(glob(FASTQ_PATH + fil + '/*.fastq')))
    print 'fastq files found: {0}'.format(fastq_files)

    if len(fastq_files) > 2:
        print 'too many fastq files!'
        return

    elif len(fastq_files) == 2:
        fastq_1 = [fastq for fastq in fastq_files
                   if re.match('.+_1\.fastq', fastq)]

        assert(fastq_1)

        fastq_1 = fastq_1[0]

        fastq_2 = [fastq for fastq in fastq_files
                   if re.match('.+_2\.fastq', fastq)]
        assert(fastq_2)

        fastq_2 = fastq_2[0]
        fastq_str = ' -1 {0} -2 {1} '.format(fastq_1, fastq_2)

    elif len(fastq_files) == 1:
        fastq_str = ' {0} '.format(fastq_files[0])

    stdout = open(OUTPUT_PATH + fil + "/log.out", 'w')

    if not fastq_str:
        print 'no fastq file found for:{0}!\nskipping'.format(fil)
        return

    cmd = "{0} -p {1} -o {2} --temp_dir {2} {3} --genome {4} {5} > {2}/stdlog.out" \
        .format(BISMARK_SOFTWARE,
                THREADS,
                OUTPUT_PATH + fil + "/",
                BISMARK_OPTION,
                REF_GENOME_DIR,
                fastq_str
                )

    _run_cmd(cmd, stdout)


def _run_cmd(cmd, stdout):
    """run cmd"""

    process = popen(cmd).read()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/test/test_snv.py:
--------------------------------------------------------------------------------

import unittest

from os import popen
from os import listdir

from commands import getstatusoutput

from os.path import isfile
from os.path import isdir

from garmire_SNV_calling.config import OUTPUT_ROOT

from garmire_SNV_calling.config import TYPE_VAR
from garmire_SNV_calling.config import FASTQ_PATH
from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN

from garmire_SNV_calling.config import JAVA
from garmire_SNV_calling.config import GATK_DIR
from garmire_SNV_calling.config import GATK_JAR
from garmire_SNV_calling.config import PICARD_DIR
from garmire_SNV_calling.config import PATH_STAR_SOFTWARE
from garmire_SNV_calling.config import SAMTOOLS
from garmire_SNV_calling.config import FREEBAYES

from fnmatch import fnmatch


class TestPackage(unittest.TestCase):
    """ """
    def test_output_root(self):
        """assert that OUTPUT_ROOT folder exists"""
        self.assertTrue(isdir(OUTPUT_ROOT))

    # def test_soft(self):
    #     """assert that .soft file exists"""
    #     self.assertTrue(isfile(SOFT_PATH))

    def test_ref_genome(self):
        """assert that ref genome file exists"""
        for typ in TYPE_VAR:
            self.assertTrue(isfile(TYPE_VAR[typ]['REF_GENOME']))

    # def test_annotation_path(self):
    #     """assert that STAR ref folder exists"""
    #     for typ in TYPE_VAR:
    #         self.assertTrue(isdir(pathsplit(
    #             TYPE_VAR[typ]['STAR_INDEX_PATH'])[0]))

    def test_gtf_file(self):
        """assert that GTF file exists"""
        for typ in TYPE_VAR:
            self.assertTrue(isfile(TYPE_VAR[typ]['ANNOTATION_PATH']))

    def test_dbsnp(self):
        """assert that dbsnp vcf file exists"""
        for typ in TYPE_VAR:
            self.assertTrue(isfile(TYPE_VAR[typ]['DBSNP']))

    def test_vcf_resources(self):
        """assert that the additional vcf files exist"""
        for typ in TYPE_VAR:
            for vcf in TYPE_VAR[typ]['VCF_RESOURCES']:
                self.assertTrue(isfile(vcf))

    def test_fastq_path(self):
        """assert that fastq path exists"""
        self.assertTrue(isdir(FASTQ_PATH))

    def test_fastq_path_not_empty(self):
        """assert that fastq path is not empty"""
        self.assertTrue(len(listdir(FASTQ_PATH)))

    def test_fastq_path_with_folders_with_fastqfile(self):
        """assert that fastq folder exists and that .fastq files are inside"""

        for fastq_folder in listdir(FASTQ_PATH):
            if isfile(FASTQ_PATH + fastq_folder):
                continue
            if PATTERN and not fnmatch(fastq_folder, PATTERN):
                continue

            folder = "{0}/{1}".format(FASTQ_PATH, fastq_folder)

            print 'testing if {0} is empty'.format(folder)
            self.assertTrue(filter(lambda fil: fnmatch(fil, '*.fastq'),
                                   listdir(folder)))

    def test_java(self):
        """assert that java is installed and >= 1.8"""
        res = getstatusoutput('{0} -version'.format(JAVA))[1]
        self.assertIsNotNone(res)

        version = res.split('"')[1].rsplit('.', 1)[0]
        self.assertTrue(float(version) >= 1.8)

    def test_GATK(self):
        """assert that GATK .jar file exists"""
        self.assertTrue(isfile('{0}/{1}'.format(GATK_DIR, GATK_JAR)))

    def test_freebayes(self):
        """assert that freebayes file exists"""
        self.assertTrue(isfile(FREEBAYES))

    def test_samtools(self):
        """assert that samtools file exists"""
        self.assertTrue(isfile(SAMTOOLS))

    def test_picard_tools(self):
        """assert that picard-tools .jar files exist"""
        self.assertTrue(isfile('{0}/picard.jar'.format(PICARD_DIR)))
        self.assertTrue(isfile('{0}/picard-lib.jar'.format(PICARD_DIR)))

    def test_STAR_aligner(self):
        """assert that STAR aligner bin exists and return version"""
        self.assertIsNotNone(popen('{0} --version'.format(PATH_STAR_SOFTWARE)))


if __name__ == "__main__":
    unittest.main()

--------------------------------------------------------------------------------
/garmire_SSrGE/linear_cross_validation.py:
--------------------------------------------------------------------------------
""" """

from sklearn.model_selection import KFold
from garmire_SSrGE.ssrge import SSrGE

from garmire_SSrGE.config import CROSS_VAL_NFOLD

import numpy as np


def debug():
    """
    #### DEBUG ####
    **** Test function ****
    """
    from garmire_SSrGE.examples import create_example_matrix_v1

    X, Y, W = create_example_matrix_v1()

    cross_val = LinearCrossVal(
        model='LASSO',
        SNV_mat=X,
        GE_mat=Y
    )

    path = cross_val.regularization_path('alpha', [0.01, 0.1, 0.2])

    return path


class LinearCrossVal():
    """
    Class to perform cross-validation
    """
    def __init__(self,
                 SNV_mat,
                 GE_mat,
                 n_folds=CROSS_VAL_NFOLD,
                 verbose=True,
                 **ssrge_params):
        """ """
        if GE_mat.shape[0] == SNV_mat.shape[0] and \
           GE_mat.shape[1] != SNV_mat.shape[1]:
            GE_mat = GE_mat.T

        self.SNV_mat = SNV_mat
        self.GE_mat = GE_mat
        self.verbose = verbose
        self.n_folds = n_folds

        self.ssrge_params = ssrge_params

        self.err_model_mean = None
        self.err_empty_mean = None
        self.nb_coefs_mean = None
        self.nb_model_mean = None
        self.intercept_mean = None
        self.regularization_value_list = None

    def regularization_path(self, param_name, value_list):
        """
        :param_name: str the name of the param to test
        :value_list: list(float)
        """
        self.err_model_mean = []
        self.err_empty_mean = []
        self.nb_coefs_mean = []
        self.nb_model_mean = []
        self.intercept_mean = []
        self.regularization_value_list = value_list

        for value in value_list:
            self.ssrge_params[param_name] = value

            (errs_models,
             errs_null_models,
             nb_coefs,
             nb_models,
             intercepts
             ) = self.fit()

            self.err_model_mean.append(errs_models)
            self.err_empty_mean.append(errs_null_models)
            self.nb_model_mean.append(nb_models)
            self.nb_coefs_mean.append(nb_coefs)
            self.intercept_mean.append(intercepts)

            if self.verbose:
                print('\nmean error model:', errs_models)
                print('mean error null model:', errs_null_models)
                print('mean number of models:', nb_models)
                print('mean number of eeSNVs:', nb_coefs)

        return self.err_model_mean

    def fit(self):
        """ """
        i = 0

        errs_models = []
        errs_null_models = []
        nb_coefs = []
        nb_models = []
        intercepts = []

        print('\n######## cross validation\n####parameters:{0}'\
              .format(self.ssrge_params))

        ssrge = SSrGE(**self.ssrge_params)

        kfold = KFold(n_splits=self.n_folds,
                      shuffle=True)

        for train, test in kfold.split(self.SNV_mat):
            i += 1
            print('\n## fold nb {0}'.format(i))

            X_train = self.SNV_mat[train]
            Y_train = self.GE_mat.T[train].T

            X_test = self.SNV_mat[test]
            Y_test = self.GE_mat.T[test].T

            ssrge.fit(X_train, Y_train)

            score, score_null = ssrge.score(X_test, Y_test)

            errs_models.append(score)
            errs_null_models.append(score_null)
            nb_coefs.append(len(ssrge.eeSNV_weight))
            nb_models.append(len(ssrge.intercepts))
            intercepts.append(np.mean(list(ssrge.intercepts.values())))

        return (np.mean(errs_models),
                np.mean(errs_null_models),
                np.mean(nb_coefs),
                np.mean(nb_models),
                np.mean(intercepts)
                )


if __name__ == '__main__':
    debug()

--------------------------------------------------------------------------------
/test/test_ssrge.py:
--------------------------------------------------------------------------------
import unittest


class TestPackage(unittest.TestCase):
    """ """
    def test_multiprocess(self):
        """ Test multiprocess procedure """
        from garmire_SSrGE.multiprocess_fitting import debug

        g_index, coefs, intercepts = debug()

        self.assertTrue(g_index)
        self.assertTrue(coefs)
        self.assertTrue(coefs[0])
        self.assertTrue(sum(intercepts))


    def test_ssrge(self):
        """test ssrge procedure"""
        from garmire_SSrGE.examples import create_example_matrix_v1
        from garmire_SSrGE.ssrge import SSrGE

        X, Y, W = create_example_matrix_v1()
        ssrge = SSrGE(alpha=0.01)

        ssrge.fit(X, Y)
        self.assertTrue(ssrge.eeSNV_weight)

        Xr = ssrge.transform(X)

        self.assertTrue(Xr.sum())
        self.assertTrue(Xr.shape[0] == X.shape[0])
        self.assertTrue(Xr.shape[1] < X.shape[1])

        snv_ranked = ssrge.rank_eeSNVs()

        self.assertTrue(snv_ranked)

        score = ssrge.score(X, Y)

        self.assertTrue(score[0] < score[1])

    def test_ssrge_elasticnet(self):
        """test ssrge procedure with elasticnet model"""
        from garmire_SSrGE.examples import create_example_matrix_v1
        from garmire_SSrGE.ssrge import SSrGE

        X, Y, W = create_example_matrix_v1()
        ssrge = SSrGE(alpha=0.01, model='ElasticNet')

        ssrge.fit(X, Y)
        self.assertTrue(ssrge.eeSNV_weight)
ssrge.transform(X)
55 | 
56 |         self.assertTrue(Xr.sum())
57 |         self.assertTrue(Xr.shape[0] == X.shape[0])
58 |         self.assertTrue(Xr.shape[1] <= X.shape[1])
59 | 
60 |         snv_ranked = ssrge.rank_eeSNVs()
61 | 
62 |         self.assertTrue(snv_ranked)
63 | 
64 |         score = ssrge.score(X, Y)
65 | 
66 |         self.assertTrue(score[0] < score[1])
67 | 
68 |     def test_ssrge_cnv(self):
69 |         """test ssrge procedure with cnv matrix"""
70 |         from garmire_SSrGE.examples import create_example_matrix_v3
71 |         from garmire_SSrGE.ssrge import SSrGE
72 | 
73 |         X, Y, C, W = create_example_matrix_v3()
74 |         ssrge = SSrGE(alpha=0.01)
75 | 
76 |         ssrge.fit(X, Y, C)
77 |         self.assertTrue(ssrge.eeSNV_weight)
78 | 
79 |         Xr = ssrge.transform(X)
80 | 
81 |         self.assertTrue(Xr.sum())
82 |         self.assertTrue(Xr.shape[0] == X.shape[0])
83 |         self.assertTrue(Xr.shape[1] < X.shape[1])
84 | 
85 |         snv_ranked = ssrge.rank_eeSNVs()
86 | 
87 |         self.assertTrue(snv_ranked)
88 | 
89 |         score = ssrge.score(X, Y)
90 | 
91 |         self.assertTrue(score[0] < score[1])
92 | 
93 |     def test_ssrge_rank_gene(self):
94 |         """test ssrge and rank genes and snvs"""
95 |         from garmire_SSrGE.examples import create_example_matrix_v2
96 |         from garmire_SSrGE.ssrge import SSrGE
97 | 
98 |         X, Y, gene_id_list, snv_id_list = create_example_matrix_v2()
99 |         ssrge = SSrGE(
100 |             snv_id_list=snv_id_list,
101 |             gene_id_list=gene_id_list,
102 |             nb_ranked_features=2,
103 |             alpha=0.01)
104 | 
105 |         ssrge.fit(X, Y)
106 |         self.assertTrue(ssrge.eeSNV_weight)
107 | 
108 |         Xr = ssrge.transform(X)
109 | 
110 |         self.assertTrue(Xr.sum())
111 |         self.assertTrue(Xr.shape[0] == X.shape[0])
112 |         self.assertTrue(Xr.shape[1] < X.shape[1])
113 | 
114 |         snv_ranked = ssrge.rank_eeSNVs()
115 | 
116 |         self.assertTrue(snv_ranked)
117 | 
118 |         self.assertTrue(len(ssrge.retained_snvs) == ssrge.nb_ranked_features)
119 |         self.assertTrue(len(ssrge.retained_genes) == ssrge.nb_ranked_features)
120 | 
121 |         score = ssrge.score(X, Y)
122 | 
123 |         self.assertTrue(score[0] < score[1])
124 | 
125 |         subgroup = ssrge.rank_features_for_a_subgroup(range(10))
126 | 
127 |         self.assertTrue(len(subgroup.gene_expr_distrib[gene_id_list[0]]) == 10)
128 |         self.assertTrue(subgroup.snv_weights_distrib)
129 |         self.assertTrue(subgroup.exp_snv_distrib_comp)
130 | 
131 |     def test_cross_validation(self):
132 |         """test cross validation procedure"""
133 | 
134 |         from garmire_SSrGE.linear_cross_validation import debug
135 | 
136 |         path = debug()
137 |         self.assertTrue(path)
138 | 
139 | 
140 | if __name__ == "__main__":
141 |     unittest.main()
142 | 
--------------------------------------------------------------------------------
/README_download_ncbi_rsa.md:
--------------------------------------------------------------------------------
1 | # Download SRA files from NCBI (GEO)
2 | 
3 | This module provides scripts to download and extract SRA files of high-throughput genomic data from NCBI (GEO portal), using an NCBI .soft file
4 | 
5 | 
6 | 
7 | # SRA project download using docker
8 | 
9 | ## Requirements
10 | * docker
11 | * root access (possibly required to run docker)
12 | * 13.8 GB of free disk space (docker image)
13 | 
14 | ## installation (local)
15 | 
16 | ```bash
17 | docker pull opoirion/ssrge
18 | mkdir <results_folder>/
19 | cd <results_folder>/
20 | PATHDATA=`pwd`
21 | ```
22 | 
23 | ## usage
24 | 
25 | The download part of the pipeline consists of 3 steps (the 4 alignment and SNV-calling steps are described in README_snv_calling.md):
26 | 
27 | ```bash
28 | # Download
29 | docker run --rm opoirion/ssrge download_soft_file -h
30 | docker run --rm opoirion/ssrge download_sra -h
31 | docker run --rm opoirion/ssrge extract_sra -h
32 | ```
33 | 
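All three commands read and write through a volume mounted on `/data/results/` inside the container, so they are typically run with `-v $PATHDATA:/data/results/:Z` (the `:Z` suffix is only needed on SELinux hosts and can be dropped otherwise). A quick sanity check that the image and the mount both work, reusing the `$PATHDATA` variable defined above:

```bash
docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_soft_file -h
```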
34 | ## example
35 | 
36 | Let's download and process 2 samples from GSE79457 under the project name test_n2
37 | 
38 | ```bash
39 | # download the soft file containing the metadata for GSE79457
40 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_soft_file -project_name test_n2 -soft_id GSE79457
41 | # download the sra files
42 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_sra -project_name test_n2 -max_nb_samples 2
43 | # extract the sra files
44 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge extract_sra -project_name test_n2
45 | # remove the sra files (optional)
46 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge rm_sra -project_name test_n2
47 | ```
48 | 
49 | # Installation from github (*not updated!!* => Use the docker image for now)
50 | 
51 | 
52 | ## Requirements
53 | * [python 2 (>=2.7)](https://www.python.org/download/releases/2.7.2/)
54 | * The only external software needed is [fastq-dump](http://ncbi.github.io/sra-tools/install_config.html), used to extract the .sra files. The path to the executable must be given in the config file or passed as an argument
55 | * A folder with the name of the project must be created, and the absolute path to that folder must be given in the config file or passed as an argument
56 | * The .soft file related to the project must be downloaded from the [NCBI GEO](http://www.ncbi.nlm.nih.gov/geo/) website (and put into the project folder, by default)
57 | * link to a dataset-description GEO webpage: [example](http://ftp.ncbi.nlm.nih.gov/geo/series/GSE85nnn/GSE85183/soft/)
58 | * An example soft file is also available in the ./example/ folder of the repository (the default folder)
59 | 
60 | ## configuration
61 | * all global variables can be set in the file ./garmire_download_ncbi_sra/config.py or passed as arguments
62 | * a description of the arguments can be displayed at any time by invoking the -h (or -H) option, or found in the config file:
63 | 
64 | ```text
65 | -PROJECT_NAME      The name of the project (defining the name of the folder)
66 | -PATH_DATA         The absolute path where the project will be created and the SRA files downloaded and extracted
67 | -PATH_SOFT         path to the .soft file (with the corresponding ftp addresses for the .sra files)
68 | -NB_THREADS        number of parallel download threads to use for the .sra files (default 2)
69 | -FASTQ_DUMP        path to the fastq-dump software
70 | -FASTQ_DUMP_OPTION options used to extract the .sra files (with fastq-dump); "--split-3 -B" is the default and it is strongly recommended to keep it
71 | -LIMIT             maximum number of .sra files to be downloaded (default None)
72 | ```
73 | 
74 | ## usage
75 | move to the folder of the git project (https://github.com/lanagarmire/SSrGE.git)
76 | 
77 | ```bash
78 | cd SSrGE
79 | ```
80 | 
81 | * Set the global variables in the config file (garmire_download_ncbi_sra/config.py) or pass them each time as arguments
82 | * [optional] Run the tests:
83 | 
84 | ```bash
85 | python ./test/test_download.py -v
86 | ```
87 | 
88 | * download and extract data (by default, download the .sra files from the example .soft file):
89 | 
90 | ```bash
91 | python garmire_download_ncbi_sra/download_data.py
92 | ```
93 | * download and extract data (with options passed on the command line):
94 | 
95 | ```bash
96 | python garmire_download_ncbi_sra/download_data.py -NB_THREADS 5 -PATH_SOFT tutut/... 
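# the flag names mirror the variables of the config file; e.g., a custom
# fastq-dump binary could be passed the same way (a sketch, assuming
# extract_data.py exposes the config variables as command-line flags):
# python garmire_download_ncbi_sra/extract_data.py -FASTQ_DUMP /usr/local/bin/fastq-dump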
97 | ``` 98 | * extract SRA file 99 | 100 | ```bash 101 | python garmire_download_ncbi_sra/extract_data.py 102 | ``` 103 | * remove SRA file 104 | 105 | ```bash 106 | python garmire_download_ncbi_sra/remove_sra.py 107 | ``` 108 | 109 | ## contact and credentials 110 | * Developer: Olivier Poirion (PhD) 111 | * contact: opoirion@hawaii.edu -------------------------------------------------------------------------------- /garmire_SNV_calling/parse_10x_bam_file_to_fastq_files.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | 3 | from os import mkdir 4 | from os.path import isdir 5 | from os.path import isfile 6 | 7 | from sys import stdout 8 | 9 | from os import popen 10 | 11 | from collections import defaultdict 12 | 13 | import cPickle 14 | 15 | from multiprocessing import Pool 16 | 17 | from glob import glob 18 | 19 | from os import popen 20 | 21 | from os import remove 22 | 23 | 24 | ######################## VARIABLE ############################ 25 | PATH_FASTQ = '/mnt/nas_rna2/opoirion/10x_data/fastq/' 26 | PATH_BAM = '/mnt/nas_rna2/opoirion/10x_data/neurons_900_possorted_genome_bam.bam' 27 | 28 | # maximum of reads for a cells 29 | MAX_READS = None 30 | # number of cells you want 31 | NB_CELLS = 1000 32 | FASTQ_THREAD = 10 33 | ############################################################### 34 | 35 | 36 | def main(): 37 | """ 38 | """ 39 | stats, cell_list = get_cell_stats() 40 | write_bam_files(cell_list) 41 | bam_to_fastq(cell_list) 42 | 43 | 44 | def bam_to_fastq(cell_list): 45 | """ 46 | """ 47 | print('converting all the bam files into fastq files...') 48 | pool = Pool(FASTQ_THREAD) 49 | pool.map(_bam_to_fastq, cell_list) 50 | 51 | def _bam_to_fastq(cell): 52 | """ 53 | """ 54 | bam_files = glob('{0}/{1}/*.bam'.format(PATH_FASTQ, cell)) 55 | 56 | if not bam_files: 57 | return 58 | 59 | for bam_file in bam_files: 60 | cmd = 'bamToFastq -i {0} -fq {1}/{2}/{2}.fastq'.format(bam_file, PATH_FASTQ, cell) 61 | popen(cmd).read() 62 | remove(bam_file) 63 | 64 | def write_bam_files(cell_list): 65 | """ 66 | """ 67 | print('\n#### SECOND PASS ####') 68 | cmd = "samtools idxstats {0} | awk -F '\t' '{{s+=$3+$4}}END{{print s}}'".format(PATH_BAM) 69 | nb_reads = int(popen(cmd).read().strip('\n')) 70 | 71 | if MAX_READS is None: 72 | max_reads = nb_reads 73 | else: 74 | max_reads = MAX_READS 75 | 76 | file_dict = {} 77 | 78 | f_raw = pysam.AlignmentFile(PATH_BAM, 'rb') 79 | header = f_raw.header 80 | 81 | i = 0 82 | 83 | while i < max_reads: 84 | try: 85 | reads = f_raw.next() 86 | except Exception: 87 | break 88 | 89 | try: 90 | bc_tags = reads.get_tag('CB') 91 | except KeyError: 92 | continue 93 | 94 | i += 1 95 | 96 | stdout.write('\r nb reads {0} / {1}'.format(i, nb_reads)) 97 | stdout.flush() 98 | 99 | if bc_tags not in cell_list: 100 | continue 101 | 102 | if bc_tags not in file_dict: 103 | folder = '{0}/{1}'.format(PATH_FASTQ, bc_tags) 104 | 105 | if not isdir(folder): 106 | mkdir(folder) 107 | 108 | file_dict[bc_tags] = pysam.AlignmentFile( 109 | '{0}/bc_tags.bam'.format(folder), 'wb', header=header) 110 | 111 | file_dict[bc_tags].write(reads) 112 | 113 | f_raw.close() 114 | 115 | def get_cell_stats(): 116 | """ 117 | """ 118 | path_pickle = '{0}/cell_stats.pickle'.format(PATH_FASTQ) 119 | 120 | if isfile(path_pickle): 121 | stats_dict = cPickle.load(open(path_pickle)) 122 | else: 123 | stats_dict = _get_cell_stats() 124 | cPickle.dump(stats_dict, open(path_pickle, 'w')) 125 | 126 | cells, count = zip(*sorted(stats_dict.items(), 
key=lambda x:x[1], reverse=True)[:NB_CELLS]) 127 | 128 | return stats_dict, set(cells) 129 | 130 | def _get_cell_stats(): 131 | """ 132 | """ 133 | print('#### FIRST PASS ####') 134 | cmd = "samtools idxstats {0} | awk -F '\t' '{{s+=$3+$4}}END{{print s}}'".format(PATH_BAM) 135 | nb_reads = int(popen(cmd).read().strip('\n')) 136 | 137 | if MAX_READS is None: 138 | max_reads = nb_reads 139 | else: 140 | max_reads = MAX_READS 141 | 142 | stats_dict = defaultdict(int) 143 | 144 | f_raw = pysam.AlignmentFile(PATH_BAM, 'rb') 145 | 146 | i = 0 147 | 148 | while i < max_reads: 149 | try: 150 | reads = f_raw.next() 151 | except Exception: 152 | break 153 | 154 | try: 155 | bc_tags = reads.get_tag('CB') 156 | except KeyError: 157 | continue 158 | 159 | stats_dict[bc_tags] += 1 160 | 161 | i += 1 162 | 163 | stdout.write('\r nb reads {0} / {1}'.format(i, nb_reads)) 164 | stdout.flush() 165 | 166 | f_raw.close() 167 | 168 | return stats_dict 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /garmire_SNV_calling/process_annotate_snv.py: -------------------------------------------------------------------------------- 1 | 2 | """ process one fastqc report""" 3 | 4 | from os import popen 5 | from os import listdir 6 | 7 | from os.path import isdir 8 | from os.path import isfile 9 | from subprocess import Popen 10 | 11 | from distutils.dir_util import mkpath 12 | from shutil import rmtree 13 | from shutil import copyfile 14 | from shutil import move 15 | from sys import stdout as STDOUT 16 | from sys import argv 17 | from random import randint 18 | from random import random 19 | from time import sleep 20 | from time import time 21 | from fnmatch import fnmatch 22 | 23 | from garmire_SNV_calling.config import OUTPUT_PATH_SNV 24 | from garmire_SNV_calling.config import SNPEFF 25 | from garmire_SNV_calling.config import JAVA 26 | from garmire_SNV_calling.config import SNPEFF_DB 27 | 28 | from garmire_SNV_calling.process_multiple_generic import MPI 29 | 30 | ############ VARIABLES ############################################ 31 | SRR_TO_PROCESS = "" # for debug purpose 32 | PROCESS_ID = randint(0, 1000000) 33 | INPUT_PATH = OUTPUT_PATH_SNV + '/data/' 34 | 35 | if "--specific_folder" in argv: 36 | SRR_TO_PROCESS = argv[ 37 | argv.index("--specific_folder") + 1] 38 | if "--process_id" in argv: 39 | PROCESS_ID = int(argv[ 40 | argv.index("--process_id") + 1]) 41 | if "--nb_threads" in argv: 42 | NB_THREADS = int(argv[ 43 | argv.index("--nb_threads") + 1]) 44 | else: 45 | NB_THREADS = None 46 | ################################################################### 47 | 48 | 49 | def main(): 50 | if NB_THREADS: 51 | input_list = listdir(INPUT_PATH) 52 | mpi = MPI(input_list=input_list, 53 | ProcessClass=ProcessAnnotateSNV, 54 | nb_processes=NB_THREADS) 55 | mpi.run() 56 | else: 57 | process_annotate_snv = ProcessAnnotateSNV(id=PROCESS_ID) 58 | process_annotate_snv.process(SRR_TO_PROCESS) 59 | 60 | class ProcessAnnotateSNV(): 61 | """ 62 | Process SNV annotation using snpEff software 63 | """ 64 | def __init__(self, 65 | path_to_data=OUTPUT_PATH_SNV, 66 | id="1", 67 | clean_tmp=True, 68 | ): 69 | self.path_to_data = path_to_data 70 | self.time_start = None 71 | self.id = str(id) 72 | self.stdout = None 73 | 74 | def process(self, srr_to_process=SRR_TO_PROCESS): 75 | """ 76 | process one fastq file using fastqc 77 | """ 78 | tmppath = self.path_to_data + "/tmp/" + self.id 79 | inputpath = self.path_to_data + "/data/" 80 | 
input_file = '{0}/{1}/snv_filtered.vcf'\ 81 | .format(inputpath, srr_to_process) 82 | tmp_file = '{0}/snv_filtered_annotated.vcf'\ 83 | .format(tmppath) 84 | output_file = '{0}/{1}/snv_filtered_annotated.vcf'\ 85 | .format(inputpath, srr_to_process) 86 | 87 | if not isdir(inputpath): 88 | print '{0} is not a folder!'.format( 89 | self.path_to_data + srr_to_process) 90 | return 91 | 92 | if not isfile(input_file): 93 | print 'no input file: {0}!'.format( 94 | input_file) 95 | return 96 | 97 | if isfile(output_file): 98 | print '{0} already exists!'.format( 99 | output_file) 100 | return 101 | 102 | sleep(random()) 103 | if not isdir(tmppath): 104 | mkpath(tmppath) 105 | 106 | popen("rm {0}/*".format(tmppath)).read() 107 | 108 | self.stdout = open(tmppath + '/stdout.log', 'w') 109 | 110 | cmd = "{0} -jar {1} eff -v {2} {3} -noStats > {4}"\ 111 | .format(JAVA, SNPEFF, SNPEFF_DB, input_file, tmp_file) 112 | 113 | self._exec_cmd(cmd) 114 | self._exec_cmd("mv {0} {1}"\ 115 | .format(tmp_file, output_file)) 116 | self._exec_cmd("mv {0}/stdout.log {1}/{2}/snv_annotation.log"\ 117 | .format(tmppath, inputpath, srr_to_process)) 118 | self.stdout.close() 119 | popen('rm -r {0}/*'.format(tmppath)) 120 | 121 | def _exec_cmd(self, cmd): 122 | """ execute cmd """ 123 | process = Popen(cmd, 124 | stdout=self.stdout, 125 | stderr=self.stdout, 126 | shell=True) 127 | 128 | process.communicate() 129 | if process.returncode: 130 | raise Exception('{0} raise non 0 return code!\n'\ 131 | .format(cmd)) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /garmire_SNV_calling/deploy_BSseeker.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | from multiprocessing import Pool 4 | 5 | from os import popen 6 | 7 | from os import listdir 8 | from os import mkdir 9 | from os.path import isdir 10 | from os.path import isfile 11 | from os.path import getsize 12 | from os.path import split as pathsplit 13 | 14 | import re 15 | 16 | from glob import glob 17 | 18 | from time import sleep 19 | from random import random 20 | 21 | from fnmatch import fnmatch 22 | 23 | from distutils.dir_util import mkpath 24 | 25 | from garmire_SNV_calling.config import FASTQ_PATH 26 | from garmire_SNV_calling.config import PATH_OUTPUT 27 | from garmire_SNV_calling.config import REF_GENOME 28 | from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN 29 | from garmire_SNV_calling.config import BSSEEKER2_REP 30 | from garmire_SNV_calling.config import PYTHON 31 | from garmire_SNV_calling.config import BOWTIE_REP 32 | from garmire_SNV_calling.config import DO_TRIMGALORE 33 | from garmire_SNV_calling.config import TRIMGALORE_REP 34 | 35 | 36 | ################ VARIABLE ################################## 37 | 38 | OUTPUT_PATH = PATH_OUTPUT + '/BSseeker/' 39 | PROCESS_THREADS = 2 40 | BISMARK_OPTION = '' 41 | REF_GENOME_PATH = pathsplit(REF_GENOME)[0] 42 | ############################################################ 43 | 44 | 45 | sleep(2 * random()) 46 | if not isdir(OUTPUT_PATH): 47 | mkpath(OUTPUT_PATH) 48 | 49 | 50 | def main(): 51 | pool = Pool(PROCESS_THREADS) 52 | pool.map(process_one_file, listdir(FASTQ_PATH)) 53 | 54 | def process_one_file(fil): 55 | """ """ 56 | print(fil) 57 | if isfile(FASTQ_PATH + fil): 58 | return False 59 | 60 | if PATTERN and not fnmatch(fil, PATTERN): 61 | return False 62 | 63 | print("====> file to be aligned:", fil) 64 | 65 | if not isdir(OUTPUT_PATH 
+ fil):
66 |         mkdir(OUTPUT_PATH + fil)
67 | 
68 |     bam_file_name = glob(OUTPUT_PATH + fil + '/*.bam')
69 | 
70 |     if bam_file_name \
71 |        and getsize(bam_file_name[0]):
72 |         print('bam file result already exists for:{0}\nskipping...'\
73 |               .format(bam_file_name[0]))
74 |         return False
75 | 
76 |     fastq_str = ""
77 | 
78 |     fastq_files = list(set(glob(FASTQ_PATH + fil + '/*.fastq')))
79 |     print('fastq files found: {0}'.format(fastq_files))
80 | 
81 |     stdout = open(OUTPUT_PATH + fil + "/log.out", 'w')
82 | 
83 |     if len(fastq_files) > 2:
84 |         print('too many fastq files!')
85 |         return False
86 | 
87 |     elif len(fastq_files) == 2:
88 |         fastq_1 = [fastq for fastq in fastq_files
89 |                    if re.match('.+_1\.fastq', fastq)]
90 | 
91 |         assert(fastq_1)
92 | 
93 |         fastq_1 = fastq_1[0]
94 | 
95 |         fastq_2 = [fastq for fastq in fastq_files
96 |                    if re.match('.+_2\.fastq', fastq)]
97 |         assert(fastq_2)
98 | 
99 |         fastq_2 = fastq_2[0]
100 | 
101 |         if DO_TRIMGALORE:
102 |             cmd_trim = "{0}/trim_galore {1} {2} --paired --no_report_file -o {3}".format(
103 |                 TRIMGALORE_REP, fastq_1, fastq_2, FASTQ_PATH + fil)
104 |             _run_cmd(cmd_trim, stdout)
105 | 
106 |             fastq_1 = '{0}_val_1.fq'.format(fastq_1.rsplit('.', 1)[0])
107 |             fastq_2 = '{0}_val_2.fq'.format(fastq_2.rsplit('.', 1)[0])
108 | 
109 |         fastq_str = ' -1 {0} -2 {1} '.format(fastq_1, fastq_2)
110 | 
111 |     elif len(fastq_files) == 1:
112 |         fastq_file = fastq_files[0]  # defined outside the if: it is needed even when DO_TRIMGALORE is False
113 |         if DO_TRIMGALORE:
114 |             cmd_trim = "{0}/trim_galore {1} --no_report_file -o {2}".format(
115 |                 TRIMGALORE_REP, fastq_file, FASTQ_PATH + fil)
116 |             _run_cmd(cmd_trim, stdout)
117 |             fastq_file = '{0}_trimmed.fq'.format(fastq_file.rsplit('.', 1)[0])
118 | 
119 |         fastq_str = ' -i {0} '.format(fastq_file)
120 | 
121 |     if not fastq_str:
122 |         print('no fastq file found for:{0}!\nskipping'.format(fil))
123 |         return False
124 | 
125 |     cmd = "{0} {1}/bs_seeker2-align.py -g {2}"\
126 |           " --aligner=bowtie2 -p {3} --db {4} -r {5}"\
127 |           .format(PYTHON,
128 |                   BSSEEKER2_REP,
129 |                   REF_GENOME,
130 |                   BOWTIE_REP,
131 |                   REF_GENOME_PATH,
132 |                   fastq_str
133 |           )
134 | 
135 |     _run_cmd(cmd, stdout)
136 |     _run_cmd('mv {0}/*.bam {1}/; mv {0}/*log {1}/ '.format(
137 |         FASTQ_PATH + fil, OUTPUT_PATH + fil), stdout)
138 | 
139 |     return True
140 | 
141 | 
142 | def _run_cmd(cmd, *args):
143 |     """run cmd (NB: the stdout handle passed by callers is currently ignored)"""
144 |     popen(cmd).read()
145 | 
146 | 
147 | if __name__ == "__main__":
148 |     main()
149 | 
--------------------------------------------------------------------------------
/garmire_SNV_calling/process_fastqc_report.py:
--------------------------------------------------------------------------------
1 | 
2 | """ process one fastqc report"""
3 | 
4 | from os import popen
5 | from os import listdir
6 | 
7 | from os.path import isdir
8 | from os.path import isfile
9 | from os.path import getsize
10 | from subprocess import Popen
11 | 
12 | from distutils.dir_util import mkpath
13 | from shutil import rmtree
14 | from shutil import copyfile
15 | from shutil import move
16 | from sys import stdout as STDOUT
17 | from sys import argv
18 | from random import randint
19 | from random import random
20 | from time import sleep
21 | from time import time
22 | from fnmatch import fnmatch
23 | 
24 | from garmire_SNV_calling.config import FASTQC
25 | from garmire_SNV_calling.config import PATH_OUTPUT
26 | from garmire_SNV_calling.config import FASTQ_PATH
27 | 
28 | from garmire_SNV_calling.process_multiple_generic import MPI
29 | 
30 | ############ VARIABLES ############################################
31 | SRR_TO_PROCESS = "" # for debug purpose
32 | PROCESS_ID = randint(0, 
1000000) 33 | 34 | if "--specific_folder" in argv: 35 | SRR_TO_PROCESS = argv[ 36 | argv.index("--specific_folder") + 1] 37 | if "--process_id" in argv: 38 | PROCESS_ID = int(argv[ 39 | argv.index("--process_id") + 1]) 40 | if "--nb_threads" in argv: 41 | NB_THREADS = int(argv[ 42 | argv.index("--nb_threads") + 1]) 43 | else: 44 | NB_THREADS = None 45 | ################################################################### 46 | 47 | 48 | def main(): 49 | if NB_THREADS: 50 | input_list = listdir(FASTQ_PATH) 51 | mpi = MPI(input_list=input_list, 52 | ProcessClass=ProcessFastqC, 53 | nb_processes=NB_THREADS) 54 | mpi.run() 55 | else: 56 | process_fastqc = ProcessFastqC(id=PROCESS_ID) 57 | process_fastqc.process(SRR_TO_PROCESS) 58 | 59 | class ProcessFastqC(): 60 | """ Process Fastqc report""" 61 | def __init__(self, 62 | path_to_data=PATH_OUTPUT, 63 | id="1", 64 | clean_tmp=True, 65 | ): 66 | self.output_path = PATH_OUTPUT + '/fastqc/' 67 | self.path_to_data = path_to_data 68 | self.time_start = None 69 | self.id = str(id) 70 | self.stdout = None 71 | 72 | def process(self, srr_to_process=SRR_TO_PROCESS): 73 | """ 74 | process one fastq file using fastqc 75 | """ 76 | tmppath = self.output_path + "/tmp/" + self.id 77 | outpath = self.output_path + "/data/" 78 | 79 | if not isdir(FASTQ_PATH + srr_to_process): 80 | print '{0} is not a folder!'.format( 81 | FASTQ_PATH + srr_to_process) 82 | return 83 | 84 | if isdir("{1}/{0}_fastqc"\ 85 | .format(srr_to_process, outpath)): 86 | print '{0} output already exists'.format( 87 | "{1}/{0}_fastqc"\ 88 | .format(srr_to_process, outpath)) 89 | return 90 | 91 | sleep(random()) 92 | if not isdir(tmppath): 93 | mkpath(tmppath) 94 | if not isdir(outpath): 95 | mkpath(outpath) 96 | 97 | popen("rm {0}/*".format(tmppath)).read() 98 | path_fastq = "" 99 | 100 | for fil in listdir(FASTQ_PATH + srr_to_process): 101 | if fnmatch(fil, '*.fastq'): 102 | path_fastq = '{0}/{1}/{2}'.format(FASTQ_PATH, 103 | srr_to_process, 104 | fil) 105 | fil = fil.rsplit('.', 1)[0] 106 | break 107 | if not path_fastq: 108 | print 'No fastq file for :{0}'.format(path_fastq) 109 | return 110 | 111 | self.stdout = open(tmppath + '/stdout.log', 'w') 112 | 113 | cmd = "{0} {1} -o {2} -d {2} --extract"\ 114 | .format(FASTQC, path_fastq, tmppath) 115 | 116 | self._exec_cmd(cmd) 117 | self._exec_cmd("mv {0}/{1}_fastqc {3}/{2}_fastqc"\ 118 | .format(tmppath, fil, srr_to_process, outpath)) 119 | self.stdout.close() 120 | popen('rm -r {0}/*'.format(tmppath)) 121 | 122 | def _exec_cmd(self, cmd): 123 | """ execute cmd """ 124 | process = Popen(cmd, 125 | stdout=self.stdout, 126 | stderr=self.stdout, 127 | shell=True) 128 | 129 | process.communicate() 130 | if process.returncode: 131 | raise Exception('{0} raise non 0 return code!\n'\ 132 | .format(cmd)) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | -------------------------------------------------------------------------------- /garmire_SNV_calling/process_multiple_snv.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python
2 | 
3 | """ process multiple bam files with the SNV callers"""
4 | 
5 | from multiprocessing import Process
6 | from multiprocessing import Queue
7 | 
8 | from time import sleep
9 | from os import listdir
10 | from os.path import isfile
11 | from os.path import isdir
12 | from shutil import rmtree as rmdir
13 | 
14 | from garmire_SNV_calling.process_snv_GATK import ProcessGATKSNV
15 | from garmire_SNV_calling.process_freebayes import ProcessFreebayesCaller
16 | 
17 | from garmire_SNV_calling.config import PATH_OUTPUT
18 | 
19 | from garmire_SNV_calling.config import OUTPUT_PATH_GATK
20 | from garmire_SNV_calling.config import OUTPUT_PATH_FREEBAYES
21 | 
22 | from sys import argv
23 | 
24 | 
25 | ######## VARIABLE ##############################
26 | CLEANING_MODE = True
27 | 
28 | 
29 | if '--freebayes' in argv or '--do_both_callers' in argv:
30 |     SNVCLASS = ProcessFreebayesCaller
31 |     OUTPUT_PATH_SNV = OUTPUT_PATH_FREEBAYES
32 |     print('freebayes SNV caller used')  # messages were swapped between the two branches
33 | else:
34 |     SNVCLASS = ProcessGATKSNV
35 |     print('GATK SNV caller used. To use freebayes, add the --freebayes option')
36 |     OUTPUT_PATH_SNV = OUTPUT_PATH_GATK
37 | 
38 | if '--limit' in argv:
39 |     LIMIT = int(argv[argv.index('--limit') + 1])
40 | else:
41 |     LIMIT = None
42 | 
43 | if "--nb_processes" in argv:
44 |     NB_PROCESS = int(argv[  # int() instead of eval(): safer and sufficient here
45 |         argv.index("--nb_processes") + 1])
46 | else:
47 |     from garmire_SNV_calling.config import NB_PROCESS_SNV as NB_PROCESS
48 | ################################################
49 | 
50 | 
51 | def main():
52 |     print("launching SNV calling on {0} processes"\
53 |           .format(NB_PROCESS))
54 | 
55 |     mp_analysis = Mp_Analysis()
56 |     mp_analysis.run()
57 | 
58 | 
59 | class Mp_Analysis():
60 |     def __init__(self):
61 |         """ """
62 | 
63 |         self.mp_queue = Queue()
64 | 
65 |         output_star = listdir(PATH_OUTPUT + "star/")
66 | 
67 |         if LIMIT:
68 |             output_star = output_star[:LIMIT]
69 | 
70 |         for fil in output_star:
71 |             if not isfile(PATH_OUTPUT + "star/" + fil + \
72 |                           "/Aligned.sortedByCoord.out.bam"):
73 |                 print('no star bam file for {0} skipping'.format(fil))
74 | 
75 |                 if isdir(PATH_OUTPUT + "star/" + fil) and CLEANING_MODE:
76 |                     rmdir(PATH_OUTPUT + "star/" + fil)
77 |                 continue
78 | 
79 |             if isfile("{0}/data/{1}/snv_filtered.vcf"\
80 |                       .format(OUTPUT_PATH_SNV, fil)):
81 |                 print('VCF file output already exists for {0} skipping...'\
82 |                       .format(fil))
83 |                 continue
84 | 
85 |             print("file to be processed:", fil)
86 |             self.mp_queue.put(fil)
87 | 
88 |         print("\n #### now launching multiprocessing analysis #### \n")
89 | 
90 |         self.processes = [TrSNVMultiprocessing(self.mp_queue, id=i)
91 |                           for i in range(NB_PROCESS)]
92 |     def _run(self):
93 |         for p in self.processes:
94 |             p.start()
95 | 
96 |         while self.mp_queue.qsize():
97 |             for p in self.processes:
98 |                 if p.exitcode:
99 |                     raise KeyboardInterrupt
100 |             sleep(1)
101 | 
102 |     def run(self):
103 |         try:
104 |             self._run()
105 | 
106 |         except KeyboardInterrupt:
107 |             for p in self.processes:
108 |                 p.terminate()
109 | 
110 | 
111 | class TrSNVMultiprocessing(Process):
112 |     """
113 |     Launch and control several instances of the SNV process
114 |     """
115 |     def __init__(self, input_queue, id):
116 |         self.input_queue = input_queue
117 |         self.id = id
118 |         Process.__init__(self)
119 |         self.process_snv = SNVCLASS(id=self.id)
120 | 
121 |     def run(self):
122 |         while self.input_queue.qsize():
123 |             try:
124 |                 patient = self.input_queue.get(True, 0.2)
125 |             except Exception as e:
126 |                 print("exception:{0}".format(e))
127 |                 continue
128 |             else:
129 |                 print("processing for file {0} with id {1}"\
130 |                       .format(patient, self.id))
131 | 
132 |                 if '--do_both_callers' in argv:
133 |                     error = self.process_snv.process_ALL_callers(patient)
134 |                 else:
135 |                     error = self.process_snv.process(patient)
136 | 
137 |                 if error is not None:
138 |                     print('error {1} found for patient: {0}'.format(patient, error))
139 | 
140 | 
141 | if __name__ == "__main__":
142 |     main()
143 | 
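# Example invocations of this script (a sketch: the flags are the ones parsed
# above, everything else comes from the config file):
#
#   python garmire_SNV_calling/process_multiple_snv.py                     # GATK caller
#   python garmire_SNV_calling/process_multiple_snv.py --freebayes        # freebayes caller
#   python garmire_SNV_calling/process_multiple_snv.py --do_both_callers  # run both callers
#   python garmire_SNV_calling/process_multiple_snv.py --limit 10 --nb_processes 4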
--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/download_data.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | 
3 | """
4 | download data from NCBI according to GEO accession number
5 | """
6 | 
7 | from os.path import isdir
8 | from os.path import isfile
9 | from os import popen
10 | 
11 | from os import mkdir
12 | 
13 | import urllib2
14 | 
15 | from distutils.dir_util import mkpath
16 | 
17 | import json
18 | 
19 | from garmire_download_ncbi_sra.config import PATH_DATA
20 | from garmire_download_ncbi_sra.config import NB_THREADS
21 | from garmire_download_ncbi_sra.config import LIMIT
22 | 
23 | from garmire_SNV_calling.bash_utils import exec_cmd
24 | 
25 | from urllib2 import URLError
26 | 
27 | import re
28 | 
29 | from multiprocessing.pool import ThreadPool
30 | 
31 | from time import sleep
32 | 
33 | 
34 | ############ VARIABLES ############
35 | PATH_SEQ = PATH_DATA + '/fastq/'
36 | 
37 | if not isdir(PATH_SEQ):
38 |     mkdir(PATH_SEQ)
39 | ###################################
40 | 
41 | 
42 | def main():
43 |     download_data()
44 | 
45 | def _download_old(url):
46 |     """(legacy, unused) download one sra file via the ByExp ftp layout"""
47 |     gsm, address = url
48 | 
49 |     try:
50 |         srx = address.rsplit('/', 1)[-1]
51 |         url = urllib2.urlopen(address).read().split()
52 |         srr = url[-1]
53 | 
54 |         srr_url = "{0}/{1}/{1}.sra".format(address, srr)
55 |         f_name = "{0}{1}__{2}__{3}.sra".format(PATH_SEQ,
56 |                                                gsm,
57 |                                                srx,
58 |                                                srr)
59 |     except Exception as e:
60 |         print('error with SRX {0}!!!'.format(address))
61 |         return "{1} {0}".format(str(e), address)
62 | 
63 |     if isfile(f_name):
64 |         print("{0} already exists continue...".format(f_name))
65 |         return "{0} already exists continue...".format(f_name)
66 | 
67 |     try:
68 |         print('downloading {0} to {1}...'.format(srr_url, f_name))
69 |         popen("wget -O {0} {1} --no-verbose".format(
70 |             f_name,
71 |             srr_url)).read()
72 |         print('{0} successfully downloaded'.format(f_name))
73 |         return
74 | 
75 |     except Exception as e:
76 |         print('error while downloading {0}!!!'.format(address))
77 |         return "{1} {0}\n".format(str(e), address)
78 | 
79 | 
80 | 
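# For reference, a sketch of the metadata.json structure consumed by
# download_data()/get_urls() below (written by download_soft_file.py; the GSM
# id and field values here are hypothetical):
#
# {
#     "GSM2052015": {
#         "GSE": "GSM2052015",
#         "organism_ch1": ["Homo sapiens"],
#         "organism_code": "HUMAN",
#         "SRA": ["https://www.ncbi.nlm.nih.gov/sra?term=SRX1628532"]
#     }
# }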
81 | def _download(data, verbose=True):
82 |     """ """
83 |     gsm, url_address = data
84 | 
85 |     waiting_list = [10, 20, 30]
86 | 
87 |     f_name = "{0}/{1}.sra".format(PATH_SEQ, gsm)
88 | 
89 |     if isfile('{0}/{1}.download_successfull.log'.format(PATH_SEQ, gsm)):  # per-sample marker (a single shared log file would skip every sample after the first)
90 |         msg = 'file {0} already downloaded. skipping...'.format(f_name)
91 |         print(msg)
92 |         return msg
93 |     while True:
94 |         try:
95 |             url = urllib2.urlopen(url_address).read()
96 |         except URLError as e:
97 |             if waiting_list:
98 |                 sleep_time = waiting_list.pop()
99 |                 print('error when downloading: {1} sleeping {0} s...'.format(sleep_time, e))
100 |                 sleep(sleep_time)
101 |             else:
102 |                 raise e
103 |         else:
104 |             break
105 | 
106 |     srr = re.findall('run=(?P<srr>SRR[0-9]+)', url)[0]  # named group restored: '(?PSRR...)' is a regex syntax error
107 | 
108 |     srr_url = "ftp://ftp-trace.ncbi.nlm.nih.gov"\
109 |               "/sra/sra-instant/reads/ByRun/sra/SRR/{0}/{1}/{1}.sra".format(
110 |                   srr[0:6], srr)
111 | 
112 |     print('downloading: {0}'.format(srr_url))
113 | 
114 |     if verbose:
115 |         verb = ''
116 |     else:
117 |         verb = '--no-verbose'
118 |     cmd = "wget {2} -O {0} {1} ".format(f_name, srr_url, verb)
119 |     print('launching: {0}'.format(cmd))
120 | 
121 |     exec_cmd(cmd)
122 | 
123 |     msg = '{0} successfully downloaded'.format(f_name)
124 |     print(msg)
125 | 
126 |     f_log = open('{0}/{1}.download_successfull.log'.format(PATH_SEQ, gsm), 'w')
127 |     f_log.write('download complete')
128 | 
129 |     return msg
130 | 
131 | def download_data():
132 |     """download dataset from ncbi """
133 | 
134 |     urls = get_urls()
135 | 
136 |     if LIMIT:
137 |         urls = urls[:LIMIT]
138 | 
139 |     if not isdir(PATH_SEQ):
140 |         mkpath(PATH_SEQ)
141 | 
142 |     f_error = open(PATH_DATA + "/error_log.txt", "w")
143 | 
144 |     thread_pool = ThreadPool(processes=NB_THREADS)
145 | 
146 |     res = thread_pool.map(_download, urls)
147 | 
148 |     print("######## errors found:")
149 |     for err in res:
150 |         if err:
151 |             print(err)
152 |             f_error.write('{0}\n'.format(err))
153 | 
154 | def get_urls():
155 |     """
156 |     get download addresses as GSM id according to the following template:
157 |     169. TuMP2-10b
158 |     Organism: Mus musculus
159 |     Source name: mouse pancreatic tumor
160 |     Platform: GPL15907 Series: GSE51372
161 |     FTP download: SRA SRX364871
162 |     ftp://ftp-trace.ncbi.nlm.nih.gov/
163 |     sra/sra-instant/reads/ByExp/sra/SRX/SRX364/SRX364871/
164 |     Sample Accession: GSM1243834 ID: 301243834
165 |     """
166 |     f_meta = open('{0}/metadata.json'.format(PATH_DATA))
167 |     metadata = json.load(f_meta)
168 | 
169 |     gsms, urls = [], []
170 | 
171 |     for sample in metadata:
172 |         if 'SRA' in metadata[sample]:
173 |             gsms.append(sample)
174 |             urls.append(metadata[sample]['SRA'][0] if isinstance(metadata[sample]['SRA'], list) else metadata[sample]['SRA'])  # 'SRA' is stored as a list in metadata.json
175 | 
176 |     return zip(gsms, urls)
177 | 
178 | if __name__ == "__main__":
179 |     main()
180 | 
--------------------------------------------------------------------------------
/garmire_SNV_calling/deploy_star.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from fnmatch import fnmatch
3 | 
4 | from os import listdir
5 | from os import mkdir
6 | from os.path import isdir
7 | from os.path import isfile
8 | from os.path import getsize
9 | 
10 | from garmire_SNV_calling.bash_utils import exec_cmd
11 | 
12 | from time import sleep
13 | from random import random
14 | 
15 | from os import popen
16 | 
17 | from distutils.dir_util import mkpath
18 | 
19 | from garmire_SNV_calling.config import PATH_STAR_SOFTWARE \
20 |      as PATH_SOFTWARE
21 | 
22 | from garmire_SNV_calling.config import STAR_THREADS as THREADS
23 | 
24 | from garmire_SNV_calling.bash_utils import printf
25 | 
26 | 
27 | ############ VARIABLES ############################################
28 | 
29 | from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN
30 | from garmire_SNV_calling.config import FASTQ_PATH
31 | from garmire_SNV_calling.config import STAR_INDEX_PATH
32 | from garmire_SNV_calling.config import 
STAR_INDEX_READ_LENGTH 33 | 34 | from garmire_SNV_calling.config import OUTPUT_PATH_STAR \ 35 | as OUTPUT_PATH 36 | 37 | from garmire_SNV_calling.config import SIMULATED_REF_GENOME 38 | 39 | ################################################################### 40 | 41 | 42 | def star_analysis( 43 | output_path=OUTPUT_PATH, 44 | fastq_path=FASTQ_PATH, 45 | pattern=PATTERN, 46 | star_index_path=STAR_INDEX_PATH, 47 | star_index_read_length=STAR_INDEX_READ_LENGTH, 48 | simulated_ref_genome=SIMULATED_REF_GENOME, 49 | path_software=PATH_SOFTWARE, 50 | threads=THREADS, 51 | cufflinks_compatibility=None, 52 | custom_star_index_name=True, 53 | stdout=None, 54 | printf=printf): 55 | """ 56 | """ 57 | sleep(2 * random()) 58 | 59 | options = '' 60 | 61 | if cufflinks_compatibility: 62 | options = '--outSAMstrandField intronMotif'\ 63 | ' --outFilterIntronMotifs RemoveNoncanonical' 64 | 65 | if not isdir(output_path): 66 | mkpath(output_path) 67 | 68 | for fil in listdir(fastq_path): 69 | if isfile(fastq_path + '/' + fil): 70 | continue 71 | 72 | if pattern and not fnmatch(fil, pattern): 73 | continue 74 | 75 | printf("====> file to be aligned: {0}".format(fil)) 76 | 77 | if not isdir(output_path + fil): 78 | mkdir(output_path + fil) 79 | 80 | if isfile(output_path + fil + "/Aligned.sortedByCoord.out.bam") \ 81 | and getsize(output_path + fil + "/Aligned.sortedByCoord.out.bam"): 82 | printf('bam file result alreay exists for:{0}\nskipping...'\ 83 | .format(fil)) 84 | continue 85 | 86 | fastq_str = "" 87 | 88 | for fastq_fil in sorted(listdir(fastq_path + '/' + fil)): 89 | if fnmatch(fastq_fil, "*.fastq"): 90 | fastq_str += "{0}/{1}/{2} ".format(fastq_path, fil, fastq_fil) 91 | 92 | if not fastq_str: 93 | printf('no fastq file found for:{0}!\nskipping'.format(fil)) 94 | continue 95 | 96 | if custom_star_index_name: 97 | star_index_path_ready = "{0}READ{1}".format(star_index_path.rstrip('/'), 98 | star_index_read_length) 99 | 100 | if simulated_ref_genome: 101 | star_index_path_ready = star_index_path_ready.rstrip('/') \ 102 | + 'SIM{0}/'.format(simulated_ref_genome) 103 | else: 104 | star_index_path_ready = star_index_path 105 | 106 | tmp_path = '{0}/_STARtmp'.format(output_path + '/' + fil + "/") 107 | 108 | if isdir(tmp_path): 109 | exec_cmd('rm -r {0}'.format(tmp_path), stdout) 110 | 111 | cmd = "{0} --readFilesIn {1} --runThreadN {2}"\ 112 | " --twopassMode Basic --outSAMtype BAM SortedByCoordinate" \ 113 | " --outTmpDir {5} --outFileNamePrefix {3} --genomeDir {4} {6}"\ 114 | .format(path_software, 115 | fastq_str, 116 | threads, 117 | output_path + '/' + fil + "/", 118 | star_index_path_ready, 119 | tmp_path, 120 | options 121 | ) 122 | 123 | printf('star cmd to be launched:{0}'.format(cmd)) 124 | exec_cmd(cmd, stdout) 125 | 126 | def check_star_folder(new_star_path): 127 | """ 128 | """ 129 | if isfile('{0}/star_aligned_successfull.log'.format(new_star_path)): 130 | return '#### STAR already aligned successfully in: {0}'.format(new_star_path) 131 | 132 | def clean_star_folder(path_star_results): 133 | """ 134 | """ 135 | path_bam = '{0}/Aligned.sortedByCoord.out.bam'.format(path_star_results) 136 | path_log_final = '{0}/Log.final.out'.format(path_star_results) 137 | path_sj = '{0}/SJ.out.tab'.format(path_star_results) 138 | 139 | assert(isfile(path_bam) and isfile(path_log_final) and isfile(path_sj)) 140 | 141 | popen('rm -r {0}/_STAR*'.format(path_star_results)).read() 142 | popen('rm -r {0}/Log.out'.format(path_star_results)).read() 143 | popen('rm -r 
{0}/Log.progress.out'.format(path_star_results)).read() 144 | 145 | f_log = open('{0}/star_aligned_successfull.log'.format(path_star_results), 'w') 146 | f_log.write('STAR successfull') 147 | 148 | return 149 | 150 | 151 | if __name__ == "__main__": 152 | star_analysis() 153 | -------------------------------------------------------------------------------- /garmire_download_ncbi_sra/download_soft_file.py: -------------------------------------------------------------------------------- 1 | from garmire_SNV_calling.bash_utils import exec_cmd 2 | 3 | from garmire_download_ncbi_sra.config import PATH_DATA 4 | from garmire_download_ncbi_sra.config import PROJECT_NAME 5 | from garmire_download_ncbi_sra.config import SOFT_ID 6 | 7 | from glob import glob 8 | 9 | from collections import Counter 10 | 11 | from os import mkdir 12 | 13 | from os.path import isfile 14 | 15 | import re 16 | 17 | from collections import defaultdict 18 | 19 | from os.path import isdir 20 | 21 | from datetime import datetime 22 | import json 23 | 24 | 25 | def main(): 26 | """ """ 27 | if not isdir(PATH_DATA): 28 | mkdir(PATH_DATA) 29 | 30 | download_and_process_soft(SOFT_ID) 31 | 32 | 33 | def download_and_process_soft(gse, erase=False): 34 | """ 35 | """ 36 | if not erase: 37 | if glob('{0}/{1}*'.format(PATH_DATA, gse)): 38 | print('soft file seems existing for: {0}'.format(gse)) 39 | return 40 | 41 | print('downloadin: {0}...'.format(gse)) 42 | 43 | address = 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/{0}nnn/{1}/soft/{1}_family.soft.gz'.format( 44 | gse[:-3], 45 | gse 46 | ) 47 | 48 | exec_cmd('wget {0} -O {1}/{2}.soft.gz'.format(address, PATH_DATA, PROJECT_NAME)) 49 | exec_cmd('gzip -d {0}/{1}.soft.gz'.format(PATH_DATA, PROJECT_NAME)) 50 | 51 | read_soft('{0}/{1}.soft'.format(PATH_DATA, PROJECT_NAME)) 52 | 53 | def read_soft(soft_file): 54 | """ 55 | """ 56 | gse_dict = extract_gsm_from_soft(soft_file) 57 | 58 | if not gse_dict: 59 | print('soft_file:{0} empty!'.format(soft_file)) 60 | return 61 | 62 | n_samples = len(gse_dict) 63 | 64 | organism = Counter([gse_dict[gse]['organism_code'] for gse in gse_dict]) 65 | 66 | organism = sorted(organism.items(), key=lambda x:x[1], reverse=True)[0][0] 67 | organism = organism.split()[0] 68 | 69 | f_stat = open('{0}/statistics.json'.format(PATH_DATA), 'w') 70 | f_meta = open('{0}/metadata.json'.format(PATH_DATA), 'w') 71 | 72 | f_meta.write(json.dumps(gse_dict, indent=2)) 73 | 74 | f_stat.write(json.dumps({ 75 | 'organism':organism, 76 | "nb_samples": n_samples, 77 | "soft ID": SOFT_ID 78 | }, indent=2)) 79 | 80 | print("organism: {0}".format(organism)) 81 | print("number of samples: {0}".format(n_samples)) 82 | 83 | 84 | def extract_gsm_from_soft( 85 | soft_file, 86 | flatten_gsm=False, 87 | remove_not_sra=True): 88 | """ 89 | """ 90 | gse_dict = {} 91 | 92 | assert(isfile(soft_file)) 93 | 94 | f_soft = open(soft_file) 95 | line = f_soft.readline() 96 | 97 | while line: 98 | if line.count('^SAMPLE'): 99 | data = defaultdict(list) 100 | gse = line.strip('\n').split(' = ', 1)[1].strip() 101 | data['GSE'] = gse 102 | 103 | line = f_soft.readline() 104 | 105 | while line and line[0] == '!': 106 | key, value = line.split(' = ', 1) 107 | 108 | key = key.strip('! 
')
109 |                 key = key.replace('/', '_')
110 |                 value = value.strip('\n ')
111 | 
112 |                 if key[:7] == 'Sample_':
113 |                     key = key[7:]
114 | 
115 |                 if value[:6] == 'ftp://':
116 |                     data['ftp'].append(value)
117 | 
118 |                 sra = re.findall('https://www.ncbi.nlm.nih.gov/sra\?term=SRX[0-9]+', value)  # '?' escaped: unescaped it is a regex quantifier and the literal url never matches
119 | 
120 |                 geo_organism = data['organism_ch1']
121 | 
122 |                 if geo_organism:
123 |                     geo_organism = geo_organism[0]
124 | 
125 |                     if geo_organism == 'Homo sapiens' or geo_organism == 'Homo':
126 |                         data['organism_code'] = 'HUMAN'
127 |                     elif geo_organism == 'Mus musculus' or geo_organism == 'Mus':
128 |                         data['organism_code'] = 'MOUSE'
129 | 
130 |                 if sra:
131 |                     data['SRA'].append(sra[0])
132 | 
133 |                 data[key].append(value)
134 | 
135 |                 line = f_soft.readline()
136 | 
137 |             if 'relation' in data:
138 |                 for relation in data['relation']:
139 |                     key, value = relation.split(': ')
140 |                     key, value = key.strip(), value.strip()  # the stripped values were previously discarded
141 |                     data[key] = value
142 | 
143 |             if flatten_gsm:
144 |                 for key in data:
145 |                     data[key] = check_value(key, data[key])
146 | 
147 |                     if len(data[key]) == 1:
148 |                         data[key] = data[key][0]
149 | 
150 |             if 'SRA' in data or not remove_not_sra:
151 |                 gse_dict[gse] = data
152 | 
153 |         else:
154 |             line = f_soft.readline()
155 | 
156 |     return gse_dict
157 | 
158 | 
159 | def check_value(key, values):
160 |     """
161 |     """
162 |     is_list = True
163 | 
164 |     if not isinstance(values, list):
165 |         is_list = False
166 |         values = [values]
167 | 
168 |     values = map(format_value, values)
169 | 
170 |     if key.count('zip') and not isinstance(values[0], int):
171 |         values = map(lambda x: 0, values)
172 | 
173 |     if key.count('phone') and not isinstance(values[0], int):
174 |         values = map(lambda x: 0, values)
175 | 
176 |     if not is_list:
177 |         values = values[0]
178 | 
179 |     return values
180 | 
181 | def format_value(value):
182 |     """
183 |     """
184 |     if value.isdigit():
185 |         value = int(value)
186 |     elif re.findall('[A-Z][a-z][a-z] [0-9]{2} [0-9]{4}', value):
187 |         value = re.findall('[A-Z][a-z][a-z] [0-9]{2} [0-9]{4}', value)[0]
188 |         value = datetime.strptime(value, '%b %d %Y')
189 | 
190 |     return value
191 | 
192 | 
193 | if __name__ == '__main__':
194 |     main()
195 | 
--------------------------------------------------------------------------------
/README_snv_calling.md:
--------------------------------------------------------------------------------
1 | # SNV computation pipeline
2 | 
3 | This module aligns reads from FASTQ files and infers SNVs from RNA-seq datasets. The pipeline largely follows the [GATK variant calling good practices](http://gatkforums.broadinstitute.org/wdl/discussion/3891/calling-variants-in-rnaseq). It can also optionally infer raw gene expression, annotate SNVs and perform quality control (QC) checks. 
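For orientation, the per-sample stages that the SNV-calling step chains together (inferred from the wrapper method names in `process_snv_GATK.py` and `process_freebayes.py`; the exact GATK tool names are the ones recommended by the GATK RNA-seq best practices) are roughly:

```text
STAR 2-pass alignment
  -> picard AddOrReplaceReadGroups + MarkDuplicates
  -> GATK SplitNCigarReads
  -> GATK indel realignment (RealignerTargetCreator + IndelRealigner)
  -> GATK base quality recalibration (BaseRecalibrator + PrintReads)
  -> variant calling (GATK HaplotypeCaller and/or freebayes)
  -> filtering -> snv_filtered.vcf
```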
4 | 5 | * GATK reference: 6 | * [From FastQ data to high confidence variant calls: the Genome Analysis Toolkit best practices pipeline.](http://www.ncbi.nlm.nih.gov/pubmed/25431634) 7 | 8 | * Pipeline schema: 9 | ![Pipeline schema:](./img/workflow.png) 10 | 11 | 12 | ## 13 | 14 | 15 | # STAR alignment and SNV calling from scratch using docker 16 | 17 | ## Requirements 18 | * docker 19 | * possible root access 20 | * 13.8 GB of free memory (docker image) + memory for STAR indexes (usually 20 GB per index) and downloaded data 21 | 22 | ## installation (local) 23 | 24 | ```bash 25 | docker pull opoirion/ssrge 26 | mkdir // 27 | cd // 28 | PATHDATA=`pwd` 29 | ``` 30 | 31 | ## usage 32 | 33 | The pipeline consists of 4 steps for aligning and calling SNVs: 34 | 35 | ```bash 36 | # align and SNV calling 37 | docker run --rm opoirion/ssrge star_index -h 38 | docker run --rm opoirion/ssrge process_star -h 39 | docker run --rm opoirion/ssrge feature_counts -h 40 | docker run --rm opoirion/ssrge process_snv -h 41 | 42 | ``` 43 | 44 | ## example 45 | 46 | Let's download and process 2 samples from GSE79457 in a project name test_n2 47 | 48 | ```bash 49 | # download of the soft file containing the metadata for GSE79457 (see download section) 50 | ## all these data can also be obtained using other alternative workflows 51 | # here you need to precise which read length to use for creating a STAR index and which ref organism (MOUSE/HUMAN) 52 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge star_index -project_name test_n2 -read_length 100 -cell_type HUMAN 53 | # STAR alignment 54 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge process_star -project_name test_n2 -read_length 100 -cell_type HUMAN 55 | # sample-> gene count matrix 56 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge feature_counts -project_name test_n2 57 | #SNV inference 58 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge process_snv -project_name test_n2 -cell_type HUMAN 59 | ``` 60 | 61 | # Installation from github (*not updated!!* => Use the docker image for now) 62 | 63 | ## Requirements 64 | * The pipeline requires that the following programs are installed: 65 | * Linux/ Unix (not tested) working environment 66 | * [python 2 (>=2.7)](https://www.python.org/download/releases/2.7.2/) 67 | * [STAR Aligner](https://github.com/alexdobin/STAR) 68 | * [GATK](https://software.broadinstitute.org/gatk/download/) 69 | * [picard-tools](https://broadinstitute.github.io/picard/) 70 | * [Java (>=1.8)](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) 71 | * [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc) \[OPTIONAL\] 72 | * [featureCounts](http://subread.sourceforge.net/) \[OPTIONAL\] 73 | * [snpEff](http://snpeff.sourceforge.net/) \[OPTIONAL\] 74 | * Appropriate snpEff database should be downloaded and installed (see config.py). (It can be done using snpEff command line, see documentation) 75 | 76 | * For each sample, FASTQ files must be inside a specific folder. Also, all the FASTQ folders must be inside a specific folder. (see config.py file) 77 | * reference genome (.fa file) and gene annotations file (.gtf) must be provided (see config.py file) 78 | * Reference variant files must be also provided for the SNV calling procedure (see config.py file). 
79 | * \[HUMAN\]: 80 | * dbsnp can be downloaded here: [ftp://ftp.ncbi.nih.gov/snp/organisms/](ftp://ftp.ncbi.nih.gov/snp/organisms/) 81 | * additional reference SNV resources can be downloaded here: [ftp://ftp.broadinstitute.org/bundle/2.8/hg19](ftp://ftp.broadinstitute.org/bundle/2.8/hg19) 82 | * \[MOUSE\]: 83 | * Mouse reference variant and indel databases can be downloaded here: [ftp://ftp-mouse.sanger.ac.uk/REL-1303- SNPs_Indels-GRCm38/](ftp://ftp-mouse.sanger.ac.uk/REL-1303- SNPs_Indels-GRCm38/). However, vcf files should probably be resorted toward the mouse reference genome using the sequence dictionnary. 84 | 85 | ## configuration 86 | 87 | move to folder of the git project (https://github.com/lanagarmire/SSrGE.git) 88 | 89 | ```bash 90 | cd SSrGE 91 | ``` 92 | 93 | All the environment variables should be set into the ./garmire_SNV_calling//config.py file 94 | 95 | ## usage 96 | 97 | * Once all the environment variables are defined, one should run the test scripts: 98 | 99 | * [optional] Running all the tests: 100 | * 101 | 102 | ```bash 103 | python test/test_snv.py -v 104 | python test/test_snv_optional.py -v # test optionnal features described above 105 | ``` 106 | 107 | * create a STAR index for the used reference genome and the read length used: 108 | 109 | ```bash 110 | python garmire_SNV_calling/generate_STAR_genome_index.py 111 | ``` 112 | 113 | * Align the reads 114 | 115 | ```bash 116 | python garmire_SNV_calling/deploy_star.py 117 | ``` 118 | 119 | * infer SNVs 120 | 121 | ```bash 122 | python garmire_SNV_calling/process_multiple_snv.py 123 | ``` 124 | 125 | * Check STAR overall quality (generate a csv file with the percentage of unique reads mapped for each sample in OUTPUT_PATH) 126 | 127 | ```bash 128 | python garmire_SNV_calling/check_star_overall_quality.py 129 | ``` 130 | 131 | * generate a fastqc report for each sample \[argument --nb_threads: number of processes in parallel\] 132 | 133 | ```bash 134 | python garmire_SNV_calling/process_fastqc_report.py --nb_threads 135 | ``` 136 | 137 | * Use the FastQC report to generate a csv file in OUTPUT_PATH reporting, for each sample, if the [duplicated test](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/8%20Duplicate%20Sequences.html) of fastqc is passed. 138 | 139 | ```bash 140 | python garmire_SNV_calling/check_fastqc_stats.py 141 | ``` 142 | 143 | * Generate gene expression matrices (raw count) 144 | 145 | ```bash 146 | python garmire_SNV_calling/compute_frequency_matrix.py 147 | ``` 148 | 149 | * Annotate SNV: generate new .vcf files with SNV annotations. \[argument --nb_threads: number of processes in parallel\] 150 | 151 | ```bash 152 | python garmire_SNV_calling/process_annotate_snv.py --nb_threads 153 | ``` 154 | 155 | ## contact and credentials 156 | * Developer: Olivier Poirion (PhD) 157 | * contact: opoirion@hawaii.edu -------------------------------------------------------------------------------- /garmire_SSrGE/extract_data.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | from collections import defaultdict 3 | from collections import Counter 4 | 5 | from bisect import bisect 6 | 7 | import numpy as np 8 | 9 | from garmire_SSrGE.config import EXPRESSION_MATRIX_FOLDER_PATH 10 | from garmire_SSrGE.config import GENE_MATRIX_NAME 11 | from garmire_SSrGE.config import VCF_FOLDER_PATH 12 | from garmire_SSrGE.config import VCF_NAME 13 | 14 | from garmire_SSrGE.load_data import process_line_from_vcf_file 15 | from garmire_SSrGE.load_data import process_line_from_annotated_vcf_file 16 | from garmire_SSrGE.load_data import load_indexes 17 | 18 | 19 | def debug(): 20 | """ DEBUG """ 21 | ExtractData() 22 | 23 | 24 | class ExtractData(): 25 | """ """ 26 | def __init__( 27 | self, 28 | expression_matrix_folder_path=EXPRESSION_MATRIX_FOLDER_PATH, 29 | gene_matrix_name=GENE_MATRIX_NAME, 30 | vcf_folder_path=VCF_FOLDER_PATH, 31 | vcf_name=VCF_NAME): 32 | """ """ 33 | self.expression_matrix_folder_path = expression_matrix_folder_path 34 | self.gene_matrix_name = gene_matrix_name 35 | self.vcf_folder_path = vcf_folder_path 36 | self.vcf_name = vcf_name 37 | 38 | self.index = None 39 | self.position_index = None 40 | self.index_start = None 41 | self.index_end = None 42 | self.snv_id_dict = defaultdict(str) 43 | self._snvs_index = {} 44 | 45 | self.average_expression = defaultdict(list) 46 | 47 | def _load_indexes(self): 48 | """ """ 49 | if isinstance(self.index_end, type(None)): 50 | (self.index_start, 51 | self.index_end, 52 | self.position_index) = load_indexes() 53 | 54 | def _load_annotate_snv_from_vcf(self, snv_path): 55 | """load annotated snv """ 56 | f_snv = open(snv_path, 'r') 57 | 58 | result = defaultdict(list) 59 | 60 | for line in f_snv: 61 | res = process_line_from_annotated_vcf_file(line) 62 | if res: 63 | snvid, snvinfolist = res 64 | result[snvid] = snvinfolist 65 | return result 66 | 67 | def load_snv_from_vcf_file(self, vcf_path): 68 | """ 69 | load snv from a vcf file 70 | 71 | input: 72 | :vcf_path: path to the vcf file 73 | """ 74 | self._load_indexes() 75 | 76 | f_snv = open(vcf_path, 'r') 77 | 78 | wrong_count = 0 79 | good_count = 0 80 | 81 | result = Counter() 82 | 83 | for line in f_snv: 84 | res = process_line_from_vcf_file(line) 85 | 86 | if not res: 87 | continue 88 | 89 | chrid, start, end, snv_id = res 90 | 91 | if chrid not in self.position_index['start']: 92 | continue 93 | 94 | ref_start = bisect(self.position_index['start'][chrid], start) 95 | ref_start_list = self.position_index['start'][chrid][max(ref_start-10, 0): 96 | ref_start] 97 | ref_end = bisect(self.position_index['end'][chrid], end) 98 | ref_end_list = self.position_index['end'][chrid][ref_end: 99 | ref_end+10] 100 | ref_start_from_end = set([en[0] for e in ref_end_list 101 | for en in self.index_end[chrid][e]]) 102 | genes_hit_by_snv = ref_start_from_end\ 103 | .intersection(ref_start_list) 104 | 105 | if genes_hit_by_snv: 106 | 107 | for gene_begin in genes_hit_by_snv: 108 | for gene_end_tuple in self.index_start[chrid][gene_begin]: 109 | gene_end = gene_end_tuple[0] 110 | 111 | if not ((gene_begin < start) and (end < gene_end) ): 112 | wrong_count += 1 113 | continue 114 | 115 | good_count += 1 116 | gene_id = gene_end_tuple[1] 117 | snv_name = (gene_id, start) 118 | 119 | if snv_id: 120 | self.snv_id_dict[(gene_id, start)] = snv_id 121 | 122 | result[snv_name] = 1.0 123 | 124 | snv_index = (chrid, start) 125 | 126 | self._snvs_index[snv_index] = snv_name 127 | 128 | return result 129 | 130 | def load_snv_from_cell(self, folder): 131 | """ 132 | Return 
SNV found as a dict: 133 | Counter(snv_id: 1) 134 | 135 | input: 136 | :folder: str id of the sample 137 | """ 138 | f_path = "{0}/{1}/{2}".format( 139 | self.vcf_folder_path, folder, self.vcf_name) 140 | 141 | return self.load_snv_from_vcf_file(f_path) 142 | 143 | def load_expression_profile_from_cell(self, folder): 144 | """ 145 | Return cell log FPKM as a dict: 146 | Counter(gene_id: expr_profile) 147 | 148 | input: 149 | :folder: str id of the sample 150 | """ 151 | f_path = "{0}/{1}/{2}".format( 152 | self.expression_matrix_folder_path, folder, self.gene_matrix_name) 153 | 154 | fpkm_dict = self.load_expression_profile_from_file(f_path) 155 | 156 | for gid in fpkm_dict: 157 | fpkm_dict[gid] = np.log(1.0 + fpkm_dict[gid]) 158 | 159 | return fpkm_dict 160 | 161 | def get_average_expression_dict(self): 162 | """ """ 163 | for gid in self.average_expression: 164 | 165 | self.average_expression[gid] = np.mean( 166 | self.average_expression[gid]) 167 | 168 | return self.average_expression 169 | 170 | def load_expression_profile_from_file(self, f_path): 171 | """ 172 | load expression profile from file 173 | input: 174 | :f_path: path to the matrix file 175 | """ 176 | count_array = defaultdict(list) 177 | 178 | res = Counter() 179 | 180 | f_expr = open(f_path, 'r') 181 | f_expr.readline() 182 | f_expr.readline() 183 | 184 | tot_nb_read = 0 185 | 186 | for line in f_expr: 187 | line = line.strip('\n').split('\t') 188 | value = float(line[-1]) 189 | g_start = float(line[2].split(';', 1)[0]) 190 | g_end = float(line[3].split(';', 1)[0]) 191 | gene_id = line[0] 192 | 193 | if value: 194 | tot_nb_read += value 195 | pre_fpkm = value / (g_end - g_start) 196 | res[gene_id] = pre_fpkm 197 | 198 | else: 199 | count_array[gene_id].append(value) 200 | 201 | for gid in res: 202 | res[gid] *= 10**9 / tot_nb_read 203 | count_array[gid].append(res[gid]) 204 | 205 | self.average_expression[gid].append(res[gid]) 206 | 207 | return res 208 | 209 | def get_snv_id_dict(self): 210 | """ """ 211 | return defaultdict(str, self.snv_id_dict) 212 | 213 | 214 | if __name__ == "__main__": 215 | debug() 216 | -------------------------------------------------------------------------------- /garmire_SNV_calling/process_freebayes.py: -------------------------------------------------------------------------------- 1 | from config import SAMTOOLS 2 | from config import PYTHON 3 | 4 | from sys import argv 5 | from os.path import isfile 6 | 7 | from time import time 8 | 9 | from garmire_SNV_calling.process_snv_GATK import ProcessGATKSNV 10 | from garmire_SNV_calling.process_snv_GATK import PICARD_DIR 11 | from garmire_SNV_calling.process_snv_GATK import PLATEFORM 12 | from garmire_SNV_calling.process_snv_GATK import ORGANISM 13 | from garmire_SNV_calling.process_snv_GATK import REF_GENOME 14 | from garmire_SNV_calling.process_snv_GATK import DBSNP 15 | from garmire_SNV_calling.process_snv_GATK import VCF_RESOURCES 16 | from garmire_SNV_calling.process_snv_GATK import PROCESS_ID 17 | 18 | from garmire_SNV_calling.config import PATH_OPOSSUM 19 | from garmire_SNV_calling.config import PATH_FREEBAYES 20 | 21 | from garmire_SNV_calling.config import OUTPUT_PATH_GATK 22 | 23 | 24 | if "--do_both_callers" in argv: 25 | DO_BOTH_CALLERS = True 26 | else: 27 | DO_BOTH_CALLERS = False 28 | 29 | if "--path_to_data" in argv: 30 | PATH_TO_DATA = argv[ 31 | argv.index("--path_to_data") + 1] 32 | PATH_OUTPUT = PATH_TO_DATA + '/freebayes/' 33 | else: 34 | from garmire_SNV_calling.config import PATH_OUTPUT 35 | from garmire_SNV_calling.config 
import OUTPUT_PATH_FREEBAYES 36 | 37 | 38 | def main(): 39 | """ """ 40 | process_freebayes = ProcessFreebayesCaller(id=PROCESS_ID) 41 | if DO_BOTH_CALLERS: 42 | process_freebayes.process_ALL_callers() 43 | else: 44 | process_freebayes.process() 45 | 46 | 47 | class ProcessFreebayesCaller(ProcessGATKSNV): 48 | """ """ 49 | def __init__(self, 50 | output_path=OUTPUT_PATH_FREEBAYES, 51 | path_to_data=PATH_OUTPUT, 52 | picard_dir=PICARD_DIR, 53 | plateform=PLATEFORM, 54 | organism=ORGANISM, 55 | path_freebayes=PATH_FREEBAYES, 56 | ref_genome=REF_GENOME, 57 | samtools=SAMTOOLS, 58 | dbsnp=DBSNP, 59 | vcf_resources=VCF_RESOURCES, 60 | output_path_gatk=OUTPUT_PATH_GATK, 61 | respath_gatk=None, 62 | **kwargs): 63 | """ """ 64 | self.output_path_gatk = output_path_gatk 65 | self.path_freebayes =path_freebayes 66 | self.samtools = samtools 67 | 68 | ProcessGATKSNV.__init__( 69 | self, 70 | output_path=output_path, 71 | path_to_data=path_to_data, 72 | picard_dir=picard_dir, 73 | plateform=plateform, 74 | organism=organism, 75 | ref_genome=ref_genome, 76 | dbsnp=dbsnp, 77 | vcf_resources=vcf_resources, 78 | **kwargs) 79 | 80 | self.respath_gatk = respath_gatk 81 | 82 | def process(self, srr_to_process=None): 83 | """ 84 | """ 85 | if srr_to_process: 86 | self.srr_to_process = srr_to_process 87 | 88 | msg = self._init_process() 89 | 90 | if msg: 91 | print(msg) 92 | self.stdout.write(msg) 93 | return 94 | 95 | self._launch_picard_readgroups() 96 | self._launch_picard_markduplicates() 97 | self._launch_gatk_cigar() 98 | self._launch_gatk_realigner_target_creator() 99 | self._launch_gatk_realigner_indel() 100 | self._launch_gatk_base_recalibrator() 101 | self._launch_gatk_print_reads() 102 | self._process_freebayes('recal.bam') 103 | self._finish_process() 104 | self._rm_tmp_file() 105 | 106 | def process_ALL_callers(self, srr_to_process=None): 107 | """ 108 | """ 109 | if srr_to_process: 110 | self.srr_to_process = srr_to_process 111 | 112 | msg = self._init_process() 113 | 114 | if msg: 115 | print(msg) 116 | self.stdout.write(msg) 117 | return 118 | 119 | self._init_process_gatk() 120 | self._launch_picard_readgroups() 121 | self._launch_picard_markduplicates() 122 | self._launch_gatk_cigar() 123 | self._launch_gatk_realigner_target_creator() 124 | self._launch_gatk_realigner_indel() 125 | self._launch_gatk_base_recalibrator() 126 | self._launch_gatk_print_reads() 127 | self._process_freebayes('recal.bam') 128 | self._launch_gatk_variant_calling() 129 | self._launch_gatk_variant_filtering() 130 | 131 | self._finish_process(ext="_GATK", out="_GATK") 132 | self._finish_process(ext="_freebayes", out="_freebayes") 133 | self._rm_tmp_file() 134 | 135 | def _init_process_gatk(self): 136 | """ 137 | """ 138 | if not self.respath_gatk: 139 | self.respath_gatk = self.output_path_gatk + \ 140 | "/data/" + self.srr_to_process 141 | 142 | def _process_samtools_calmd(self, bam_input="Aligned.sortedByCoord.out.bam"): 143 | """ 144 | """ 145 | if self.check_if_output_exists( 146 | "{0}/md.bam".format(self.tmppath)): 147 | return 148 | 149 | self._run_cmd( 150 | 'echo "\n\n######## LAUNCHING SAMTOOLS CALMD ########\n"') 151 | 152 | cmd = "{0} calmd -b {1}/{2} {3} > {1}/md.bam".format( 153 | self.samtools, 154 | self.tmppath, 155 | bam_input, 156 | self.ref_genome) 157 | 158 | self._run_cmd(cmd) 159 | 160 | cmd = "{0} index {1}/md.bam".format( 161 | self.samtools, 162 | self.tmppath) 163 | 164 | self._run_cmd(cmd) 165 | 166 | def _process_opossum(self, bam_input="md.bam"): 167 | """ 168 | " --SoftClipsExist True 
--KeepMismatches True " \
169 |         """
170 |         if self.check_if_output_exists(
171 |                 "{0}/clean.bam".format(self.tmppath)):  # Opossum writes its output to clean.bam
172 |             return
173 | 
174 |         self._run_cmd(
175 |             'echo "\n\n######## LAUNCHING opossum ########\n"')
176 | 
177 |         cmd = "{0} {1}/Opossum.py --BamFile {2}/{3} " \
178 |               " --OutFile {2}/clean.bam ".format(
179 |                   PYTHON,
180 |                   PATH_OPOSSUM,
181 |                   self.tmppath,
182 |                   bam_input
183 |               )
184 | 
185 |         self._run_cmd(cmd)
186 | 
187 |     def _process_freebayes(self, bam_input="clean.bam"):
188 |         """
189 |         """
190 |         if self.check_if_output_exists(
191 |                 "{0}/snv_filtered_freebayes.vcf".format(self.tmppath)):
192 |             return
193 | 
194 |         self._run_cmd(
195 |             'echo "\n\n######## LAUNCHING freebayes ########\n"')
196 | 
197 |         start_time = time()
198 | 
199 |         cmd = "{0} -f {1} {2}/{3} > {2}/snv_filtered_freebayes.vcf".format(
200 |             self.path_freebayes,
201 |             self.ref_genome,
202 |             self.tmppath,
203 |             bam_input
204 |         )
205 | 
206 |         self._run_cmd(cmd)
207 | 
208 |         self._run_cmd(
209 |             'echo "\n## freebayes done in {0} s##\n"'.format(time() - start_time))
210 | 
211 |         assert(isfile("{0}/snv_filtered_freebayes.vcf".format(self.tmppath)))
212 | 
213 | if __name__ == '__main__':
214 |     main()
215 | 
--------------------------------------------------------------------------------
/garmire_SSrGE/extract_matrices_from_dataset.py:
--------------------------------------------------------------------------------
1 | from garmire_SSrGE.extract_data import ExtractData
2 | from garmire_SSrGE.load_data import load_gsm_and_sample_names_from_soft
3 | 
4 | from garmire_SSrGE.config import EXPRESSION_MATRIX_FOLDER_PATH
5 | from garmire_SSrGE.config import VCF_FOLDER_PATH
6 | from garmire_SSrGE.config import GENE_MATRIX_NAME
7 | from garmire_SSrGE.config import VCF_NAME
8 | 
9 | from sklearn.feature_extraction import DictVectorizer
10 | 
11 | from os import listdir
12 | 
13 | from sys import stdout
14 | 
15 | from tabulate import tabulate
16 | 
17 | from collections import Counter
18 | 
19 | 
20 | def debug():
21 |     """ DEBUG """
22 |     extract_matrix = ExtractMatrix()
23 | 
24 |     SNV_mat = extract_matrix.extract_SNV_mat()
25 |     GE_mat = extract_matrix.extract_GE_mat()
26 | 
27 | 
28 | class ExtractMatrix():
29 |     """
30 |     class to extract SNV_mat and GE_mat from an existing dataset
31 | 
32 |     Project variables must be defined in the config file (config.py):
33 | 
34 |     PROJECT_PATH
35 |         # path toward the project folder
36 |     GTF_PATH
37 |         # gtf file of the reference genome
38 |         # (the corresponding index must first be generated
39 |         #  with generate_refgenome_index.py)
40 |     EXPRESSION_MATRIX_FOLDER_PATH
41 |         # Path of the folders containing the gene expression matrices
42 |     GENE_MATRIX_NAME
43 |         # name of the gene expression matrix file
44 |     VCF_FOLDER_PATH
45 |         # Path of the folders containing the vcf files
46 |     VCF_NAME
47 |         # name of the vcf files
48 |     """
49 | 
50 |     def __init__(self,
51 |                  min_shared_snv=None,
52 |                  min_gene_expr=None,
53 |                  min_average_gene_expr=2,
54 |                  vcf_folder_path=VCF_FOLDER_PATH,
55 |                  expression_matrix_folder_path=EXPRESSION_MATRIX_FOLDER_PATH,
56 |                  gene_matrix_name=GENE_MATRIX_NAME,
57 |                  vcf_name=VCF_NAME,
58 |                  limit=None):
59 |         """
60 |         :min_shared_snv: int    min number of cells sharing a given snv
61 |         :min_gene_expr: float   min gene expression value for a given cell
62 |         :min_average_gene_expr: float   min gene expression value,
63 |                                 averaged across cells
64 |         :vcf_folder_path: path to vcf folders (one folder per single cell)
65 |         :expression_matrix_folder_path: path to expression matrices folders (one folder per single cell)
66 |         :gene_matrix_name: name of
the gene expression file for each SC folder 67 | :vcf_name: name of the .vcf file for each SC folder 68 | """ 69 | self.vcf_folder_path = vcf_folder_path 70 | self.expression_matrix_folder_path = expression_matrix_folder_path 71 | self.gene_matrix_name = gene_matrix_name 72 | self.vcf_name = vcf_name 73 | 74 | self.min_shared_snv = min_shared_snv 75 | self.min_gene_expr = min_gene_expr 76 | self.min_average_gene_expr = min_average_gene_expr 77 | 78 | samples_with_vcf = set() 79 | samples_with_ge_mat = set() 80 | 81 | if self.vcf_folder_path: 82 | samples_with_vcf = set(listdir(self.vcf_folder_path)) 83 | 84 | if self.expression_matrix_folder_path: 85 | samples_with_ge_mat = set(listdir(self.expression_matrix_folder_path)) 86 | 87 | if samples_with_vcf and samples_with_ge_mat: 88 | self.samples = list(samples_with_vcf.intersection(samples_with_ge_mat)) 89 | else: 90 | self.samples = list(samples_with_vcf.union(samples_with_ge_mat)) 91 | 92 | if limit: 93 | self.samples = self.samples[:limit] 94 | 95 | self.samples_snv_dict = {} 96 | self.samples_ge_dict = {} 97 | 98 | self.extract_data = ExtractData( 99 | vcf_folder_path=self.vcf_folder_path, 100 | expression_matrix_folder_path=self.expression_matrix_folder_path, 101 | gene_matrix_name=self.gene_matrix_name, 102 | vcf_name=self.vcf_name) 103 | 104 | self.gsm_to_name = load_gsm_and_sample_names_from_soft() 105 | self.names = [] 106 | 107 | for sample in self.samples: 108 | gsm = sample.split('_')[0] 109 | name = self.gsm_to_name[gsm] if gsm in self.gsm_to_name else gsm 110 | self.names.append(name) 111 | 112 | def get_samples_list(self): 113 | """ """ 114 | return self.samples 115 | 116 | def extract_SNV_mat(self): 117 | """ 118 | construct SNV binary matrix (n_samples x n_SNVs), 119 | using the project variables described into the config.py file 120 | 121 | return: 122 | :SNV_mat: Matrix (n_samples x n_SNVs) 123 | """ 124 | 125 | if not self.vcf_folder_path: 126 | return None 127 | 128 | i = 0 129 | 130 | for sample in self.samples: 131 | self.samples_snv_dict[sample] = self.extract_data.\ 132 | load_snv_from_cell(sample) 133 | i += 1 134 | stdout.write('\r{0} / {1} VCF files readed'.format(i, len(self.samples))) 135 | stdout.flush() 136 | 137 | average_snvs = Counter() 138 | 139 | for sample in self.samples_snv_dict: 140 | average_snvs += self.samples_snv_dict[sample] 141 | 142 | if self.min_shared_snv: 143 | for sample in self.samples_snv_dict: 144 | for snv in self.samples_snv_dict[sample].keys(): 145 | if average_snvs[snv] < self.min_shared_snv: 146 | self.samples_snv_dict[sample].pop(snv) 147 | 148 | tab = [] 149 | 150 | for sample, name in zip(self.samples, self.names): 151 | tab.append((sample, 152 | name, 153 | len(self.samples_snv_dict[sample]))) 154 | 155 | print('\n', tabulate(tab, headers=['sample', 'name', 'Number of SNVs'])) 156 | 157 | vectorizer = DictVectorizer() 158 | 159 | f_matrix = vectorizer.fit_transform([self.samples_snv_dict[sample] 160 | for sample in self.samples]) 161 | self.snv_index = vectorizer.vocabulary_ 162 | 163 | print('number of SNVs in the dataset:', len(self.snv_index)) 164 | 165 | return f_matrix 166 | 167 | def extract_GE_mat(self): 168 | """ 169 | construct GE matrix (n_genes x n_samples), 170 | using the project variables described into the config.py file 171 | 172 | return: 173 | :GE_mat: Matrix (n_genes x n_samples) 174 | """ 175 | 176 | if not self.expression_matrix_folder_path: 177 | return None 178 | 179 | i = 0 180 | 181 | for sample, name in zip(self.samples, self.names): 182 | 
self.samples_ge_dict[sample] = self.extract_data.\ 183 | load_expression_profile_from_cell(sample) 184 | i += 1 185 | stdout.write('\r{0} / {1} expression files readed'.format(i, len(self.samples))) 186 | stdout.flush() 187 | 188 | average_expr = self.extract_data.get_average_expression_dict() 189 | 190 | tab = [] 191 | 192 | if self.min_gene_expr or self.min_average_gene_expr: 193 | for sample in self.samples_ge_dict: 194 | for gene in self.samples_ge_dict[sample].keys(): 195 | 196 | if self.min_average_gene_expr \ 197 | and average_expr[gene] < self.min_average_gene_expr: 198 | self.samples_ge_dict[sample].pop(gene) 199 | 200 | if self.min_gene_expr \ 201 | and self.samples_ge_dict[sample][gene] < self.min_gene_expr: 202 | self.samples_ge_dict[sample].pop(gene, None) 203 | 204 | for sample, name in zip(self.samples, self.names): 205 | tab.append((sample, 206 | name, 207 | len(self.samples_ge_dict[sample]))) 208 | 209 | print('\n', tabulate(tab, headers=['sample', 'name', 'Number of genes'])) 210 | 211 | vectorizer = DictVectorizer() 212 | f_matrix = vectorizer.fit_transform([self.samples_ge_dict[sample] 213 | for sample in self.samples]) 214 | self.ge_index = vectorizer.vocabulary_ 215 | 216 | print('number of genes in the dataset:', len(self.ge_index)) 217 | 218 | return f_matrix.T 219 | 220 | 221 | if __name__ == "__main__": 222 | debug() 223 | -------------------------------------------------------------------------------- /example/jones_pancreatic_cancer.soft: -------------------------------------------------------------------------------- 1 | ^DATABASE = GeoMiame 2 | !Database_name = Gene Expression Omnibus (GEO) 3 | !Database_institute = NCBI NLM NIH 4 | !Database_web_link = http://www.ncbi.nlm.nih.gov/geo 5 | !Database_email = geo@ncbi.nlm.nih.gov 6 | ^SERIES = GSE85183 7 | !Series_title = Selective single cell isolation for genomics using microraft arrays 8 | !Series_geo_accession = GSE85183 9 | !Series_status = Public on Aug 05 2016 10 | !Series_submission_date = Aug 04 2016 11 | !Series_last_update_date = Aug 05 2016 12 | !Series_summary = Genomic methods are used increasingly to interrogate the individual cells that compose specific tissues. However, current methods for single cell isolation struggle to phenotypically differentiate specific cells in a heterogeneous population and rely primarily on the use of fluorescent markers. Many cellular phenotypes of interest are too complex to be measured by this approach, making it difficult to connect genotype and phenotype at the level of individual cells. Here we demonstrate that microraft arrays, which are arrays containing thousands of individual cell culture sites, can be used to select single cells based on a variety of phenotypes, such as cell surface markers, cell proliferation and drug response. We then show that a common genomic procedure, RNA-seq, can be readily adapted to the single cells isolated from these rafts. We show that data generated using microrafts and our modified RNA-seq protocol compared favorably with the Fluidigm C1. We then used microraft arrays to select pancreatic cancer cells that proliferate in spite of cytotoxic drug treatment. Our single cell RNA-seq data identified several expected and novel gene expression changes associated with early drug resistance. 
13 | !Series_overall_design = 120 samples including cells isolated using microrafts and the Fluidigm C1 14 | !Series_type = Expression profiling by high throughput sequencing 15 | !Series_contributor = Joshua,D,Welch 16 | !Series_contributor = Corbin,D,Jones 17 | !Series_sample_id = GSM2259781 18 | !Series_sample_id = GSM2259782 19 | !Series_sample_id = GSM2259783 20 | !Series_contact_name = Corbin,D.,Jones 21 | !Series_contact_email = cdjones@email.unc.edu 22 | !Series_contact_institute = The University of North Carolina at Chapel Hill 23 | !Series_contact_address = 3159 Genome Sciences Building 24 | !Series_contact_city = Chapel Hill 25 | !Series_contact_state = NC 26 | !Series_contact_zip/postal_code = 27599 27 | !Series_contact_country = USA 28 | !Series_supplementary_file = ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/series/GSE85183/GSE85183_expression_levels.txt.gz 29 | !Series_supplementary_file = ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/SRP080/SRP080915 30 | !Series_platform_id = GPL15520 31 | !Series_platform_taxid = 9606 32 | !Series_sample_taxid = 9606 33 | !Series_relation = BioProject: http://www.ncbi.nlm.nih.gov/bioproject/PRJNA336476 34 | !Series_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=SRP080915 35 | ^PLATFORM = GPL15520 36 | !Platform_title = Illumina MiSeq (Homo sapiens) 37 | !Platform_geo_accession = GPL15520 38 | !Platform_status = Public on May 02 2012 39 | !Platform_submission_date = May 02 2012 40 | !Platform_last_update_date = Aug 05 2016 41 | !Platform_technology = high-throughput sequencing 42 | !Platform_distribution = virtual 43 | !Platform_organism = Homo sapiens 44 | !Platform_taxid = 9606 45 | !Platform_contact_name = ,,GEO 46 | !Platform_contact_country = USA 47 | !Platform_data_row_count = 0 48 | ^SAMPLE = GSM2259781 49 | !Sample_title = mch2-1_TAAGGCG-TATCCTC_L001_RNA-seq 50 | !Sample_geo_accession = GSM2259781 51 | !Sample_status = Public on Aug 05 2016 52 | !Sample_submission_date = Aug 04 2016 53 | !Sample_last_update_date = Aug 05 2016 54 | !Sample_type = SRA 55 | !Sample_channel_count = 1 56 | !Sample_source_name_ch1 = CFPAC-1_None_Bulk 57 | !Sample_organism_ch1 = Homo sapiens 58 | !Sample_taxid_ch1 = 9606 59 | !Sample_characteristics_ch1 = cell line: CFPAC-1 60 | !Sample_characteristics_ch1 = treated with: None 61 | !Sample_characteristics_ch1 = isolation: Bulk 62 | !Sample_characteristics_ch1 = num. 
cells: ~10000 63 | !Sample_characteristics_ch1 = proliferative?: Unknown 64 | !Sample_molecule_ch1 = total RNA 65 | !Sample_extract_protocol_ch1 = ClonTech SMARTer kit 66 | !Sample_extract_protocol_ch1 = Nextera XT 67 | !Sample_description = RNA 68 | !Sample_data_processing = Read alignment using MapSplice 2 69 | !Sample_data_processing = Gene expression quantification using RSEM 70 | !Sample_data_processing = Genome_build: hg19 71 | !Sample_data_processing = Supplementary_files_format_and_content: Gene expression quantification (FPKMs) 72 | !Sample_platform_id = GPL15520 73 | !Sample_contact_name = Corbin,D.,Jones 74 | !Sample_contact_email = cdjones@email.unc.edu 75 | !Sample_contact_institute = The University of North Carolina at Chapel Hill 76 | !Sample_contact_address = 3159 Genome Sciences Building 77 | !Sample_contact_city = Chapel Hill 78 | !Sample_contact_state = NC 79 | !Sample_contact_zip/postal_code = 27599 80 | !Sample_contact_country = USA 81 | !Sample_instrument_model = Illumina MiSeq 82 | !Sample_library_selection = cDNA 83 | !Sample_library_source = transcriptomic 84 | !Sample_library_strategy = RNA-Seq 85 | !Sample_relation = BioSample: http://www.ncbi.nlm.nih.gov/biosample/SAMN05511259 86 | !Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=SRX1999927 87 | !Sample_supplementary_file_1 = ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX199/SRX1999927 88 | !Sample_series_id = GSE85183 89 | !Sample_data_row_count = 0 90 | ^SAMPLE = GSM2259782 91 | !Sample_title = MCH2-2_AGGCAGA-AAGGAGT_L001_RNA-seq 92 | !Sample_geo_accession = GSM2259782 93 | !Sample_status = Public on Aug 05 2016 94 | !Sample_submission_date = Aug 04 2016 95 | !Sample_last_update_date = Aug 05 2016 96 | !Sample_type = SRA 97 | !Sample_channel_count = 1 98 | !Sample_source_name_ch1 = CFPAC-1_None_Bulk 99 | !Sample_organism_ch1 = Homo sapiens 100 | !Sample_taxid_ch1 = 9606 101 | !Sample_characteristics_ch1 = cell line: CFPAC-1 102 | !Sample_characteristics_ch1 = treated with: None 103 | !Sample_characteristics_ch1 = isolation: Bulk 104 | !Sample_characteristics_ch1 = num. 
cells: ~10000 105 | !Sample_characteristics_ch1 = proliferative?: Unknown 106 | !Sample_molecule_ch1 = total RNA 107 | !Sample_extract_protocol_ch1 = ClonTech SMARTer kit 108 | !Sample_extract_protocol_ch1 = Nextera XT 109 | !Sample_description = RNA 110 | !Sample_data_processing = Read alignment using MapSplice 2 111 | !Sample_data_processing = Gene expression quantification using RSEM 112 | !Sample_data_processing = Genome_build: hg19 113 | !Sample_data_processing = Supplementary_files_format_and_content: Gene expression quantification (FPKMs) 114 | !Sample_platform_id = GPL15520 115 | !Sample_contact_name = Corbin,D.,Jones 116 | !Sample_contact_email = cdjones@email.unc.edu 117 | !Sample_contact_institute = The University of North Carolina at Chapel Hill 118 | !Sample_contact_address = 3159 Genome Sciences Building 119 | !Sample_contact_city = Chapel Hill 120 | !Sample_contact_state = NC 121 | !Sample_contact_zip/postal_code = 27599 122 | !Sample_contact_country = USA 123 | !Sample_instrument_model = Illumina MiSeq 124 | !Sample_library_selection = cDNA 125 | !Sample_library_source = transcriptomic 126 | !Sample_library_strategy = RNA-Seq 127 | !Sample_relation = BioSample: http://www.ncbi.nlm.nih.gov/biosample/SAMN05511258 128 | !Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=SRX1999928 129 | !Sample_supplementary_file_1 = ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX199/SRX1999928 130 | !Sample_series_id = GSE85183 131 | !Sample_data_row_count = 0 132 | ^SAMPLE = GSM2259783 133 | !Sample_title = RAFTSE10_CGTACTA-GCGTAAG_L001_RNA-seq 134 | !Sample_geo_accession = GSM2259783 135 | !Sample_status = Public on Aug 05 2016 136 | !Sample_submission_date = Aug 04 2016 137 | !Sample_last_update_date = Aug 05 2016 138 | !Sample_type = SRA 139 | !Sample_channel_count = 1 140 | !Sample_source_name_ch1 = CFPAC-1_None_Microraft 141 | !Sample_organism_ch1 = Homo sapiens 142 | !Sample_taxid_ch1 = 9606 143 | !Sample_characteristics_ch1 = cell line: CFPAC-1 144 | !Sample_characteristics_ch1 = treated with: None 145 | !Sample_characteristics_ch1 = isolation: Microraft 146 | !Sample_characteristics_ch1 = num. 
cells: 0
147 | !Sample_characteristics_ch1 = proliferative?: Unknown
148 | !Sample_molecule_ch1 = total RNA
149 | !Sample_extract_protocol_ch1 = ClonTech SMARTer kit
150 | !Sample_extract_protocol_ch1 = Nextera XT
151 | !Sample_description = RNA
152 | !Sample_data_processing = Read alignment using MapSplice 2
153 | !Sample_data_processing = Gene expression quantification using RSEM
154 | !Sample_data_processing = Genome_build: hg19
155 | !Sample_data_processing = Supplementary_files_format_and_content: Gene expression quantification (FPKMs)
156 | !Sample_platform_id = GPL15520
157 | !Sample_contact_name = Corbin,D.,Jones
158 | !Sample_contact_email = cdjones@email.unc.edu
159 | !Sample_contact_institute = The University of North Carolina at Chapel Hill
160 | !Sample_contact_address = 3159 Genome Sciences Building
161 | !Sample_contact_city = Chapel Hill
162 | !Sample_contact_state = NC
163 | !Sample_contact_zip/postal_code = 27599
164 | !Sample_contact_country = USA
165 | !Sample_instrument_model = Illumina MiSeq
166 | !Sample_library_selection = cDNA
167 | !Sample_library_source = transcriptomic
168 | !Sample_library_strategy = RNA-Seq
169 | !Sample_relation = BioSample: http://www.ncbi.nlm.nih.gov/biosample/SAMN05511257
170 | !Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=SRX1999929
171 | !Sample_supplementary_file_1 = ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX199/SRX1999929
172 | !Sample_series_id = GSE85183
173 | !Sample_data_row_count = 0
174 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SSrGE procedure
2 | 
3 | This procedure fits sparse linear models using a binary matrix (n_samples x n_SNV) as the feature matrix and a gene expression matrix (n_genes x n_samples) as the response. The procedure infers a sparse linear model (LASSO by default) for each gene (row of the second matrix) and keeps the non-zero inferred coefficients.
4 | 
5 | This procedure can be used for dimension reduction/feature selection or for feature ranking. It is based on the Scikit-Learn library and is easy to re-implement. However, the package also parallelizes the fitting procedures, implements a cross-validation procedure and computes eeSNV and gene rankings.
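The core of the procedure is simple to re-implement: one sparse regression per gene, with the SNV matrix as features. A minimal sketch of the idea using scikit-learn's Lasso (illustrative only; the function below is not part of the package API):

```python
import numpy as np
from sklearn.linear_model import Lasso

def fit_eeSNVs_sketch(SNV_mat, GE_mat, alpha=0.1):
    """SNV_mat: (n_samples x n_SNVs), binary; GE_mat: (n_genes x n_samples)"""
    eeSNV_weights = np.zeros(SNV_mat.shape[1])

    for gene_expr in GE_mat:  # one sparse model per gene (row of GE_mat)
        model = Lasso(alpha=alpha)
        model.fit(SNV_mat, gene_expr)
        eeSNV_weights += np.abs(model.coef_)  # accumulate absolute coefficients

    return eeSNV_weights  # SNVs with a non-zero total weight are the eeSNVs
```

(The actual `SSrGE` class wraps this loop with multiprocessing, a per-model time limit and the ranking utilities described below.)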
6 | 
7 | SSrGE can be used as a stand-alone procedure to reduce any SNV matrix (row: single-cell, col: SNV (binary)), using a gene expression matrix (row: gene expression (float), col: single-cell). However, we have developed two additional modules, included in this package, that can be used to download and process RNA-seq data:
8 | * [download_ncbi_data](https://github.com/lanagarmire/SSrGE/blob/master/README_download_ncbi_rsa.md): download and extract .sra files from NCBI
9 | * [SNV_calling](https://github.com/lanagarmire/SSrGE/blob/master/README_snv_calling.md): align reads/infer SNVs and infer gene expression matrices from .fastq files.
10 | 
11 | Alternatively, we compiled the download, alignment, and SNV calling pipelines into a docker container: opoirion/ssrge (see below).
12 | 
13 | 
14 | ## installation (local)
15 | 
16 | ```bash
17 | git clone https://github.com/lanagarmire/SSrGE.git
18 | cd SSrGE
19 | pip2 install -r requirements.txt --user # python 2.7.X must be used
20 | ```
21 | 
22 | ## Requirements
23 | * Linux working environment
24 | * [python 2 (>=2.7)](https://www.python.org/download/releases/2.7.2/)
25 | * Python libraries (automatically installed with the pip install command):
26 |   * Numpy
27 |   * Scipy
28 |   * [Scikit-learn](http://scikit-learn.org/) (version = 0.18)
29 |   * tabulate
30 | 
31 | ## usage
32 | * test SSrGE is functional:
33 | ```bash
34 | python2 test/test_ssrge.py -v
35 | ```
36 | 
37 | * Instantiate and fit SSrGE:
38 | 
39 | SSrGE should be used as a python package; below is a usage example.
40 | SSrGE takes as input two matrices: a SNV matrix X (n_cells x n_SNVs) and a gene expression matrix (n_cells x n_Genes).
41 | In the original study, we encoded X with the following procedure:
42 | if a given snv (s) is present in a given cell (c), then X_c,s = 1.
43 | However, any type of encoding or continuous values can be used (for example, one can use X_c,s = 1 for a 1/1 genotype and 0.5 for a 0/1 genotype).
44 | 
45 | ```python
46 | from garmire_SSrGE.ssrge import SSrGE
47 | from garmire_SSrGE.examples import create_example_matrix_v1 # create example matrices
48 | 
49 | 
50 | help(SSrGE) # See the different functions and specific variables
51 | help(create_example_matrix_v1)
52 | 
53 | X, Y, W = create_example_matrix_v1()
54 | 
55 | ssrge = SSrGE()
56 | 
57 | ssrge.fit(X, Y)
58 | 
59 | score_models, score_null_models = ssrge.score(X, Y)
60 | 
61 | X_r = ssrge.transform(X)
62 | 
63 | print X_r.shape, X.shape
64 | 
65 | ranked_feature = ssrge.rank_eeSNVs()
66 | 
67 | ssrge_ES = SSrGE(model='ElasticNet', alpha=0.1, l1_ratio=0.5) # Fitting using sklearn ElasticNet instead
68 | ssrge_ES.fit(X, Y)
69 | 
70 | ```
71 | 
72 | * Add CNV matrix:
73 | 
74 | The fit method can take an additional CNV matrix of shape (n_cells x n_genes), describing the CNV level for each gene.
75 | 
76 | ```python
77 | from garmire_SSrGE.examples import create_example_matrix_v3
78 | 
79 | X, Y, C, W = create_example_matrix_v3()
80 | 
81 | help(ssrge.fit) # see the specific documentation of the fit method
82 | ssrge.fit(X, Y, C)
83 | ```
84 | 
85 | * Rank eeSNVs:
86 | 
87 | ```python
88 | ranked_feature = ssrge.rank_eeSNVs()
89 | ```
90 | 
91 | * Performing cross-validation:
92 | 
93 | ```python
94 | from garmire_SSrGE.linear_cross_validation import LinearCrossVal
95 | 
96 | help(LinearCrossVal)
97 | 
98 | X, Y, W = create_example_matrix_v1()
99 | 
100 | cross_val = LinearCrossVal(
101 |     model='LASSO',
102 |     SNV_mat=X,
103 |     GE_mat=Y
104 | )
105 | 
106 | path = cross_val.regularization_path('alpha', [0.01, 0.1, 0.2])
107 | ```
108 | 
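Note that the model is selected by name: besides `'LASSO'` and `'ElasticNet'` (shown above), `ssrge.py` also recognizes `'LassoLars'`, `'OMP'` and `'LassoCV'` (the latter selects alpha internally by cross-validation, controlled by the `n_alphas` argument); alternatively, any scikit-learn-like regressor class can be passed directly as `model`, together with its parameters in `model_params`. For example:

```python
ssrge_cv = SSrGE(model='LassoCV', n_alphas=50) # alpha chosen internally by CV
ssrge_cv.fit(X, Y)
```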
109 | ## Use K top-ranked eeSNVs
110 | 
111 | Instead of relying on the regularization parameter (alpha) to select the number of eeSNVs, the `nb_ranked_features` argument can be specified to obtain a fixed number of eeSNVs (assuming that nb_ranked_features is lower than the number of eeSNVs obtained with the specified alpha).
112 | 
113 | ```python
114 | ssrge_topk = SSrGE(nb_ranked_features=2)
115 | X_r_2 = ssrge_topk.fit_transform(X, Y)
116 | 
117 | print X_r_2.shape # (100, 2)
118 | 
119 | ```
120 | 
121 | ## Ranking genes using eeSNVs and providing SNV ids
122 | 
123 | In order to rank genes with eeSNVs, the SSrGE instance must be instantiated with the SNV id and gene id lists:
124 | 
125 | * the gene id order should correspond to the gene order of the gene expression matrix
126 | * a SNV id should be a tuple containing the id of the gene harboring the given SNV and a user-defined SNV id (genome position for example).
127 | 
128 | ```python
129 | gene_id_list_example = ['KRAS', 'HLA-A', 'SPARC']
130 | snv_id_list_example = [('KRAS', 10220), ('KRAS', 10520), ('SPARC', 220)]
131 | 
132 | 
133 | ## real example
134 | from garmire_SSrGE.examples import create_example_matrix_v2
135 | 
136 | X, Y, gene_id_list, snv_id_list = create_example_matrix_v2()
137 | 
138 | ssrge = SSrGE(
139 |     snv_id_list=snv_id_list,
140 |     gene_id_list=gene_id_list,
141 |     nb_ranked_features=2,
142 |     alpha=0.01)
143 | 
144 | ssrge.fit(X, Y)
145 | 
146 | print ssrge.rank_genes()
147 | 
148 | ```
149 | 
150 | ## Analyzing a subgroup
151 | 
152 | Extract the specific eeSNVs and impacted genes of a given subgroup. A given eeSNV is specific to a subgroup if it is significantly more present amongst the cells of that subgroup:
153 | 
154 | ```python
155 | 
156 | # Define the first 6 cells of X as a subgroup
157 | subgroup = ssrge.rank_features_for_a_subgroup([0, 1, 2, 3, 4, 5])
158 | 
159 | print subgroup.ranked_genes
160 | print subgroup.ranked_eeSNVs
161 | 
162 | print subgroup.significant_genes
163 | print subgroup.significant_eeSNVs
164 | 
165 | ```
166 | 
167 | ## create SNV and GE matrices from .VCF files and gene expression files
168 | 
169 | It is possible to create an SNV matrix from preexisting .vcf files and, similarly, a gene expression matrix from expression files.
170 | 
171 | Each cell must have a distinct .vcf file with a unique name (e.g. snv_filtered.vcf) inside a unique folder specific to the cell, named after the cell:
172 | 
173 | * example:
174 | 
175 | ```bash
176 | 
177 | data
178 | |-- GSM2259781__SRX1999927__SRR3999457
179 | |   |-- snv_filtered.vcf
180 | |   `-- stdout.log
181 | `-- GSM2259782__SRX1999928__SRR3999458
182 |     |-- snv_filtered.vcf
183 |     `-- stdout.log
184 | 
185 | ```
186 | 
187 | (stdout.log is not used; it was created by the previous analysis)
188 | 
189 | and similarly for the gene expression files (matrix_counts.txt):
190 | 
191 | ```bash
192 | 
193 | STAR
194 | |-- GSM2259781__SRX1999927__SRR3999457
195 | |   |-- matrix_counts.txt
196 | |   `-- matrix_counts.txt.summary
197 | `-- GSM2259782__SRX1999928__SRR3999458
198 |     |-- matrix_counts.txt
199 |     `-- matrix_counts.txt.summary
200 | 
201 | ```
202 | 
203 | (matrix_counts.txt.summary is not used; it was created by the previous analysis)
204 | 
205 | * The supported expression file format is the following:
206 | 
207 | ```bash
208 | 
209 | #gene_name    chromosomes    starting position    ending position    additional columns    gene expression
210 | MIR6859-3    chr1;chr15;chr16    17369;102513727;67052    17436;102513794;67119    ...    200
211 | ```
212 | 
213 | * variables (paths and file names) specific to GE and SNV matrix extraction can be defined in the config file: garmire_SSrGE/config.py
214 | * First, a GTF index must be created:
215 | 
216 | ```bash
217 | python2 ./garmire_SSrGE/generate_refgenome_index.py
218 | ```
219 | 
220 | * Once the index is generated, the matrices can be generated easily:
221 | 
222 | 
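For example, from python (this mirrors the `debug()` helper shipped in garmire_SSrGE/extract_matrices_from_dataset.py; the paths are taken from the config file):

```python
from garmire_SSrGE.extract_matrices_from_dataset import ExtractMatrix

extract_matrix = ExtractMatrix() # uses the paths defined in garmire_SSrGE/config.py

SNV_mat = extract_matrix.extract_SNV_mat() # (n_samples x n_SNVs), binary
GE_mat = extract_matrix.extract_GE_mat()   # (n_genes x n_samples), log expression
```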
223 | # SRA project download, STAR alignment and SNV calling from scratch using docker
224 | 
225 | ## Requirements
226 | * docker
227 | * possible root access
228 | * 13.8 GB of free disk space (docker image) + space for the STAR indexes (usually 20 GB per index) and the downloaded data
229 | 
230 | ## installation (local)
231 | 
232 | ```bash
233 | docker pull opoirion/ssrge
234 | mkdir <data folder> # folder that will hold the downloaded and processed data
235 | cd <data folder>
236 | PATHDATA=`pwd`
237 | ```
238 | 
239 | ## usage
240 | 
241 | The pipeline consists of 3 steps for downloading the data and 4 steps for aligning and calling SNVs:
242 | 
243 | ```bash
244 | # Download
245 | docker run --rm opoirion/ssrge download_soft_file -h
246 | docker run --rm opoirion/ssrge download_sra -h
247 | docker run --rm opoirion/ssrge extract_sra -h
248 | # align and SNV calling
249 | docker run --rm opoirion/ssrge star_index -h
250 | docker run --rm opoirion/ssrge process_star -h
251 | docker run --rm opoirion/ssrge feature_counts -h
252 | docker run --rm opoirion/ssrge process_snv -h
253 | 
254 | ```
255 | 
256 | ## example
257 | 
258 | Let's download and process 2 samples from GSE79457 under the project name test_n2:
259 | 
260 | ```bash
261 | # download the soft file containing the metadata for GSE79457
262 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_soft_file -project_name test_n2 -soft_id GSE79457
263 | # download the sra files
264 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_sra -project_name test_n2 -max_nb_samples 2
265 | # extract the sra files
266 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge extract_sra -project_name test_n2
267 | # rm the sra files (optional)
268 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge rm_sra -project_name test_n2
269 | ## all these data can also be obtained using other, alternative workflows
270 | # here you need to specify which read length to use for creating a STAR index and which reference organism (MOUSE/HUMAN)
271 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge star_index -project_name test_n2 -read_length 100 -cell_type HUMAN
272 | # STAR alignment
273 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge process_star -project_name test_n2 -read_length 100 -cell_type HUMAN
274 | # sample -> gene count matrix
275 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge feature_counts -project_name test_n2
276 | # SNV inference
277 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge process_snv -project_name test_n2 -cell_type HUMAN
278 | ```
279 | 
280 | 
281 | ## contact and credentials
282 | * Developer: Olivier Poirion (PhD)
283 | * contact: opoirion@hawaii.edu
--------------------------------------------------------------------------------
/garmire_SSrGE/multiprocess_fitting.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Queue
2 | from multiprocessing import Process
3 | 
4 | from contextlib import contextmanager
5 | import signal
6 | 
7 | import numpy as np
8 | 
9 | from time import sleep
10 | from sys import stdout
11 | 
12 | from garmire_SSrGE.config import MIN_OBS_FOR_REGRESS
13 | from
garmire_SSrGE.config import TIME_LIMIT 14 | 15 | from collections import Counter 16 | 17 | from numpy import hstack 18 | from scipy.sparse import hstack as shstack 19 | from scipy.sparse import issparse 20 | 21 | 22 | import warnings 23 | 24 | from time import time 25 | 26 | 27 | class TimeoutException(Exception): pass 28 | 29 | 30 | @contextmanager 31 | def time_limit(seconds): 32 | warnings.catch_warnings() 33 | warnings.simplefilter("ignore") 34 | 35 | def signal_handler(signum, frame): 36 | raise TimeoutException("Timed out!") 37 | 38 | signal.signal(signal.SIGALRM, signal_handler) 39 | signal.alarm(seconds) 40 | 41 | try: 42 | yield 43 | finally: 44 | signal.alarm(0) 45 | 46 | 47 | def debug(): 48 | """ 49 | #### DEBUG #### 50 | **** Test function **** 51 | """ 52 | from garmire_SSrGE.examples import create_example_matrix_v3 53 | from sklearn.linear_model import Lasso 54 | 55 | 56 | X, Y, C, W = create_example_matrix_v3() 57 | 58 | multi_test = BatchFitting(I_mat=X, 59 | O_mat=Y.T, 60 | CNV_mat=C, 61 | model=Lasso, 62 | model_params={'alpha': 0.01}, 63 | nb_processes=1, 64 | only_nonzero=False, 65 | min_obs_for_regress=0, 66 | cis_model=None) 67 | g_index, coefs, intercepts = multi_test.run() 68 | 69 | return g_index, coefs, intercepts 70 | 71 | class MultiProcessFitting(Process): 72 | def __init__(self, 73 | input_queue, 74 | output_queue, 75 | model, 76 | model_params, 77 | matrix, 78 | process_id, 79 | CNV_mat=None, 80 | time_limit=TIME_LIMIT, 81 | min_obs_for_regress=MIN_OBS_FOR_REGRESS, 82 | only_nonzero=False, 83 | cis_model=None): 84 | """ """ 85 | Process.__init__(self) 86 | self.input_queue = input_queue 87 | self.output_queue = output_queue 88 | self.model = model 89 | self.model_params = model_params 90 | self.matrix = matrix 91 | self.CNV_mat = CNV_mat 92 | self.process_id = process_id 93 | self.only_nonzero = only_nonzero 94 | self.cis_model = cis_model 95 | self.time_limit = time_limit 96 | self.min_obs_for_regress = min_obs_for_regress 97 | 98 | def run(self): 99 | """ """ 100 | model = self.model(**self.model_params) 101 | 102 | while not self.input_queue.empty(): 103 | try: 104 | gene_i, y, data = self.input_queue.get(True, 0.1) 105 | except Exception as e: 106 | continue 107 | 108 | index = None 109 | 110 | if self.only_nonzero: 111 | matrix, y, index = self._clean_matrix(y) 112 | else: 113 | matrix = self.matrix 114 | 115 | if self.cis_model: 116 | matrix = self._matrix_to_cis_model(matrix, gene_i) 117 | 118 | if self.CNV_mat is not None: 119 | matrix = self._add_cnv(matrix, y, gene_i, index) 120 | 121 | if y.shape[0] > self.min_obs_for_regress and \ 122 | not isinstance(matrix, type(None)): 123 | try: 124 | with time_limit(self.time_limit): 125 | model.fit(X=matrix, y=y, **data) 126 | except Exception as e: 127 | intercept = np.nan 128 | coefs = np.empty(self.matrix.shape[1]) 129 | coefs[:] = np.nan 130 | print('\n exception found for linear model:{0}\n skipping'\ 131 | .format(e)) 132 | else: 133 | if self.cis_model: 134 | coefs = np.zeros(self.matrix.shape[1]) 135 | coefs[self.cis_model[gene_i]] = model.coef_ 136 | else: 137 | coefs = model.coef_ 138 | 139 | intercept = model.intercept_ 140 | 141 | else: 142 | intercept = np.nan 143 | coefs = np.empty(self.matrix.shape[1]) 144 | coefs[:] = np.nan 145 | 146 | coefs = Counter({i:np.abs(coefs[i]) 147 | for i in np.nonzero(np.nan_to_num(coefs))[0]}) 148 | 149 | while True: 150 | try: 151 | self.output_queue.put((gene_i, coefs, intercept), timeout=0.1) 152 | except Exception as e: 153 | continue 154 | else: 155 | 
break 156 | 157 | def _matrix_to_cis_model(self, matrix, gene_i): 158 | """ """ 159 | if not self.cis_model[gene_i]: 160 | return None 161 | return matrix.T[self.cis_model[gene_i]].T 162 | 163 | def _clean_matrix(self, y): 164 | """ """ 165 | index = np.nonzero(y)[0] 166 | return self.matrix[index], y[index], index 167 | 168 | def _add_cnv(self, matrix, y, gene_i, index): 169 | """ """ 170 | stack = shstack if issparse(matrix) else hstack 171 | CNV_mat = self.CNV_mat[index] 172 | 173 | return stack([matrix, CNV_mat.T[gene_i].T.reshape((matrix.shape[0], 1))]) 174 | 175 | 176 | class BatchFitting(): 177 | """ """ 178 | def __init__( 179 | self, 180 | I_mat, 181 | O_mat, 182 | model, 183 | model_params, 184 | CNV_mat=None, 185 | nb_processes=1, 186 | time_limit=TIME_LIMIT, 187 | min_obs_for_regress=MIN_OBS_FOR_REGRESS, 188 | add_y_index=False, 189 | only_nonzero=False, 190 | cis_model=None, 191 | ): 192 | self.I_mat = I_mat 193 | self.O_mat = O_mat 194 | self.CNV_mat = CNV_mat 195 | self.model = model 196 | self.model_params = model_params 197 | self.nb_processes = nb_processes 198 | self.add_y_index = add_y_index 199 | self.only_nonzero = only_nonzero 200 | self.cis_model = cis_model 201 | self.time_limit = time_limit 202 | self.min_obs_for_regress = min_obs_for_regress 203 | 204 | def run(self): 205 | """ run batch fitting """ 206 | 207 | res = self._run() 208 | 209 | if isinstance(res, Exception): 210 | raise res 211 | 212 | return res 213 | 214 | def _kill_processes(self): 215 | """ """ 216 | for process in self.processes_list: 217 | process.terminate() 218 | 219 | def _get_qsize(self, output_queue): 220 | """ """ 221 | while True: 222 | try: 223 | with time_limit(self.time_limit): 224 | try: 225 | out_qsize = output_queue.qsize() 226 | except NotImplementedError: 227 | out_qsize = None 228 | break 229 | except Exception as e: 230 | print('exception was found for qsize:', e) 231 | continue 232 | 233 | return out_qsize 234 | 235 | def _run(self): 236 | """custom unordered multiprocessing""" 237 | input_queue = Queue() 238 | output_queue = Queue() 239 | self.processes_list = [] 240 | res_list = [] 241 | qsize = self.O_mat.shape[0] 242 | i = 0 243 | 244 | for y in self.O_mat: 245 | data = {} 246 | if self.add_y_index: 247 | data['y_index'] = i 248 | input_queue.put((i, y, data)) 249 | i += 1 250 | 251 | for i in range(self.nb_processes): 252 | self.processes_list.append( 253 | MultiProcessFitting( 254 | input_queue=input_queue, 255 | output_queue=output_queue, 256 | model=self.model, 257 | model_params=self.model_params, 258 | matrix=self.I_mat, 259 | CNV_mat=self.CNV_mat, 260 | process_id=i, 261 | time_limit=self.time_limit, 262 | only_nonzero=self.only_nonzero, 263 | min_obs_for_regress=self.min_obs_for_regress, 264 | cis_model=self.cis_model) 265 | ) 266 | 267 | for process in self.processes_list: 268 | process.start() 269 | 270 | terminate = False 271 | 272 | j = 0 273 | prog = ['/', '-', '\\', '|'] 274 | 275 | while True: 276 | for process in self.processes_list: 277 | if process.exitcode: 278 | print('error with process with id: {0} terminating'\ 279 | .format(process.process_id)) 280 | 281 | terminate = True 282 | break 283 | 284 | if terminate: 285 | break 286 | 287 | out_qsize = self._get_qsize(output_queue) 288 | 289 | if out_qsize is not None: 290 | stdout.write('\r{0} / {1} models done {2}'\ 291 | .format(out_qsize, qsize, prog[j])) 292 | else: 293 | stdout.write('\r Multithreqding queue not implemented for this OS'\ 294 | ' cannot give an estimation of the models computed') 295 
| 296 | if input_queue.empty(): 297 | sleep(self.time_limit) 298 | break 299 | 300 | stdout.flush() 301 | 302 | j += 1 303 | 304 | if j == 4: 305 | j = 0 306 | 307 | if out_qsize >= qsize or out_qsize == True: 308 | break 309 | 310 | sleep(0.5) 311 | 312 | if terminate: 313 | print('one of the process raised an exception'\ 314 | '\n killing process...') 315 | self._kill_processes() 316 | 317 | return Exception('process not finished correctly!') 318 | 319 | print('\n') 320 | 321 | for i in range(qsize): 322 | res_list.append(output_queue.get()) 323 | stdout.write('\r{0} / {1} results loaded'\ 324 | .format(i + 1, qsize)) 325 | stdout.flush() 326 | 327 | del output_queue 328 | del input_queue 329 | 330 | self._kill_processes() 331 | 332 | return zip(*res_list) 333 | 334 | 335 | if __name__ == "__main__": 336 | debug() 337 | -------------------------------------------------------------------------------- /garmire_SNV_calling/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | config file for SNV calling pipeline 3 | 4 | """ 5 | 6 | from os.path import split as pathsplit 7 | from argparse import ArgumentParser 8 | 9 | ARGPARSER = ArgumentParser(description='Argument for the SNV pipeline', 10 | prefix_chars='-') 11 | 12 | ARGPARSER.add_argument('-project_name', 13 | help='name of the project folder and where to find the fastq files (default: sample_test)', 14 | default="sample_test", 15 | metavar='str') 16 | 17 | ARGPARSER.add_argument('-cell_type', 18 | help=' (HUMAN/MOUSE) default: HUMAN', 19 | default="HUMAN", 20 | metavar='str') 21 | 22 | ARGPARSER.add_argument('-read_length', 23 | help=' star index read length (default: 51)', 24 | default=51, 25 | type=int, 26 | metavar='int') 27 | 28 | ARGPARSER.add_argument('-star_nb_threads', 29 | help=' number of threads for STAR analysis (default 12)', 30 | default=12, 31 | type=int, 32 | metavar='int') 33 | 34 | ARGPARSER.add_argument('-snv_nb_threads', 35 | help=' number of SNV calling pipelines executed in parallel (default 3)', 36 | default=3, 37 | type=int, 38 | metavar='int') 39 | 40 | ARGS = ARGPARSER.parse_known_args()[0] 41 | 42 | # Project name. Used to create folder 43 | PROJECT_NAME = ARGS.project_name 44 | # type of the dataset (human or mouse). Used to select reference genomes 45 | CELL_TYPE = ARGS.cell_type 46 | # valid sequencing machine for picard tools: 47 | # ILLUMINA, SLX, SOLEXA, SOLID, 454, LS454, COMPLETE, PACBIO, 48 | # IONTORRE NT, CAPILLARY, HELICOS, UNKNOWN 49 | PLATEFORM = 'ILLUMINA' 50 | # Read length used to create star index for reference genome 51 | STAR_INDEX_READ_LENGTH = ARGS.read_length 52 | 53 | ############ FOLDER ARCHITECTURE #################################### 54 | #Alias to define the GLOBAL_DATA_ROOT, OUTPUT_ROOT and PROG_ROOT 55 | # (could be overloaded using reference paths) 56 | USER = 'opoirion' 57 | # Alias to define the root folder for reference data 58 | # (could be overloaded using reference paths) 59 | GLOBAL_DATA_ROOT = '/data/' 60 | # Alias to define the output folder 61 | OUTPUT_ROOT = '/data/results/' 62 | # Alias to define the folder containing softwares. 
63 | # (could be overloaded using reference paths)
64 | PROG_ROOT = '/prog/'
65 | # Absolute path for the .soft file (dataset description) from NCBI
66 | SOFT_PATH = "{0}/{1}/{1}.soft".format(GLOBAL_DATA_ROOT, PROJECT_NAME)
67 | ######################################################################
68 | 
69 | ############ STANDARD VARIABLES ######################################
70 | TYPE_VAR = {
71 |     'HUMAN': {
72 |         # gtf file containing annotated human genes
73 |         'ANNOTATION_PATH': "/data/Illumina_hg19/Annotation/genes.gtf",
74 |         # folder which will contain the STAR index for the human genome
75 |         'STAR_INDEX_PATH': "{0}/Illumina_hg19/Sequences/STARindex".format(OUTPUT_ROOT),
76 |         # folder which will contain the BSSEQ index for the human genome
77 |         'BSSEQ_INDEX_PATH': "{0}/Illumina_hg19/Sequences/BSSEQindex".format(OUTPUT_ROOT),
78 |         # human reference fasta (.fa) file
79 |         'REF_GENOME': "/data/Illumina_hg19/Sequences/WholeGenomeFasta/genome.fa",
80 |         # Reference human genome used
81 |         'ORGANISM': 'hg19',
82 |         # reference variant database used. The latest version can be downloaded from:
83 |         # ftp://ftp.ncbi.nih.gov/snp/organisms/ (human_9606_b{version}_p2)
84 |         'DBSNP': "/data/Illumina_hg19/vcf/dbsnp_138.hg19.reduced.vcf",
85 |         'VCF_RESOURCES': [
86 |             # Other reference variant resources.
87 |             # Can be downloaded from ftp://ftp.broadinstitute.org/bundle/2.8/hg19
88 |             # "/data/hg19/vcf/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf",
89 |             # Indel variant reference database
90 |             # can be downloaded from ftp://ftp.broadinstitute.org/bundle/2.8/hg19
91 |             # "/data/hg19/vcf/1000G_phase1.indels.hg19.sites.vcf",
92 |         ]
93 |     },
94 |     'MOUSE': {
95 |         # gtf file containing annotated mouse genes
96 |         'ANNOTATION_PATH': "/data/Mus_musculus/UCSC/mm10/Annotation/genes.gtf",
97 |         # folder which will contain the STAR index for the mouse genome
98 |         'STAR_INDEX_PATH': "{0}/Mus_musculus/UCSC/mm10/Sequence/STARindex".format(OUTPUT_ROOT),
99 |         # folder which will contain the BS-SEQ index for the mouse genome
100 |         'BSSEQ_INDEX_PATH': "{0}/Mus_musculus/UCSC/mm10/Sequence/BSSEQindex".format(OUTPUT_ROOT),
101 |         # Mouse reference fasta (.fa) file
102 |         'REF_GENOME': "/data/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa",
103 |         # Reference mouse genome used
104 |         'ORGANISM': 'mm10',
105 |         # reference variant database used. This version can be downloaded from:
106 |         # ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/
107 |         'DBSNP': "/data/Mus_musculus/UCSC/mm10/vcf/mgp.v3.snps.rsIDdbSNPv137_ordered.reduced.vcf",
108 |         # reference indel variant database used. This version can be downloaded from:
109 |         # ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/
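# NOTE: the organism is selected at runtime with the -cell_type flag
# (CELL_TYPE = ARGS.cell_type above); a new reference genome can be added by
# registering another TYPE_VAR entry with the same keys. Sketch only; the
# paths below are hypothetical placeholders, not shipped defaults:
#
# TYPE_VAR['HUMAN_HG38'] = {
#     'ANNOTATION_PATH': '/data/hg38/Annotation/genes.gtf',
#     'STAR_INDEX_PATH': '{0}/hg38/Sequence/STARindex'.format(OUTPUT_ROOT),
#     'BSSEQ_INDEX_PATH': '{0}/hg38/Sequence/BSSEQindex'.format(OUTPUT_ROOT),
#     'REF_GENOME': '/data/hg38/Sequence/WholeGenomeFasta/genome.fa',
#     'ORGANISM': 'hg38',
#     'DBSNP': '/data/hg38/vcf/dbsnp.vcf',
#     'VCF_RESOURCES': [],
# }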
110 |         # (mouse VCF files must be sorted according to the sequence dictionary of the mouse reference genome, using the SortVcf tool from picard-tools)
111 |         'VCF_RESOURCES': [
112 |             # "/data/mm10/vcf/mgp.v3.indels.rsIDdbSNPv137_ordered.vcf"
113 |         ]
114 |     }
115 | }
116 | ######################################################################
117 | 
118 | ############ MOUSE / HUMAN ###########################################
119 | REF_GENOME = TYPE_VAR[CELL_TYPE]['REF_GENOME']
120 | ANNOTATION_PATH = TYPE_VAR[CELL_TYPE]['ANNOTATION_PATH']
121 | STAR_INDEX_PATH = TYPE_VAR[CELL_TYPE]['STAR_INDEX_PATH']
122 | ORGANISM = TYPE_VAR[CELL_TYPE]['ORGANISM']
123 | DBSNP = TYPE_VAR[CELL_TYPE]['DBSNP']
124 | VCF_RESOURCES = TYPE_VAR[CELL_TYPE]['VCF_RESOURCES']
125 | BSSEQ_INDEX_PATH = TYPE_VAR[CELL_TYPE]['BSSEQ_INDEX_PATH']
126 | ######################################################################
127 | 
128 | ############# DATASET ################################################
129 | # Absolute path for fastq files.
130 | # Fastq files must be organised as one folder per SRX experiment
131 | FASTQ_PATH = "{0}/{1}/fastq/".format(OUTPUT_ROOT, PROJECT_NAME)
132 | # output path
133 | PATH_OUTPUT = "{0}/{1}/".format(OUTPUT_ROOT, PROJECT_NAME)
134 | # specific string pattern that a folder name must match
135 | SPECIFIC_FILENAME_PATTERN = ""
136 | ######################################################################
137 | 
138 | ################ RRBS reads specific PREPROCESSING ###################
139 | # Aligner used (STAR for gene expression reads;
140 | # bismark / BS-seeker2 for RRBS read alignment)
141 | USED_ALIGNER = 'STAR'
142 | # Are the reads from the bisulfite pipeline for SNV calling?
143 | ARE_READS_BISULFITE = False
144 | # specific trimming preprocessing for RRBS reads
145 | DO_TRIMGALORE = True
146 | ######################################################################
147 | 
148 | ######################## MONOVAR #####################################
149 | MONOVAR_REP = '{0}/monovar/'.format(PROG_ROOT)
150 | MONOVAR_SAMTOOLS = '{0}/external/samtools/samtools'.format(MONOVAR_REP)
151 | ######################################################################
152 | 
153 | ############# SOFTWARE ###############################################
154 | # Available java version.
Must be > 1.8 155 | JAVA = "java" 156 | # Max memory used by Java 157 | JAVA_MEM = "-Xmx110g" 158 | # GATK folder where can be found GATK software 159 | GATK_DIR = "{0}/GATK/".format(PROG_ROOT) 160 | # GATK jar name 161 | GATK_JAR = "GenomeAnalysisTK.jar" 162 | # picard-tools software 163 | PICARD_DIR = "{0}/picard-tools-2.1.1/".format(PROG_ROOT) 164 | # Perl 165 | PERL = 'perl' 166 | # python 167 | PYTHON = 'python' 168 | #BOWTIE ALIGNER (for BSSEEKER and bismark) 169 | BOWTIE_REP = '/usr/bin/' 170 | # software for RRBS bisulfite reads preprocessing 171 | TRIMGALORE_REP = '{0}/TrimGalore/'.format(PROG_ROOT) 172 | # BSseeker2 software to call methylation reads 173 | BSSEEKER2_REP = '{0}/BSseeker2/'.format(PROG_ROOT) 174 | # BS-Snper (SNP calling for bisulfite reads) 175 | BSSNPER = '{0}/BS-Snper/BS-Snper.pl'.format(PROG_ROOT) 176 | # bismark software for RRBS alignment 177 | BISMARK_SOFTWARE = '{0}/Bismark/bismark'.format(PROG_ROOT) 178 | # STAR aligner software 179 | PATH_STAR_SOFTWARE = "{0}/STAR/bin/Linux_x86_64_static/STAR"\ 180 | .format(PROG_ROOT) 181 | # fastqc software [OPTIONAL] 182 | FASTQC = "fastqc" 183 | # snpEff software (vcf annotation) [OPTIONAL] 184 | SNPEFF = '{0}/snpEff/snpEff.jar'.format(PROG_ROOT) 185 | # required snpEff databases (vcf annotation) [OPTIONAL] 186 | SNPEFF_DICT = {'MOUSE': 'GRCm38.82', 187 | 'HUMAN': 'GRCh37.75'} 188 | SNPEFF_DB = SNPEFF_DICT[CELL_TYPE] 189 | # SAMtools 190 | SAMTOOLS = '{0}/samtools-1.5/bin/samtools'.format(PROG_ROOT) 191 | ###################################################################### 192 | 193 | ############# STAR ################################################# 194 | # Number of threads used when using STAR aligner 195 | STAR_THREADS = ARGS.star_nb_threads 196 | # output path for STAR results 197 | OUTPUT_PATH_STAR = PATH_OUTPUT + "/star/" 198 | ##################################################################### 199 | 200 | ############ SNV CALLING PIPELINE ################################### 201 | # output path for SNVs inferred 202 | OUTPUT_PATH_GATK = PATH_OUTPUT + '/snv_pipeline_GATK/' 203 | # Number of SNV calling processes launched in parallel 204 | NB_PROCESS_SNV = ARGS.snv_nb_threads 205 | #################################################################### 206 | 207 | ########### FREEBAYES SNV CALLING PIPELINE ########################## 208 | OUTPUT_PATH_FREEBAYES = PATH_OUTPUT + '/snv_pipeline_freebayes/' 209 | PATH_OPOSSUM = '{0}/Opossum/'.format(PROG_ROOT) 210 | PATH_FREEBAYES = '{0}/freebayes/bin/freebayes'.format(PROG_ROOT) 211 | #################################################################### 212 | 213 | ############ COMPUTE DISTANCE MATRIX [OPTIONAL] ############################## 214 | # software to infer gene expressions count with raw count for each single cells 215 | # [OPTIONAL] 216 | FEATURE_COUNT = "featureCounts" 217 | # path for gene expression matrices [OPTIONAL] 218 | MATRIX_OUTPUT_PATH = "{0}/{1}/expression_profile/"\ 219 | .format(OUTPUT_ROOT, PROJECT_NAME) 220 | ############################################################################### 221 | 222 | 223 | ######################## SNV SIMULATION ####################################### 224 | SIMULATED_REF_GENOME = None 225 | 226 | if SIMULATED_REF_GENOME: 227 | MUTATION_FILE = '{0}/Simulated{1}Mut/sim_snv.bed'.format( 228 | pathsplit(pathsplit(REF_GENOME)[0])[0], SIMULATED_REF_GENOME) 229 | SEQUENCES_PATH = pathsplit(pathsplit(REF_GENOME)[0])[0] 230 | SIM_GENOME_DIR = '{0}/Simulated{1}Mut/'.format(SEQUENCES_PATH, SIMULATED_REF_GENOME) 231 
| REF_GENOME_ORIGINAL = REF_GENOME[:] 232 | REF_GENOME = '{0}/genome.fa'.format(SIM_GENOME_DIR) 233 | else: 234 | MUTATION_FILE = None 235 | SEQUENCES_PATH = None 236 | SIM_GENOME_DIR = None 237 | REF_GENOME_ORIGINAL = None 238 | ############################################################################### 239 | -------------------------------------------------------------------------------- /garmire_SSrGE/ssrge.py: -------------------------------------------------------------------------------- 1 | from garmire_SSrGE.multiprocess_fitting import BatchFitting 2 | 3 | from garmire_SSrGE.config import TIME_LIMIT 4 | from garmire_SSrGE.config import MIN_OBS_FOR_REGRESS 5 | from garmire_SSrGE.config import NB_THREADS 6 | 7 | from sklearn.linear_model import Lasso 8 | from sklearn.linear_model import ElasticNet 9 | from sklearn.linear_model import OrthogonalMatchingPursuit 10 | from sklearn.linear_model import LassoLars 11 | from sklearn.linear_model import LassoCV 12 | 13 | from sklearn.metrics import median_absolute_error 14 | 15 | from scipy.stats import fisher_exact 16 | 17 | from collections import Counter 18 | from collections import defaultdict 19 | 20 | from scipy.sparse import issparse 21 | 22 | from warnings import warn 23 | 24 | from sys import stdout 25 | 26 | import numpy as np 27 | 28 | 29 | def debug(): 30 | """ 31 | #### DEBUG #### 32 | 33 | **** Test function **** 34 | 35 | """ 36 | from garmire_SSrGE.examples import create_example_matrix_v4 37 | 38 | X, Y, C, ge_list, s_list = create_example_matrix_v4() 39 | 40 | ssrge = SSrGE(snv_id_list=s_list, 41 | gene_id_list=ge_list, 42 | nb_ranked_features=3, 43 | alpha=0.01) 44 | 45 | ssrge.fit_transform(X, Y, C) 46 | ssrge.score(X, Y) 47 | 48 | print(ssrge.retained_snvs) 49 | print(ssrge.retained_genes) 50 | 51 | ssrge = SSrGE(nb_ranked_features=2, 52 | alpha=0.01) 53 | 54 | ssrge.fit_transform(X, Y, C) 55 | 56 | ssrge.score(X,Y) 57 | print(ssrge.retained_snvs) 58 | print(ssrge.retained_genes) 59 | 60 | 61 | class SSrGE(): 62 | """ 63 | Class to perform the SSrGE (Sparse SNV inference to reflect Gene Expression) 64 | """ 65 | def __init__( 66 | self, 67 | snv_id_list=[], 68 | gene_id_list=[], 69 | nb_ranked_features=None, 70 | time_limit=TIME_LIMIT, 71 | min_obs_for_regress=MIN_OBS_FOR_REGRESS, 72 | nb_threads=NB_THREADS, 73 | model='LASSO', 74 | model_params=None, 75 | alpha=0.1, 76 | n_alphas=50, 77 | l1_ratio=0.5, 78 | verbose=True, 79 | **kwargs): 80 | """ 81 | input: 82 | :gene_id_list: list of genes ids 83 | :snv_id_list: list(tuple) the gene ids corresponds 84 | to the gene where the given 85 | svn is found 86 | :nb_ranked_features: int top ranked features (snvs and genes) to keep 87 | :n_alphas: number of alphas to use if model == LassoCV (see sklearn doc for LassoCV) 88 | """ 89 | self.retained_genes = [] 90 | self.retained_snvs = [] 91 | self._do_rank_genes = False 92 | self._snv_ids_given = False 93 | self.snv_index = None 94 | self.gene_index = None 95 | self.snv_id_dict = None 96 | self.gene_id_dict = None 97 | 98 | self._cnv_used = None 99 | self.cnv_score = defaultdict(float) 100 | 101 | self.nb_ranked_features = nb_ranked_features 102 | 103 | if list(snv_id_list): 104 | try: 105 | assert(all(len(snv) == 2 for snv in snv_id_list)) 106 | except Exception: 107 | warn('snv_id_list given but not conform and cannot be used.'\ 108 | 'to rank gene_id_list.'\ 109 | '\ncorrect format: :snv_id_list: list(tuple) ') 110 | else: 111 | self._do_rank_genes = True 112 | self._snv_ids_given = True 113 | 114 | 
self._create_dicts(snv_id_list, gene_id_list) 115 | 116 | self.snvs_ranked = [] # list of tupe (snv, score) 117 | self.genes_ranked = [] # list of tupe (gene, score) 118 | 119 | self.gene_weights = None 120 | 121 | self.time_limit = time_limit 122 | self.min_obs_for_regress = min_obs_for_regress 123 | self.nb_threads = nb_threads 124 | 125 | self.eeSNV_weight = None # total eeSNV absolute weight 126 | self.SNV_mat = None # fitted SNV_mat 127 | self.GE_mat = None # fitted GE_mat 128 | self.SNV_mat_shape = None # dim of the fitted SNV_mat 129 | self.GE_mat_shape = None # dim of the GE_mat used as predicat 130 | self.eeSNV_index = None # eeSNV index 131 | self.intercepts = None # Intercepts for non null model: {index gene: intercept value} 132 | self.coefs = None # coefs for non null model: {index gene: coefs dict} 133 | self.verbose = verbose # whether to print ranking results into the terminal 134 | self.eeSNV_CIS_score = defaultdict(float) 135 | self.gene_CIS_score = defaultdict(float) 136 | self._model_type = None 137 | self.alpha = alpha 138 | self.n_alphas = n_alphas 139 | 140 | if model == 'LASSO': 141 | self._model_type = model 142 | self.model = Lasso 143 | self.model_params = { 144 | 'alpha': alpha, 145 | 'max_iter': 1000, 146 | } 147 | 148 | elif model == 'OMP': 149 | self._model_type = model 150 | self.model = OrthogonalMatchingPursuit 151 | self.model_params = {'n_nonzero_coefs': alpha} 152 | 153 | elif model == 'LassoLars': 154 | self._model_type = model 155 | self.model = LassoLars 156 | self.model_params = { 157 | 'alpha': alpha, 158 | 'max_iter': 1000,} 159 | 160 | elif model == 'LassoCV': 161 | self._model_type = model 162 | self.model = LassoCV 163 | self.model_params = { 164 | 'n_alphas': n_alphas} 165 | 166 | elif model == 'ElasticNet': 167 | self.model = ElasticNet 168 | self.model_params = { 169 | 'alpha': alpha, 170 | 'l1_ratio': l1_ratio, 171 | 'max_iter': 1000, 172 | } 173 | else: 174 | self.model = model 175 | self.model_params = model_params 176 | 177 | def _create_dicts(self, snv_id_list, gene_id_list): 178 | """ """ 179 | self.snv_index = dict(enumerate(snv_id_list)) 180 | self.gene_index = dict(enumerate(gene_id_list)) 181 | 182 | self.snv_id_dict = {name: pos 183 | for pos, name in self.snv_index.items()} 184 | self.gene_id_dict = defaultdict(str, {name: pos 185 | for pos, name in self.gene_index.items()}) 186 | 187 | def fit(self, SNV_mat, GE_mat, CNV_mat=None, to_dense=False): 188 | """ 189 | infer eeSNV by fitting sparse linear models using SNV as features 190 | and gene expression as objectives 191 | 192 | input: 193 | :SNV_mat: (n_samples x n_SNVs) matrix (binary). 
187 | def fit(self, SNV_mat, GE_mat, CNV_mat=None, to_dense=False): 188 | """ 189 | infer eeSNVs by fitting sparse linear models using SNVs as features 190 | and gene expression as objectives 191 | 192 | input: 193 | :SNV_mat: (n_samples x n_SNVs) matrix (binary). Matrix can be sparse 194 | :GE_mat: (n_genes x n_samples) matrix (float values); an (n_samples x n_genes) matrix is transposed automatically 195 | :to_dense: Bool; if True SNV_mat is converted to a dense array 196 | 197 | return: 198 | None; the fitted models and rankings are stored on the instance. 199 | 200 | use transform / fit_transform to obtain the reduced matrix: 201 | :eeSNV_mat: (n_samples x n_eeSNVs) matrix (binary) (n_eeSNVs < n_SNVs) 202 | """ 203 | if GE_mat.shape[0] == SNV_mat.shape[0] and \ 204 | GE_mat.shape[1] != SNV_mat.shape[1]: 205 | GE_mat = GE_mat.T 206 | 207 | self.SNV_mat_shape = SNV_mat.shape 208 | self.GE_mat_shape = GE_mat.shape 209 | 210 | if self._model_type == 'OMP' and self.alpha and \ 211 | 0.0 < self.alpha < 1.0: 212 | self.model_params['n_nonzero_coefs'] = int(np.floor( 213 | SNV_mat.shape[0] * self.alpha)) 214 | 215 | if not self._snv_ids_given: 216 | self._create_dicts(range(self.SNV_mat_shape[1]), 217 | range(self.GE_mat_shape[0])) 218 | 219 | assert(self.SNV_mat_shape[0] == self.GE_mat_shape[1]) 220 | # i.e. the SNV matrix rows and the GE matrix columns both index the samples 221 | 222 | if issparse(GE_mat): 223 | GE_mat = GE_mat.todense() 224 | 225 | if (to_dense or \ 226 | self._model_type == 'OMP' or \ 227 | self._model_type == 'LassoLars' ) \ 228 | and issparse(SNV_mat): 229 | SNV_mat = SNV_mat.todense() 230 | 231 | if isinstance(GE_mat, np.matrix): 232 | GE_mat = np.array(GE_mat) 233 | 234 | self._cnv_used = CNV_mat is not None 235 | 236 | g_index, coefs, intercepts = BatchFitting( 237 | I_mat=SNV_mat, 238 | O_mat=GE_mat, 239 | CNV_mat=CNV_mat, 240 | model=self.model, 241 | model_params=self.model_params, 242 | nb_processes=self.nb_threads, 243 | time_limit=self.time_limit, 244 | min_obs_for_regress=self.min_obs_for_regress, 245 | only_nonzero=True).run() 246 | 247 | self._process_computed_coefs(coefs, g_index, intercepts) 248 | self._rank_eeSNVs() 249 | 250 | if self._do_rank_genes: 251 | self._rank_genes() 252 | 253 | self.select_top_ranked_features() 254 | 255 | self.SNV_mat = SNV_mat 256 | self.GE_mat = GE_mat 257 | 258 | def select_top_ranked_features(self, nb_ranked_features=None): 259 | """keep only the nb_ranked_features top-ranked genes and snvs""" 260 | if not nb_ranked_features: 261 | nb_ranked_features = self.nb_ranked_features 262 | 263 | self.retained_genes = [gene for gene, score in 264 | self.genes_ranked[:nb_ranked_features]] 265 | self.retained_snvs = [snv for snv, score in 266 | self.snvs_ranked[:nb_ranked_features]] 267 | 268 | def transform(self, SNV_mat): 269 | """ 270 | extract the retained eeSNV columns from SNV_mat 271 | input: 272 | :SNV_mat: Matrix (len(samples), len(SNV)) 273 | return: 274 | :eeSNV_mat: Matrix (len(samples), len(eeSNV)) 275 | """ 276 | return SNV_mat.T[[self.snv_id_dict[snv] for snv in self.retained_snvs]].T 277 | 278 | def fit_transform(self, SNV_mat, GE_mat, CNV_mat=None, to_dense=False): 279 | """ 280 | Combination of fit and transform functions 281 | """ 282 | self.fit(SNV_mat, GE_mat, CNV_mat=CNV_mat, to_dense=to_dense) 283 | return self.transform(SNV_mat) 284 | 285 | def _process_computed_coefs(self, coefs, g_index, intercepts): 286 | """ 287 | instantiate weight coefs and eeSNV indexes 288 | 289 | input: 290 | :coefs: list of {snv index: coef} dicts, one per fitted gene 291 | """ 292 | if self.verbose: 293 | print('\nprocess computed coefs....') 294 | 295 | self.eeSNV_index = list(set([key for coef in coefs for key in coef.keys()])) 296 | self.eeSNV_index = {self.eeSNV_index[i]: i for i in range(len(self.eeSNV_index))} 297 | 298 | self.eeSNV_weight = defaultdict(float) 299 | self.eeSNV_CIS_score = defaultdict(float) 300 | 301 | self.intercepts = {} 302 | self.coefs = {} 303 | 304 | i = 0 305 | length = len(coefs)
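# the aggregation loop below merges the per-gene models: eeSNV_weight accumulates
# each SNV's coefficient across all gene models, cnv_score captures the optional
# CNV column, and eeSNV_CIS_score tracks the weight contributed by SNVs located
# in the very gene they predict (normalized at the end of the method)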
306 | 307 | for counter, gene, intercept in zip(coefs, g_index, intercepts): 308 | gene_name = self.gene_index[gene] if self.gene_index else gene 309 | 310 | for key, count in counter.items(): 311 | if self._cnv_used and key == self.SNV_mat_shape[1]: 312 | self.cnv_score[gene_name] = count 313 | continue 314 | 315 | self.eeSNV_weight[key] += count 316 | 317 | if self._snv_ids_given: 318 | genename, pos = self.snv_index[key] 319 | 320 | if genename == self.gene_index[gene]: 321 | self.eeSNV_CIS_score[self.snv_index[key]] += count 322 | 323 | if counter: 324 | self.intercepts[gene] = intercept 325 | self.coefs[gene] = counter 326 | 327 | if self.cnv_score[gene_name]: 328 | self.coefs[gene].pop(self.SNV_mat_shape[1]) 329 | 330 | i += 1 331 | 332 | stdout.write('\r {0:.2f} / 100'.format(100.0 * i / length)) 333 | stdout.flush() 334 | 335 | for snv in self.eeSNV_CIS_score: 336 | self.eeSNV_CIS_score[snv] /= self.eeSNV_weight[self.snv_id_dict[snv]] 337 | 338 | print('\n') 339 | 340 | def score(self, SNV_mat, GE_mat): 341 | """ 342 | Return the mean of the median absolute error of the GE predictions, using 343 | GE_mat as target and SNV_mat as features. SNV_mat and GE_mat should have the 344 | same number of SNVs and genes as the fitted models, respectively 345 | 346 | input: 347 | :SNV_mat: (n_samples x n_SNVs) matrix (binary). Matrix can be sparse 348 | :GE_mat: (n_GE x n_samples) matrix (float value) 349 | 350 | return: 351 | :err_models: float mean of the median absolute errors for the models 352 | :err_null_models: float mean of the median absolute errors for the null models (only intercepts) 353 | """ 354 | if GE_mat.shape[0] == SNV_mat.shape[0] and \ 355 | GE_mat.shape[1] != SNV_mat.shape[1]: 356 | GE_mat = GE_mat.T 357 | 358 | assert(SNV_mat.shape[1] == self.SNV_mat_shape[1]) 359 | assert(GE_mat.shape[0] == self.GE_mat_shape[0]) 360 | 361 | errs_model = [] 362 | errs_null_model = [] 363 | 364 | if issparse(GE_mat): 365 | GE_mat = GE_mat.todense() 366 | 367 | if isinstance(GE_mat, np.matrix): 368 | GE_mat = np.array(GE_mat) 369 | 370 | if issparse(SNV_mat): 371 | SNV_mat = SNV_mat.todense() 372 | 373 | if isinstance(SNV_mat, np.matrix): 374 | SNV_mat = np.array(SNV_mat) 375 | 376 | for non_null_gene in self.coefs: 377 | non_zero = np.nonzero(GE_mat[non_null_gene])[0] 378 | 379 | if not len(non_zero): 380 | continue 381 | 382 | Y_test = GE_mat[non_null_gene][non_zero] 383 | coef = np.zeros(self.SNV_mat_shape[1]) 384 | coef[list(self.coefs[non_null_gene].keys())] = [self.coefs[non_null_gene][k] 385 | for k in self.coefs[non_null_gene]] 386 | Y_inferred = np.asarray(SNV_mat[non_zero] * np.matrix(coef).T).T[0] \ 387 | + self.intercepts[non_null_gene] 388 | 389 | Y_null_inferred = np.ones(Y_test.shape[0]) * self.intercepts[non_null_gene] 390 | 391 | score = median_absolute_error(Y_inferred, Y_test) 392 | score_null = median_absolute_error(Y_null_inferred, Y_test) 393 | 394 | errs_model.append(score) 395 | errs_null_model.append(score_null) 396 | 397 | return np.mean(errs_model), np.mean(errs_null_model) 398 | 399 | def rank_eeSNVs(self): 400 | """ 401 | return the eeSNVs ranked according to their inferred coefs 402 | """ 403 | return self.snvs_ranked 404 | 405 | def _rank_eeSNVs(self): 406 | """ 407 | rank eeSNVs according to their inferred coefs 408 | """ 409 | 410 | self.snvs_ranked = [] 411 | 412 | ranked_snv = sorted(self.eeSNV_weight.items(), 413 | key=lambda x: x[1], 414 | reverse=True) 415 | 416 | for snv_i, score in ranked_snv: 417 | self.snvs_ranked.append((self.snv_index[snv_i], score)) 418 | 419 | return self.snvs_ranked 420 | 421 | def rank_genes(self): 422 | """ 423 |
return the genes ranked according to their inferred coefs 424 | """ 425 | return self.genes_ranked 426 | 427 | def _rank_genes(self): 428 | """ 429 | rank genes according to the inferred coefs of the eeSNVs located within them 430 | """ 431 | self.gene_weights = defaultdict(float) 432 | self.gene_CIS_score = defaultdict(float) 433 | 434 | for snv_i, score in self.eeSNV_weight.items(): 435 | snv = self.snv_index[snv_i] 436 | gene, pos = snv 437 | self.gene_weights[gene] += score 438 | 439 | if self._snv_ids_given: 440 | self.gene_CIS_score[gene] += self.eeSNV_CIS_score[snv] * score 441 | 442 | for gene in self.gene_CIS_score: 443 | self.gene_CIS_score[gene] /= self.gene_weights[gene] 444 | 445 | self.genes_ranked = sorted(self.gene_weights.items(), 446 | key=lambda x: x[1], 447 | reverse=True) 448 | return self.genes_ranked 449 | 450 | def rank_features_for_a_subgroup(self, sample_id_list): 451 | """ 452 | Rank the eeSNVs and the genes for a given subgroup of samples 453 | 454 | input: 455 | :sample_id_list: ids of the samples of interest 456 | example: [1, 5, 10] => group with samples 1, 5 and 10 457 | output: 458 | :SubGroupData: data container with features specific to the subgroup 459 | """ 460 | gene_weights_list = defaultdict(Counter) 461 | snv_weights_list = defaultdict(Counter) 462 | exp_gene_weights_list = defaultdict(Counter) 463 | exp_snv_weights_list = defaultdict(Counter) 464 | 465 | sample_id_comp = list(set( 466 | range(self.SNV_mat.shape[0])).difference(sample_id_list)) 467 | 468 | SNV_mat_sub = self.SNV_mat[sample_id_list].todense() 469 | SNV_mat_comp = self.SNV_mat[sample_id_comp].todense() 470 | GE_mat_sub = self.GE_mat.T[sample_id_list] 471 | GE_mat_comp = self.GE_mat.T[sample_id_comp] 472 | 473 | for key in self.eeSNV_weight: 474 | SNV_mat_sub.T[key] *= self.eeSNV_weight[key] 475 | 476 | subgroup = SubGroupData() 477 | 478 | for index, gene in self.gene_index.items(): 479 | subgroup.gene_expr_distrib[gene] = GE_mat_sub.T[index] 480 | 481 | for snv_i, score in self.eeSNV_weight.items(): 482 | snv = self.snv_index[snv_i] 483 | gene, pos = snv 484 | gene_i = self.gene_id_dict[gene] 485 | 486 | for cell_i in range(SNV_mat_sub.shape[0]): 487 | snv_weights_list[cell_i][snv] = SNV_mat_sub[cell_i, snv_i] 488 | 489 | if gene_i != '': 490 | gene_weights_list[cell_i][gene] += SNV_mat_sub[cell_i, snv_i] 491 | 492 | if gene_i != '' and GE_mat_sub[cell_i, gene_i]: 493 | exp_snv_weights_list[cell_i][snv] = SNV_mat_sub[cell_i, snv_i] 494 | exp_gene_weights_list[cell_i][gene] += SNV_mat_sub[cell_i, snv_i] 495 | 496 | if gene_i: 497 | index_cells_comp = np.nonzero(GE_mat_comp.T[gene_i])[0] 498 | subgroup.exp_snv_distrib_comp[snv] = np.array( 499 | SNV_mat_comp.T[snv_i, index_cells_comp]) 500 | 501 | for cell_i in snv_weights_list: 502 | 503 | for gene in gene_weights_list[cell_i]: 504 | subgroup.gene_weights_distrib[gene].append( 505 | gene_weights_list[cell_i][gene]) 506 | 507 | for snv in snv_weights_list[cell_i]: 508 | subgroup.snv_weights_distrib[snv].append( 509 | snv_weights_list[cell_i][snv]) 510 | 511 | for cell_i in exp_snv_weights_list: 512 | for gene in exp_gene_weights_list[cell_i]: 513 | subgroup.exp_gene_weights_distrib[gene].append( 514 | exp_gene_weights_list[cell_i][gene]) 515 | 516 | for snv in exp_snv_weights_list[cell_i]: 517 | subgroup.exp_snv_weights_distrib[snv].append( 518 | exp_snv_weights_list[cell_i][snv]) 519 | 520 | subgroup._get_significant_subgroup_features() 521 | 522 | return subgroup 523 | 524 | class SubGroupData(): 525 | """ 526 | class containing data for a given
subgroup of cells 527 | 528 | attributes: 529 | :significant_eeSNVs: list of ranked significant eeSNVs with their score 530 | :significant_genes: list of ranked significant genes with their score 531 | 532 | :gene_expr_distrib: distribution of gene expression for each gene 533 | :gene_weights_distrib: distribution of gene weights for each gene 534 | (according to the eeSNVs) for the subgroup 535 | :snv_weights_distrib: distribution of the eeSNV weights for each eeSNV 536 | :exp_gene_weights_distrib: distribution of gene weights for each gene using only, 537 | for a given gene, the subset of cells expressing the gene 538 | :exp_snv_weights_distrib: distribution of eeSNV weights for each eeSNV using only, 539 | for a given eeSNV, the subset of cells expressing the gene 540 | related to the eeSNV 541 | """ 542 | def __init__(self): 543 | """ """ 544 | self.significant_eeSNVs = [] 545 | self.significant_genes = [] 546 | self.ranked_eeSNVs = [] 547 | self.ranked_genes = [] 548 | 549 | self.gene_expr_distrib = defaultdict(list) 550 | self.gene_weights_distrib = defaultdict(list) 551 | self.snv_weights_distrib = defaultdict(list) 552 | self.exp_gene_weights_distrib = defaultdict(list) 553 | self.exp_snv_weights_distrib = defaultdict(list) 554 | self.exp_snv_distrib_comp = defaultdict(list) 555 | 556 | 557 | def _get_significant_subgroup_features(self, thres=0.05): 558 | """select eeSNVs enriched in the subgroup (Fisher exact test vs the complementary cells) and rank them""" 559 | snv_ranked = [] 560 | gene_ranked = defaultdict(float) 561 | 562 | for snv in self.snv_weights_distrib: 563 | distrib_test = np.asarray(self.exp_snv_weights_distrib[snv]).astype('bool') 564 | distrib_ref = np.asarray(self.exp_snv_distrib_comp[snv]).astype('bool') 565 | 566 | key_mean = np.mean(self.exp_snv_weights_distrib[snv]) 567 | contingency = np.array([[distrib_test.sum(), (~distrib_test).sum()], 568 | [distrib_ref.sum(), (~distrib_ref).sum()],]) 569 | 570 | oddsratio, pvalue = fisher_exact(contingency) 571 | 572 | if pvalue < thres and distrib_test.mean() > distrib_ref.mean(): 573 | snv_ranked.append((snv, key_mean, pvalue)) 574 | 575 | snv_ranked.sort(key=lambda x: x[1], reverse=True) 576 | 577 | self.significant_eeSNVs = snv_ranked 578 | 579 | for (gene, pos), score, pvalue in snv_ranked: 580 | gene_ranked[gene] += score 581 | 582 | self.significant_genes = sorted(gene_ranked.items(), 583 | key=lambda x: x[1], 584 | reverse=True) 585 | 586 | self.ranked_genes = sorted([(gene, np.mean(self.gene_weights_distrib[gene])) 587 | for gene in self.gene_weights_distrib], 588 | key=lambda x: x[1], 589 | reverse=True) 590 | self.ranked_eeSNVs = sorted([(snv, np.mean(self.snv_weights_distrib[snv])) 591 | for snv in self.snv_weights_distrib], 592 | key=lambda x: x[1], 593 | reverse=True) 594 | 595 | 596 | if __name__ == "__main__": 597 | debug() 598 | --------------------------------------------------------------------------------
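For quick reference, here is a minimal usage sketch of the SSrGE class above, adapted from its debug() function; it assumes create_example_matrix_v4 (from garmire_SSrGE/examples.py) returns a small synthetic SNV matrix X, an expression matrix Y, a CNV matrix C and the matching id lists, and the subgroup at the end ([0, 1, 2]) is purely illustrative.

from garmire_SSrGE.examples import create_example_matrix_v4
from garmire_SSrGE.ssrge import SSrGE

X, Y, C, ge_list, s_list = create_example_matrix_v4()

ssrge = SSrGE(snv_id_list=s_list,      # (gene id, position) tuples
              gene_id_list=ge_list,
              nb_ranked_features=3,    # keep the 3 top-ranked eeSNVs / genes
              alpha=0.01)              # LASSO regularization strength

eeSNV_mat = ssrge.fit_transform(X, Y, C)  # (n_samples x n_eeSNVs) matrix

err_model, err_null = ssrge.score(X, Y)   # model error vs intercept-only error

print(ssrge.retained_snvs)
print(ssrge.retained_genes)

# rank features within an (illustrative) subgroup made of samples 0, 1 and 2
subgroup = ssrge.rank_features_for_a_subgroup([0, 1, 2])
print(subgroup.significant_eeSNVs)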
/garmire_SNV_calling/process_snv_GATK.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ process one SRR with the GATK SNV calling pipeline """ 4 | 5 | from os import popen 6 | from os.path import isdir 7 | from os.path import isfile 8 | from os.path import getsize 9 | from subprocess import Popen 10 | from subprocess import PIPE 11 | 12 | from distutils.dir_util import mkpath 13 | 14 | from shutil import copyfile 15 | from shutil import move 16 | 17 | from sys import stdout as STDOUT 18 | from sys import argv 19 | 20 | from random import random 21 | from time import sleep 22 | from time import time 23 | 24 | from glob import glob 25 | 26 | from garmire_SNV_calling.config import JAVA 27 | from garmire_SNV_calling.config import JAVA_MEM 28 | from garmire_SNV_calling.config import PICARD_DIR 29 | from garmire_SNV_calling.config import GATK_DIR 30 | from garmire_SNV_calling.config import GATK_JAR 31 | 32 | 33 | ############ VARIABLES ############################################ 34 | SRR_TO_PROCESS = "" # for debugging purposes 35 | PROCESS_ID = 0 36 | 37 | 38 | from garmire_SNV_calling.config import OUTPUT_PATH_GATK 39 | from garmire_SNV_calling.config import PATH_OUTPUT 40 | from garmire_SNV_calling.config import PLATEFORM 41 | from garmire_SNV_calling.config import ORGANISM 42 | 43 | from garmire_SNV_calling.config import REF_GENOME 44 | from garmire_SNV_calling.config import DBSNP 45 | from garmire_SNV_calling.config import VCF_RESOURCES 46 | 47 | 48 | if "--ignore_already_exists" in argv: 49 | IGNORE_ALREADY_EXISTS = True 50 | else: 51 | IGNORE_ALREADY_EXISTS = False 52 | 53 | if "--clean_tmp" in argv: 54 | CLEAN_TMP = True 55 | else: 56 | CLEAN_TMP = False 57 | 58 | ################################################################### 59 | 60 | 61 | def main(): 62 | process_GATK_snv = ProcessGATKSNV(id=PROCESS_ID) 63 | process_GATK_snv.process() 64 | 65 | 66 | class ProcessGATKSNV(): 67 | """run the GATK-based SNV calling pipeline on one aligned bam file""" 68 | def __init__(self, 69 | bam_file_name='', 70 | srr_to_process=SRR_TO_PROCESS, 71 | output_path=OUTPUT_PATH_GATK, 72 | path_to_data=PATH_OUTPUT, 73 | java=JAVA, 74 | java_mem=JAVA_MEM, 75 | picard_dir=PICARD_DIR, 76 | gatk_dir=GATK_DIR, 77 | plateform=PLATEFORM, 78 | organism=ORGANISM, 79 | ref_genome=REF_GENOME, 80 | dbsnp=DBSNP, 81 | vcf_resources=VCF_RESOURCES, 82 | gatk_jar=GATK_JAR, 83 | id="1", 84 | ignore_already_exists=IGNORE_ALREADY_EXISTS, 85 | clean_tmp=CLEAN_TMP, 86 | respath=None, 87 | ): 88 | 89 | self.respath = respath 90 | 91 | self.output_path = output_path 92 | self.path_to_data = path_to_data 93 | self.srr_to_process = srr_to_process 94 | 95 | self.bam_file_name = bam_file_name 96 | 97 | self.java = java 98 | self.java_mem = java_mem 99 | self.picard_dir = picard_dir 100 | self.gatk_dir = gatk_dir 101 | self.plateform = plateform 102 | self.organism = organism[:] 103 | self.ignore_already_exists = ignore_already_exists 104 | self.gatk_jar = gatk_jar 105 | 106 | if self.organism == 'HUMAN': 107 | self.organism = 'hg19' 108 | 109 | elif self.organism == 'MOUSE': 110 | self.organism = 'mm10' 111 | 112 | self.ref_genome = ref_genome 113 | self.dbsnp = dbsnp 114 | self.vcf_resources = vcf_resources 115 | 116 | self.id = str(id) 117 | self.stdout = None 118 | self.tmppath = None 119 | self.time_start = None 120 | self.bam_file_path = None 121 | self.clean_tmp = clean_tmp 122 | 123 | def process(self, srr_to_process=None): 124 | """ 125 | process one star bam file with the snv calling pipeline 126 | """ 127 | if srr_to_process: 128 | self.srr_to_process = srr_to_process 129 | 130 | msg = self._init_process() 131 | 132 | if msg: 133 | print(msg)
134 | self.stdout.write(msg) 135 | return 136 | 137 | self._launch_picard_readgroups() 138 | self._launch_picard_markduplicates() 139 | self._launch_gatk_cigar() 140 | self._launch_gatk_realigner_target_creator() 141 | self._launch_gatk_realigner_indel() 142 | self._launch_gatk_base_recalibrator() 143 | self._launch_gatk_print_reads() 144 | self._launch_gatk_variant_calling() 145 | self._launch_gatk_variant_filtering() 146 | self._finish_process(ext="", out="_GATK") 147 | self._rm_tmp_file() 148 | 149 | def process_exome(self, srr_to_process=None): 150 | """ 151 | process one exome bam file with the snv calling pipeline (see note below) 152 | """ 153 | if srr_to_process: 154 | self.srr_to_process = srr_to_process 155 | 156 | 157 | msg = self._init_process() 158 | 159 | if msg: 160 | print(msg) 161 | self.stdout.write(msg) 162 | return 163 | 164 | self._launch_picard_readgroups() 165 | self._launch_picard_buildbamindex(name='rg_added_sorted') 166 | self._launch_picard_markduplicates() 167 | self._launch_gatk_base_recalibrator(input_name='dedupped') 168 | self._launch_gatk_print_reads(input_name='dedupped') 169 | self._launch_gatk_variant_calling() 170 | self._launch_gatk_variant_filtering() 171 | 172 | self._finish_process(ext="_GATK", out="_GATK") 173 | self._rm_tmp_file() 174 |
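# note: compared with process(), process_exome() skips the RNA-seq-specific
# SplitNCigarReads and indel-realignment steps and runs the base recalibration
# directly on the deduplicated bam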
""" 225 | 226 | if not isdir(self.respath): 227 | mkpath(self.respath) 228 | 229 | self.stdout.write('''\n #### FINISHED #### \n 230 | ALL PROCESS DONE FOR: {0} in {1} s 231 | '''.format(self.srr_to_process, time() - self.time_start)) 232 | 233 | self._run_cmd('echo "#### FINISHED ####'\ 234 | ' ALL PROCESS DONE FOR: {0} in {1} s"'\ 235 | .format(self.srr_to_process, time() - self.time_start)) 236 | 237 | if self.tmppath == self.respath and ext == out: 238 | return 239 | 240 | if isfile(self.tmppath + '/snv_filtered{0}.vcf'.format(ext)): 241 | move(self.tmppath + '/snv_filtered{0}.vcf'.format(ext), 242 | self.respath + '/snv_filtered{0}.vcf'.format(out)) 243 | 244 | if isfile(self.tmppath + '/snv_filtered{0}.vcf.idx'.format(ext)): 245 | move(self.tmppath + '/snv_filtered{0}.vcf.idx'.format(ext), 246 | self.respath + '/snv_filtered{0}.vcf.idx'.format(out)) 247 | 248 | move(self.tmppath + '/stdout.log', 249 | self.respath + '/stdout.log') 250 | 251 | def _launch_picard_readgroups(self): 252 | """ 253 | launch picard AddOrReplaceReadGroups 254 | """ 255 | if self.check_if_output_exists( 256 | "{0}/rg_added_sorted.bam".format(self.tmppath)): 257 | return 258 | 259 | self._run_cmd( 260 | 'echo "\n\n######## LAUNCHING PICARD READGROUPS ########\n"') 261 | 262 | cmd = "{0} {1} -jar {2}/picard.jar AddOrReplaceReadGroups" \ 263 | " I={3}/Aligned.sortedByCoord.out.bam"\ 264 | " O={3}/rg_added_sorted.bam" \ 265 | " SO=coordinate" \ 266 | " RGID={4}" \ 267 | " RGPU={4}" \ 268 | " RGSM={4}" \ 269 | " RGPL={5}" \ 270 | " RGLB={6}" \ 271 | .format(self.java, 272 | self.java_mem, 273 | self.picard_dir, 274 | self.tmppath, 275 | self.id, 276 | self.plateform, 277 | self.organism 278 | ) 279 | self._run_cmd(cmd) 280 | 281 | def _launch_picard_markduplicates(self): 282 | """ 283 | launch picard MarkDuplicates 284 | """ 285 | if self.check_if_output_exists( 286 | "{0}/dedupped.bam".format(self.tmppath)): 287 | return 288 | 289 | self._run_cmd( 290 | 'echo "\n\n######## LAUNCHING PICARD MARKDUPLICATES ########\n"') 291 | 292 | cmd = "{0} {1} -jar {2}/picard.jar MarkDuplicates" \ 293 | " I={3}/rg_added_sorted.bam"\ 294 | " O={3}/dedupped.bam" \ 295 | " M={3}/output.metrics" \ 296 | " CREATE_INDEX=true" \ 297 | " VALIDATION_STRINGENCY=SILENT" \ 298 | .format(self.java, 299 | self.java_mem, 300 | self.picard_dir, 301 | self.tmppath, 302 | ) 303 | self._run_cmd(cmd) 304 | 305 | def _launch_picard_buildbamindex(self, name='dedupped'): 306 | """ 307 | launch picard buildbamindex 308 | """ 309 | if self.check_if_output_exists( 310 | "{0}/{1}.bai".format(self.tmppath, name)): 311 | return 312 | 313 | self._run_cmd( 314 | 'echo "\n\n######## LAUNCHING PICARD BuildBamIndex ########\n"') 315 | 316 | cmd = "{0} {1} -jar {2}/picard.jar BuildBamIndex" \ 317 | " I={3}/{4}.bam" \ 318 | " TMP_DIR={3}" \ 319 | .format(self.java, 320 | self.java_mem, 321 | self.picard_dir, 322 | self.tmppath, 323 | name, 324 | ) 325 | self._run_cmd(cmd) 326 | 327 | def _launch_picard_sortsam(self): 328 | """ 329 | launch picard SORTSAM 330 | """ 331 | if self.check_if_output_exists( 332 | "{0}/sorted.bam".format(self.tmppath)): 333 | return 334 | 335 | self._run_cmd( 336 | 'echo "\n\n######## LAUNCHING PICARD REORDERSAM ########\n"') 337 | 338 | cmd = "{0} {1} -jar {2}/picard.jar SortSam" \ 339 | " I={3}/dedupped.bam" \ 340 | " O={3}/sorted.bam" \ 341 | " SORT_ORDER=coordinate" \ 342 | " TMP_DIR={3}" \ 343 | " CREATE_INDEX=TRUE" \ 344 | .format(self.java, 345 | self.java_mem, 346 | self.picard_dir, 347 | self.tmppath, 348 | ) 349 | 
self._run_cmd(cmd) 350 | 351 | def _launch_picard_reordersam(self): 352 | """ 353 | launch picard REORDERSAM 354 | """ 355 | if self.check_if_output_exists( 356 | "{0}/reordered.bam".format(self.tmppath)): 357 | return 358 | 359 | self._run_cmd( 360 | 'echo "\n\n######## LAUNCHING PICARD REORDERSAM ########\n"') 361 | 362 | cmd = "{0} {1} -jar {2}/picard.jar ReorderSam" \ 363 | " I={3}/dedupped.bam" \ 364 | " O={3}/reordered.bam" \ 365 | " R={4}"\ 366 | " CREATE_INDEX=TRUE" \ 367 | .format(self.java, 368 | self.java_mem, 369 | self.picard_dir, 370 | self.tmppath, 371 | self.ref_genome 372 | ) 373 | self._run_cmd(cmd) 374 | 375 | def _launch_gatk_cigar(self): 376 | """ 377 | Running cigar string split and mapq 255 fix GATK 378 | """ 379 | if self.check_if_output_exists( 380 | "{0}/split.bam".format(self.tmppath)): 381 | return 382 | 383 | self._run_cmd('echo "\n\n######## LAUNCHING CIGAR ########\n"') 384 | 385 | cmd = "{0} {1} -jar {2}/{5} -T SplitNCigarReads" \ 386 | " -I {3}/dedupped.bam" \ 387 | " -o {3}/split.bam" \ 388 | " -R {4}" \ 389 | " -rf ReassignOneMappingQuality" \ 390 | " -RMQF 255" \ 391 | " -RMQT 60" \ 392 | " -U ALLOW_N_CIGAR_READS" \ 393 | .format(self.java, 394 | self.java_mem, 395 | self.gatk_dir, 396 | self.tmppath, 397 | self.ref_genome, 398 | self.gatk_jar 399 | ) 400 | 401 | self._run_cmd_fix_quality(cmd, to_rm='split.ba*') 402 | 403 | def _launch_gatk_realigner_target_creator(self, input_name='split.bam', resolve='hard'): 404 | """ 405 | Running Realignment Target creator 406 | """ 407 | if self.check_if_output_exists( 408 | "{0}/forRealigner.intervals".format(self.tmppath)): 409 | return 410 | 411 | self._run_cmd( 412 | 'echo "\n\n######## LAUNCHING REALIGNER TARGET CREATOR ########\n"') 413 | 414 | cmd = "{0} {1} -jar {2}/{6} -T RealignerTargetCreator" \ 415 | " -I {3}/{5}" \ 416 | " -o {3}/forRealigner.intervals"\ 417 | " -R {4}" \ 418 | " -nt 20 " \ 419 | .format(self.java, 420 | self.java_mem, 421 | self.gatk_dir, 422 | self.tmppath, 423 | self.ref_genome, 424 | input_name, 425 | self.gatk_jar 426 | ) 427 | 428 | for vcf in self.vcf_resources: 429 | cmd += " -known {0}".format(vcf) 430 | 431 | self._run_cmd_fix_quality(cmd, to_rm='forRealigner.intervals', resolve=resolve) 432 | 433 | def _launch_gatk_realigner_indel(self): 434 | """ 435 | Running Realignment 436 | """ 437 | if self.check_if_output_exists( 438 | "{0}/realigned.bam".format(self.tmppath)): 439 | return 440 | 441 | self._run_cmd( 442 | 'echo "\n\n######## LAUNCHING REALIGNER INDEL ########\n"') 443 | 444 | cmd = "{0} {1} -jar {2}/{5} -T IndelRealigner" \ 445 | " -I {3}/split.bam" \ 446 | " -targetIntervals {3}/forRealigner.intervals"\ 447 | " --out {3}/realigned.bam" \ 448 | " -R {4}" \ 449 | .format(self.java, 450 | self.java_mem, 451 | self.gatk_dir, 452 | self.tmppath, 453 | self.ref_genome, 454 | self.gatk_jar 455 | ) 456 | 457 | for vcf in self.vcf_resources: 458 | cmd += " -known {0}".format(vcf) 459 | 460 | self._run_cmd(cmd) 461 |
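# BQSR is a two-pass process: BaseRecalibrator (below) builds the recalibration
# table (recal_data.csv) from the known variant sites, and PrintReads then
# applies it via -BQSR to produce recal.bam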
462 | def _launch_gatk_base_recalibrator(self, input_name='realigned'): 463 | """ 464 | Running base recalibration 465 | """ 466 | if self.check_if_output_exists( 467 | "{0}/recal_data.csv".format(self.tmppath)): 468 | return 469 | 470 | self._run_cmd( 471 | 'echo "\n\n######## LAUNCHING RECALIBRATION STEP 1 ########\n"') 472 | 473 | cmd = "{0} {1} -jar {2}/{7} -T BaseRecalibrator" \ 474 | " -I {3}/{6}.bam" \ 475 | " -o {3}/recal_data.csv" \ 476 | " -R {4}" \ 477 | " -nct 20" \ 478 | " --knownSites {5}" \ 479 | .format(self.java, 480 | self.java_mem, 481 | self.gatk_dir, 482 | self.tmppath, 483 | self.ref_genome, 484 | self.dbsnp, 485 | input_name, 486 | self.gatk_jar 487 | ) 488 | 489 | for vcf in self.vcf_resources: 490 | cmd += " --knownSites {0}".format(vcf) 491 | 492 | self._run_cmd_fix_quality(cmd, to_rm='recal_data.csv', resolve='hard') 493 | 494 | def _launch_gatk_print_reads(self, input_name='realigned'): 495 | """ 496 | Running base recalibration STEP 2 497 | """ 498 | if self.check_if_output_exists( 499 | "{0}/recal.bam".format(self.tmppath)): 500 | return 501 | 502 | self._run_cmd( 503 | 'echo "\n\n######## LAUNCHING RECALIBRATION STEP 2 ########\n"') 504 | 505 | cmd = "{0} {1} -jar {2}/{6} -T PrintReads" \ 506 | " -I {3}/{5}.bam" \ 507 | " --out {3}/recal.bam" \ 508 | " -R {4}" \ 509 | " -BQSR {3}/recal_data.csv" \ 510 | " -nct 20" \ 511 | .format(self.java, 512 | self.java_mem, 513 | self.gatk_dir, 514 | self.tmppath, 515 | self.ref_genome, 516 | input_name, 517 | self.gatk_jar 518 | ) 519 | 520 | self._run_cmd_fix_quality(cmd, to_rm='recal.bam', resolve='hard') 521 | 522 | def _launch_gatk_variant_calling(self, output_name='snv_raw_GATK.vcf'): 523 | """ 524 | variant calling 525 | """ 526 | if self.check_if_output_exists( 527 | "{0}/{1}".format(self.tmppath, output_name)): 528 | return 529 | 530 | self._run_cmd( 531 | 'echo "\n\n######## LAUNCHING VARIANT CALLING ########\n"') 532 | 533 | start_time = time() 534 | 535 | cmd = "{0} {1} -jar {2}/{7} -T HaplotypeCaller" \ 536 | " -I {3}/recal.bam" \ 537 | " -o {3}/{6}" \ 538 | " -R {4}" \ 539 | " --dbsnp {5}" \ 540 | " -dontUseSoftClippedBases" \ 541 | " -stand_call_conf 20.0" \ 542 | " -stand_emit_conf 20.0" \ 543 | .format(self.java, 544 | self.java_mem, 545 | self.gatk_dir, 546 | self.tmppath, 547 | self.ref_genome, 548 | self.dbsnp, 549 | output_name, 550 | self.gatk_jar 551 | 552 | ) 553 | 554 | self._run_cmd(cmd) 555 | 556 | self._run_cmd( 557 | 'echo "\n## GATK variant calling done in {0} s##\n"'.format( 558 | time() - start_time)) 559 | 560 | def _launch_gatk_variant_filtering( 561 | self, 562 | input_name='snv_raw_GATK.vcf', 563 | output_name='snv_filtered_GATK.vcf'): 564 | """ 565 | variant filtering 566 | """ 567 | if self.check_if_output_exists( 568 | "{0}/{1}".format(self.tmppath, output_name)): 569 | return 570 | 571 | self._run_cmd( 572 | 'echo "\n######## LAUNCHING VARIANT FILTERING ########\n"') 573 | 574 | start_time = time() 575 | 576 | cmd = "{0} {1} -jar {2}/{7} -T VariantFiltration" \ 577 | " -V {3}/{5}" \ 578 | " -o {3}/{6}" \ 579 | " -R {4}" \ 580 | " -cluster 3" \ 581 | " -filterName FS" \ 582 | ' -filter "FS > 30.0"' \ 583 | " -filterName QD" \ 584 | ' -filter "QD < 2.0"' \ 585 | .format(self.java, 586 | self.java_mem, 587 | self.gatk_dir, 588 | self.tmppath, 589 | self.ref_genome, 590 | input_name, 591 | output_name, 592 | self.gatk_jar 593 | ) 594 | 595 | self._run_cmd(cmd) 596 | 597 | self._run_cmd( 598 | 'echo "\n## GATK variant filtering done in {0} s##\n"'.format( 599 | time() - start_time)) 600 | 601 | def check_if_output_exists(self, outfile): 602 | """ return True if outfile exists and is non-empty (and clean_tmp is off); 603 | otherwise remove any partial output """ 604 | if isfile(outfile) and getsize(outfile) and not self.clean_tmp: 605 | return True 606 | else: 607 | popen('rm {0}'.format(outfile)).read() 608 | 609 | def _rm_tmp_file(self): 610 | """ remove the temporary files, keeping only the filtered 611 | vcf and the log """ 612 | if isdir(self.tmppath) and self.clean_tmp: 613 | for fil in glob('{0}/*'.format(self.tmppath)): 614 | if fil.count("snv_filtered") or fil.count("stdout.log"): 615 | continue 616 | cmd = "rm {0}".format(fil) 617 | 618 | try: 619 | self._run_cmd(cmd) 620 | except Exception as e: 621 |
print('#### error while trying to remove the tmp file: {0}'\ 622 | .format(e)) 623 | 624 | if self.tmppath != self.respath: 625 | cmd = "rm -rf {0}".format(self.tmppath) 626 | 627 | try: 628 | self._run_cmd(cmd) 629 | except Exception as e: 630 | print('#### error while trying to remove the tmp folder: {0}'\ 631 | .format(e)) 632 | 633 | def _run_cmd(self, cmd): 634 | """run cmd""" 635 | stdout_read = open(self.tmppath + '/stdout.log', 'r') 636 | stdout_read.seek(0, 2) 637 | 638 | process = Popen(cmd, 639 | stdout=PIPE, 640 | stderr=PIPE, 641 | shell=True) 642 | 643 | c = process.stdout.read(1) 644 | e = process.stderr.read(1) 645 | 646 | while process.poll() is None or c or e: 647 | 648 | STDOUT.write(c) 649 | self.stdout.write(c) 650 | STDOUT.write(e) 651 | self.stdout.write(e) 652 | STDOUT.flush() 653 | self.stdout.flush() 654 | 655 | c = process.stdout.read(1) 656 | e = process.stderr.read(1) 657 | 658 | process.communicate() 659 | 660 | if process.returncode: 661 | raise Exception('{0} raised a non-zero return code!\n'\ 662 | .format(cmd)) 663 | 664 | def _run_cmd_fix_quality(self, cmd, to_rm, resolve='soft'): 665 | """run cmd; if it fails, retry with GATK's misencoded-quality-score options""" 666 | try: 667 | self._run_cmd(cmd) 668 | except Exception: 669 | self._run_cmd('echo "\n\nERROR DETECTED. ' \ 670 | 'Trying to correct misencoded quality scores"') 671 | 672 | if resolve == 'hard': 673 | cmd += ' --allow_potentially_misencoded_quality_scores' 674 | else: 675 | cmd += ' --fix_misencoded_quality_scores' 676 | 677 | popen("rm {0}/{1}".format(self.tmppath, to_rm)).read() 678 | self._run_cmd(cmd) 679 | 680 | if __name__ == "__main__": 681 | main() 682 | --------------------------------------------------------------------------------
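Finally, a minimal sketch of driving the GATK pipeline above programmatically rather than through main(); it assumes config.py points at working java/picard/GATK installs and that a STAR bam already exists under PATH_OUTPUT/star/<SRR id>/Aligned.sortedByCoord.out.bam. The SRR id below is hypothetical.

from garmire_SNV_calling.process_snv_GATK import ProcessGATKSNV

# equivalent to running: python garmire_SNV_calling/process_snv_GATK.py
process = ProcessGATKSNV(id="1")
process.process(srr_to_process="SRR0000000")  # hypothetical SRR id

# exome data can instead use the variant without the RNA-seq-specific steps:
# process.process_exome(srr_to_process="SRR0000000")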