├── garmire_SSrGE
│   ├── __init__.py
│   ├── examples.py
│   ├── config.py
│   ├── generate_refgenome_index.py
│   ├── load_data.py
│   ├── linear_cross_validation.py
│   ├── extract_data.py
│   ├── extract_matrices_from_dataset.py
│   ├── multiprocess_fitting.py
│   └── ssrge.py
├── requirements.txt
├── garmire_SNV_calling
│   ├── __init__.py
│   ├── bash_utils.py
│   ├── make_GSMID_sampleID_csv.py
│   ├── check_star_overall_quality.py
│   ├── check_fastqc_stats.py
│   ├── generate_bsseeker_genome_index.py
│   ├── generate_STAR_genome_index.py
│   ├── deploy_BSseeker_call_methylation.py
│   ├── process_multiple_generic.py
│   ├── process_snv_calling_with_monovar.py
│   ├── compute_frequency_matrix.py
│   ├── deploy_bismark.py
│   ├── parse_10x_bam_file_to_fastq_files.py
│   ├── process_annotate_snv.py
│   ├── deploy_BSseeker.py
│   ├── process_fastqc_report.py
│   ├── process_multiple_snv.py
│   ├── deploy_star.py
│   ├── process_freebayes.py
│   ├── config.py
│   └── process_snv_GATK.py
├── garmire_download_ncbi_sra
│   ├── __init__.py
│   ├── argv.py
│   ├── remove_sra.py
│   ├── extract_data.py
│   ├── config.py
│   ├── download_data.py
│   └── download_soft_file.py
├── img
│   └── workflow.png
├── .bumpversion.cfg
├── .gitignore
├── setup.py
├── test
│   ├── test_snv_optional.py
│   ├── test_download.py
│   ├── test_extract_matrices.py
│   ├── test_snv.py
│   └── test_ssrge.py
├── README_download_ncbi_rsa.md
├── README_snv_calling.md
├── example
│   └── jones_pancreatic_cancer.soft
└── README.md

--------------------------------------------------------------------------------
/garmire_SSrGE/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
-e .

--------------------------------------------------------------------------------
/garmire_SNV_calling/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/img/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lanagarmire/SSrGE/HEAD/img/workflow.png

--------------------------------------------------------------------------------
/.bumpversion.cfg:
--------------------------------------------------------------------------------
[bumpversion]
files = ./setup.py
commit = True
tag = True
current_version = 2.0.2

--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/argv.py:
--------------------------------------------------------------------------------
"""
Instantiate the arguments passed on the command line.
Also a helper to print usage when -H or -h is passed as argument
"""

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.nt
*.err
*.out
*.csv
*egg-info*
build/*
dist/*
*data/*
*~
*tmp/*
*#*
*.cache*
*/slurm/bash_variable.sh
--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/remove_sra.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

"""
remove sra files
"""

from garmire_download_ncbi_sra.config import PATH_DATA
from os import popen


def main():
    rm_sra()

def rm_sra():
    """remove the downloaded .sra files"""
    path_seq = PATH_DATA + '/fastq/'
    popen('rm {0}*.sra'.format(path_seq)).read()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
import sys, os

VERSION = '2.0.2'

setup(name='garmire_SSrGE',
      version=VERSION,
      description="compute SNV from RNA-seq following GATK recommendations",
      long_description="""""",
      classifiers=[],
      keywords='',
      author='Olivier Poirion (PhD)',
      author_email='opoirion@hawaii.edu',
      url='',
      license='MIT',
      packages=find_packages(exclude=['examples', 'tests']),
      include_package_data=True,
      zip_safe=False,
      install_requires=[
          'numpy',
          'scipy',
          'scikit-learn',
          'tabulate'],
      )

--------------------------------------------------------------------------------
/garmire_SNV_calling/bash_utils.py:
--------------------------------------------------------------------------------
from subprocess import call

from sys import stdout as STDOUT


def printf(msg):
    """ """
    print(msg)


def exec_cmd(cmd, stdout=STDOUT):
    """ """
    if stdout is None:
        stdout = STDOUT

    try:
        answer = call(cmd.split(), stdout=stdout)
    except Exception:
        raise Exception('error when launching {0}\ncannot execute the command!'.format(cmd))

    try:
        assert(answer == 0)
    except Exception:
        raise Exception('{0} returned a non-zero code!'.format(cmd))

    call('echo ### cmd: {0} successful ###\n'.format(cmd).split(), stdout=stdout)
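A minimal usage sketch for `exec_cmd` above (the command string is a placeholder; any binary available on the PATH works the same way):

```python
from garmire_SNV_calling.bash_utils import exec_cmd

# output is streamed to stdout by default; an exception is raised if the
# command cannot be launched or exits with a non-zero code
exec_cmd('samtools --version')
```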
--------------------------------------------------------------------------------
/test/test_snv_optional.py:
--------------------------------------------------------------------------------
import unittest

from garmire_SNV_calling.config import JAVA
from garmire_SNV_calling.config import SNPEFF
from garmire_SNV_calling.config import FEATURE_COUNT
from garmire_SNV_calling.config import FASTQC

from commands import getstatusoutput


class TestPackage(unittest.TestCase):
    """ """
    def test_snpeff(self):
        """assert that snpEff is installed"""
        self.assertFalse(getstatusoutput("{0} -jar {1} -version".format(JAVA, SNPEFF))[0])

    def test_featurecount(self):
        """assert that featureCounts is installed"""
        self.assertFalse(getstatusoutput("{0} -v".format(FEATURE_COUNT))[0])

    def test_fastqc(self):
        """assert that fastQC is installed"""
        self.assertFalse(getstatusoutput("{0} -version".format(FASTQC))[0])


if __name__ == "__main__":
    unittest.main()

--------------------------------------------------------------------------------
/garmire_SNV_calling/make_GSMID_sampleID_csv.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

"""
read the soft summary file from the GEO page and map each GSM ID to its cell name
"""

from garmire_SNV_calling.config import GLOBAL_DATA_ROOT
from garmire_SNV_calling.config import PROJECT_NAME
from garmire_SNV_calling.config import SOFT_PATH

from os.path import isfile

import re

def main():
    csv_path = "{0}/{1}/{1}.csv"\
               .format(GLOBAL_DATA_ROOT, PROJECT_NAME)

    if not isfile(SOFT_PATH):
        print "error! no file: {0}".format(SOFT_PATH)
        return 1

    f_soft = open(SOFT_PATH, 'r').read()
    f_csv = open(csv_path, 'w')

    gsm_list = re.findall("(?<=\^SAMPLE \= )\w+", f_soft)
    id_list = re.findall("(?<=!Sample_title \= ).+(?!\n)", f_soft)

    for gsm, ids in zip(gsm_list, id_list):
        ids = ids.replace(' ', '_')
        f_csv.write("{0};{1}\n".format(gsm, ids))

    print "done"

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/garmire_SNV_calling/check_star_overall_quality.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

""" check overall statistics for all log files from the STAR aligner"""

from os import listdir
from os.path import isfile
import re

from garmire_SNV_calling.config import OUTPUT_PATH_STAR
from garmire_SNV_calling.config import PATH_OUTPUT


def main():
    make_aligner_quality_csv()

def make_aligner_quality_csv():
    regex = "(?<=Uniquely mapped reads \% \|\t)[0-9]+\.[0-9]+"
    regex = re.compile(regex)

    stats = {}

    for folder in listdir(OUTPUT_PATH_STAR):
        log_file = "{0}/{1}/Log.final.out"\
                   .format(OUTPUT_PATH_STAR, folder)

        if not isfile(log_file):
            continue

        stats[folder] = regex.findall(
            open(log_file, 'r').read())[0]

    f_csv = open(PATH_OUTPUT + '/aligner_unique_read.csv', 'w')

    for key in stats:
        f_csv.write('{0};{1}\n'.format(key, stats[key]))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/garmire_SNV_calling/check_fastqc_stats.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

""" check overall statistics for all log files from the fastqc reports"""

from os import listdir
from os.path import isfile
import re

from collections import Counter

from garmire_SNV_calling.config import PATH_OUTPUT

PATH_OUTPUT_FASTQC = PATH_OUTPUT + '/fastqc/data/'


def main():
    make_aligner_quality_csv()

def make_aligner_quality_csv():
    """ """
    regex_status = "(?<=Sequence Duplication Levels\t)\w+"
    regex_status = re.compile(regex_status)

    stats_status = {}

    for folder in listdir(PATH_OUTPUT_FASTQC):
        log_file = "{0}/{1}/fastqc_data.txt"\
                   .format(PATH_OUTPUT_FASTQC, folder)

        if not isfile(log_file):
            continue

        read = open(log_file, 'r').read()
        status = regex_status.findall(read)[0]
        sample = folder.rsplit('_fastqc', 1)[0]
        stats_status[sample] = status

    f_csv = open(PATH_OUTPUT + '/deduplicated_check.csv', 'w')

    for key in stats_status:
        f_csv.write('{0};{1}\n'.format(key, stats_status[key]))


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/test/test_download.py:
--------------------------------------------------------------------------------
import unittest

from os import popen

from os.path import isfile
from os.path import isdir

from garmire_download_ncbi_sra.config import FASTQ_DUMP
from garmire_download_ncbi_sra.config import PATH_DATA
from garmire_download_ncbi_sra.config import PATH_SOFT

from garmire_download_ncbi_sra.download_data import get_urls

import urllib2

class TestPackage(unittest.TestCase):
    """ """
    def test_fastq_dump(self):
        """assert that fastq-dump exists"""
        self.assertIsNotNone(popen(FASTQ_DUMP))

    def test_is_path(self):
        """assert that the data folder exists"""
        self.assertTrue(isdir(PATH_DATA))

    def test_is_soft(self):
        """assert that the soft file exists"""
        self.assertTrue(isfile(PATH_SOFT))

    def test_is_urls(self):
        """assert that urls can be extracted from soft files"""
        urls = get_urls()

        self.assertTrue(len(urls))

    def test_connect_to_urls(self):
        """assert that the first url can be reached"""
        urls = get_urls()
        gsm, url = urls[0]

        self.assertTrue(gsm.count('GSM'))
        self.assertTrue(urllib2.urlopen(url))


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/garmire_SNV_calling/generate_bsseeker_genome_index.py:
--------------------------------------------------------------------------------
"""generate BS-Seeker2 GENOME INDEX"""

from sys import stdout as sys_stdout
from os import popen
from os import mkdir
from os.path import isdir
from os.path import split as pathsplit

from distutils.dir_util import mkpath

from garmire_SNV_calling.config import BSSEEKER2_REP
from garmire_SNV_calling.config import BSSEQ_INDEX_PATH
from garmire_SNV_calling.config import REF_GENOME
from garmire_SNV_calling.config import PYTHON
from garmire_SNV_calling.config import BOWTIE_REP


################ VARIABLE ################
REF_GENOME_PATH = pathsplit(REF_GENOME)[0]
##########################################


def main():
    """ """
    bsseq_index_path = BSSEQ_INDEX_PATH
    print "######## computing BS-seq index ########\npath:{0}\n"\
        .format(bsseq_index_path)

    if not isdir(bsseq_index_path):
        mkpath(bsseq_index_path)

    cmd = "{0} {1}/bs_seeker2-build.py -f {2}"\
          " --aligner=bowtie2 -p {3} --db {4} -r"\
          .format(PYTHON,
                  BSSEEKER2_REP,
                  REF_GENOME,
                  BOWTIE_REP,
                  REF_GENOME_PATH
                  )

    stdout = popen(cmd)
    c = stdout.read(1)

    while c:
        sys_stdout.write(c)
        sys_stdout.flush()
        c = stdout.read(1)

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/garmire_SNV_calling/generate_STAR_genome_index.py:
--------------------------------------------------------------------------------
"""generate STAR GENOME INDEX"""

from sys import stdout as sys_stdout
from os import popen

from os.path import isdir
from distutils.dir_util import mkpath

from garmire_SNV_calling.config import PATH_STAR_SOFTWARE
from garmire_SNV_calling.config import STAR_INDEX_PATH
from garmire_SNV_calling.config import ANNOTATION_PATH
from garmire_SNV_calling.config import REF_GENOME
from garmire_SNV_calling.config import STAR_THREADS
from garmire_SNV_calling.config import STAR_INDEX_READ_LENGTH


def main():
    """ """
    star_index_path = "{0}READ{1}/".format(STAR_INDEX_PATH.rstrip('/'),
                                           STAR_INDEX_READ_LENGTH)
    print "######## computing STAR index ########\npath:{0}\n"\
        .format(star_index_path)

    if not isdir(star_index_path):
        mkpath(star_index_path)

    cmd = "{0} --runMode genomeGenerate --runThreadN {1}"\
          " --genomeDir {2} --genomeFastaFiles {3} --sjdbGTFfile {4}"\
          " --sjdbOverhang {5}"\
          .format(
              PATH_STAR_SOFTWARE,
              STAR_THREADS,
              star_index_path,
              REF_GENOME,
              ANNOTATION_PATH,
              STAR_INDEX_READ_LENGTH
          )
    stdout = popen(cmd)
    c = stdout.read(1)

    while c:
        sys_stdout.write(c)
        sys_stdout.flush()
        c = stdout.read(1)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/extract_data.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

"""
extract sra files
"""
from os.path import isdir
from os import mkdir
from os import listdir
from os import popen

from fnmatch import fnmatch

from garmire_download_ncbi_sra.config import PATH_DATA
from garmire_download_ncbi_sra.config import FASTQ_DUMP
from garmire_download_ncbi_sra.config import FASTQ_DUMP_OPTION
from garmire_download_ncbi_sra.config import LIMIT
from garmire_download_ncbi_sra.config import NB_CPU


from multiprocessing import Pool


############ VARIABLE ############
PATH_SEQ = PATH_DATA + '/fastq/'
##################################


def main():
    fastq_dump()

def fastq_dump():
    """extract the sra files"""
    count = 0

    print('extracting .sra files into: {0}'.format(PATH_SEQ))

    file_list = []

    for fil in listdir(PATH_SEQ):
        if not fnmatch(fil, '*.sra'):
            continue

        file_list.append(fil)

        count += 1

        if LIMIT and count > LIMIT:
            break

    pool = Pool(NB_CPU)

    pool.map(_fastq_dump, file_list)

def _fastq_dump(fil):
    """ """
    print('go to extraction for file:', fil)
    fil = fil.rsplit('.', 1)[0]

    if not isdir("{0}/{1}".format(PATH_SEQ, fil)):
        mkdir("{0}/{1}".format(PATH_SEQ, fil))
    popen('{3} {2} -v {0}/{1}.sra -O {0}/{1}/'\
          .format(PATH_SEQ, fil, FASTQ_DUMP_OPTION, FASTQ_DUMP)).read()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/garmire_SSrGE/examples.py:
--------------------------------------------------------------------------------
""" example """

from scipy.sparse import csr_matrix
import numpy as np


def create_example_matrix_v1(nb_cells=100, nb_snvs=6, nb_genes=5):
    """
    create a random feature matrix and infer Y according to the coefs W
    Six sparse coefs are set into W
    """
    X = csr_matrix(np.random.random((nb_cells, nb_snvs)))
    W = np.zeros((nb_snvs, nb_genes))


    W[0][1] = 5
    W[0][0] = 5
    W[1][1] = 5
    W[1][0] = 5
    W[3][3] = 2
    W[5][4] = 6

    Y = (X * W)

    return X, Y, W

def create_example_matrix_v2(nb_cells=100, nb_snvs=6, nb_genes=5):
    """
    create a random feature matrix and infer Y according to the coefs W
    Four sparse coefs are set into W
    create a fake snv list and a fake gene list
    """
    gene_list = ['KRAS',
                 'HLA-A',
                 'HLA-B',
                 'HLA-C',
                 'SPARC',
                 'SARAF',
                 'EIF3K',
                 'ALDH',
                 ]


    X = csr_matrix(np.random.random((nb_cells, nb_snvs)))
    W = np.zeros((nb_snvs, nb_genes))

    gene_id_list = [gene_list[i] if i < len(gene_list) else i
                    for i in range(nb_genes)]

    snv_id_list = [(gene_id_list[i], i)
                   if i < nb_genes else (gene_id_list[0], i)
                   for i in range(nb_snvs)]

    W[0][0] = 5
    W[1][0] = 5
    W[3][3] = 2
    W[5][4] = 6

    Y = (X * W)

    return X, Y, gene_id_list, snv_id_list

def create_example_matrix_v3(nb_cells=100, nb_snvs=6, nb_genes=5):
    """
    create a random feature matrix and infer Y according to the coefs W
    Six sparse coefs are set into W
    Additionally, create a CNV matrix using Y
    """
    X, Y, W = create_example_matrix_v1()
    C = np.random.randint(0, 10, (Y.shape))

    return X, Y, C, W

def create_example_matrix_v4(nb_cells=100, nb_snvs=6, nb_genes=5):
    """
    create a random feature matrix and infer Y according to the coefs W
    Four sparse coefs are set into W
    Additionally, create a CNV matrix, a fake snv list and a fake gene list using Y
    """
    X, Y, gene_id_list, snv_id_list = create_example_matrix_v2()
    C = np.random.randint(0, 10, (Y.shape))

    return X, Y, C, gene_id_list, snv_id_list
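A minimal sketch of how these example matrices feed the SSrGE model, mirroring the calls exercised in ./test/test_ssrge.py:

```python
from garmire_SSrGE.examples import create_example_matrix_v1
from garmire_SSrGE.ssrge import SSrGE

X, Y, W = create_example_matrix_v1()  # X: cells x SNVs (sparse), Y: cells x genes

ssrge = SSrGE(alpha=0.01)
ssrge.fit(X, Y)

Xr = ssrge.transform(X)        # X restricted to the inferred eeSNVs
ranked = ssrge.rank_eeSNVs()   # eeSNVs sorted by their inferred weights
```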
--------------------------------------------------------------------------------
/test/test_extract_matrices.py:
--------------------------------------------------------------------------------
import unittest

from garmire_SSrGE.extract_matrices_from_dataset import ExtractMatrix

from garmire_SSrGE.config import EXPRESSION_MATRIX_FOLDER_PATH
from garmire_SSrGE.config import VCF_FOLDER_PATH
from garmire_SSrGE.config import GTF_PATH
from garmire_SSrGE.config import INDEX_SAVE_PATH

from garmire_SSrGE.generate_refgenome_index import main as make_index

from os.path import isdir
from os.path import isfile

import warnings


class TestPackage(unittest.TestCase):
    """ """
    def test_gtf_path(self):
        """
        test if the GTF path defined in config exists
        """
        self.assertTrue(isfile(GTF_PATH))

    def test_gtf_index(self):
        """
        test the gtf index creation
        """
        self.assertTrue(make_index())

    def test_gtf_index_path(self):
        """
        test the gtf index path
        """
        self.assertTrue(INDEX_SAVE_PATH)

    def test_vcf_dir_exists(self):
        """
        test if the vcf directory defined in config exists
        """
        self.assertTrue(isdir(VCF_FOLDER_PATH))

    def test_snv_matrices(self):
        """
        test the SNV matrix extraction on one sample
        """
        extract_matrix = ExtractMatrix(limit=1)
        matrix = extract_matrix.extract_SNV_mat()

        if isinstance(matrix, type(None)):
            warnings.warn(
                'SNV matrix is None because the vcf folder is not defined!')
            return

        self.assertTrue(matrix.shape)

    def test_expression_matrix_dir_exists(self):
        """
        test if the expression matrix directory defined in config exists
        """
        self.assertTrue(isdir(EXPRESSION_MATRIX_FOLDER_PATH))

    def test_ge_matrices(self):
        """
        test the gene expression matrix extraction on one sample
        """
        extract_matrix = ExtractMatrix(limit=1)
        matrix = extract_matrix.extract_GE_mat()

        if isinstance(matrix, type(None)):
            warnings.warn(
                'gene expression matrix is None because the GE folder is not defined!')
            return

        self.assertTrue(matrix.shape)


if __name__ == "__main__":
    unittest.main()

--------------------------------------------------------------------------------
/garmire_SNV_calling/deploy_BSseeker_call_methylation.py:
--------------------------------------------------------------------------------
""" """

from multiprocessing import Pool

from os import popen

from os.path import isdir
from os.path import isfile
from os.path import split as pathsplit

from glob import glob

from time import sleep
from random import random

from fnmatch import fnmatch

from distutils.dir_util import mkpath

from garmire_SNV_calling.config import PATH_OUTPUT
from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN
from garmire_SNV_calling.config import BSSEEKER2_REP
from garmire_SNV_calling.config import REF_GENOME
from garmire_SNV_calling.config import PYTHON


################ VARIABLE ##################################

BAM_PATH = PATH_OUTPUT + '/BSseeker/'
PROCESS_THREADS = 2
BISMARK_OPTION = ''
REF_GENOME_PATH = pathsplit(REF_GENOME)[0]
REF_GENOME_RRBS_DB = REF_GENOME_PATH + '/genome.fa_rrbs_20_500_bowtie2/'
############################################################


sleep(2 * random())
if not isdir(BAM_PATH):
    mkpath(BAM_PATH)


def main():
    pool = Pool(PROCESS_THREADS)
    # process_one_file(glob(BAM_PATH + '/*')[0])
    pool.map(process_one_file, glob(BAM_PATH + '/*'))

def process_one_file(folder):
    """ """
    print(folder)
    if isfile(folder):
        return False

    if PATTERN and not fnmatch(folder, PATTERN):
        return False

    print("====> folder to be processed:", folder)

    input_bam_file_name = glob(folder + '/*.bam')

    if not input_bam_file_name:
        print('no bam file detected for :{0}\nskipping...'\
              .format(folder))
        return False

    if len(input_bam_file_name) > 1:
        print('multiple bam files detected: {0}. selecting the first'.format(
            input_bam_file_name))

    input_bam = input_bam_file_name[0]
    output_file = input_bam.rsplit('.', 1)[0] + '.CpG.CGmap'

    cmd = "{0} {1}/bs_seeker2-call_methylation.py -i {2} --CGmap {3} --db {4} --txt " \
        .format(PYTHON,
                BSSEEKER2_REP,
                input_bam,
                output_file,
                REF_GENOME_RRBS_DB,
                )

    _run_cmd(cmd)
    _run_cmd('rm {0}/*_sorted*'.format(folder))

    return True


def _run_cmd(cmd, *args):
    """run cmd"""
    popen(cmd).read()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/garmire_SSrGE/config.py:
--------------------------------------------------------------------------------
from garmire_SNV_calling.config import ANNOTATION_PATH
from garmire_SNV_calling.config import PATH_OUTPUT as PROJECT_PATH
from garmire_SNV_calling.config import SOFT_PATH as SNV_CALLING_SOFT_PATH

"""
CONFIG file for SSrGE

Principal default values for SSrGE class parameters

The config file also gives the parameters used to
extract the SNV and GE matrices from a given project

"""

######## SSrGE VARIABLE ##############################
TIME_LIMIT = 5
# time limit for one linear regression model
MIN_OBS_FOR_REGRESS = 10
# Min number of cells having non null gene expression
# to infer a sparse linear model
NB_THREADS = 4
# Number of threads to run in parallel
CROSS_VAL_NFOLD = 5
# Number of folds to perform the cross validation
######################################################


##################################### vcf and expression data #######################
# Paths used to create the SNVs and the gene expression matrices
# The folder architecture used by default is the one from the SNV calling package
# see the file ./garmire_SSrGE/garmire_SNV_calling/config.py
# All the paths defined below can be overwritten with a user defined path instead

# path to save the GTF index
GTF_PATH = ANNOTATION_PATH
# path used in the SNV_calling module
SOFT_PATH = SNV_CALLING_SOFT_PATH # OPTIONAL, path of the .soft file from ncbi
# internal index used to link SNVs and genes
INDEX_SAVE_PATH = "{0}/gtf_index/".format(PROJECT_PATH)

# the path for the folders containing the expression matrix files
# one folder per single cell and each folder contains a unique expression matrix (.txt) file
EXPRESSION_MATRIX_FOLDER_PATH = '/STAR/'
# the name of the gene expression matrix present inside each single-cell folder
GENE_MATRIX_NAME = 'matrix_counts.txt'

# the SNV caller used
USED_CALLER = 'MONOVAR' # {'MONOVAR', 'GATK'}

######################## Monovar caller ###############################################
# The folder containing the .vcf files produced by Monovar and the .txt input files
VCF_MONOVAR_PATH = '/data/monovar/'

######################## GATK caller ##################################################
# the path for the folders containing the .vcf files
# one folder per single cell and each single-cell folder contains a unique .vcf file
VCF_FOLDER_PATH = '/data/'
# the name of the file containing the vcf inside each folder
VCF_NAME = 'snv_filtered.vcf'
######################################################################################
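With the paths above configured, the SNV and gene expression matrices are obtained as in ./test/test_extract_matrices.py (a sketch):

```python
from garmire_SSrGE.extract_matrices_from_dataset import ExtractMatrix

extract_matrix = ExtractMatrix(limit=1)     # limit=1: process a single sample
SNV_mat = extract_matrix.extract_SNV_mat()  # None if the vcf folder is not defined
GE_mat = extract_matrix.extract_GE_mat()    # None if the GE folder is not defined
```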
--------------------------------------------------------------------------------
/garmire_SNV_calling/process_multiple_generic.py:
--------------------------------------------------------------------------------
""" generic class to perform multi-processing """

from multiprocessing import Process
from multiprocessing import Queue

from time import sleep


class MPI():
    """generic multiprocessing class"""
    def __init__(self, input_list,
                 ProcessClass,
                 nb_processes=1,
                 verbose=True):
        """
        input_list: list of inputs to be processed
        ProcessClass: class with a process method and an id attribute
        """
        self.input_queue = Queue()
        self.processes = []
        self.verbose = verbose
        self.nb_processes = nb_processes
        self.ProcessClass = ProcessClass

        for inpt in input_list:
            self.input_queue.put(inpt)

        for i in range(nb_processes):
            self.processes.append(
                MultiprocessingInstance(
                    input_queue=self.input_queue,
                    ProcessClass=ProcessClass,
                    id=i)
            )

    def _run(self):
        for p in self.processes:
            p.start()

        while self.input_queue.qsize():
            for p in self.processes:
                if p.exitcode:
                    raise KeyboardInterrupt
            sleep(1)

    def run(self):
        if self.verbose:
            rep = raw_input(
                'launching {0} processes with class {1} continue? (Y/n)'\
                .format(self.nb_processes, self.ProcessClass))
            if rep != 'Y':
                return

        try:
            self._run()

        except KeyboardInterrupt:
            for p in self.processes:
                p.terminate()

class MultiprocessingInstance(Process):
    """
    generic multiprocessing class
    """
    def __init__(self, input_queue, ProcessClass, id):
        """
        input_queue: Multiprocessing.Queue
        ProcessClass: class with a process method and an id attribute
        """
        self.input_queue = input_queue
        self.id = id
        self.process_instance = ProcessClass(id=id)
        Process.__init__(self)

    def run(self):
        while self.input_queue.qsize():
            try:
                sample = self.input_queue.get(True, 0.2)
            except Exception as e:
                print "exception:{0}".format(e)
                continue
            else:
                print "processing sample {0} with process id {1}"\
                    .format(sample, self.id)
                self.process_instance.process(sample)
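A sketch of driving MPI with a hypothetical ProcessClass; any class exposing an `id` attribute and a `process` method will do:

```python
from garmire_SNV_calling.process_multiple_generic import MPI


class EchoProcess():
    """hypothetical worker class"""
    def __init__(self, id):
        self.id = id

    def process(self, sample):
        print "worker {0} processing {1}".format(self.id, sample)


mpi = MPI(input_list=range(10),
          ProcessClass=EchoProcess,
          nb_processes=2,
          verbose=False)  # verbose=True would ask for confirmation first
mpi.run()
```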
--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/config.py:
--------------------------------------------------------------------------------
"""
NCBI RNAseq dataset downloader

**** CONFIG FILE ****

"""
from argparse import ArgumentParser

ARGPARSER = ArgumentParser(description='Argument for the SRA downloading pipeline',
                           prefix_chars='-')

ARGPARSER.add_argument('-project_name',
                       help='name of the project folder and where to find the fastq files (default: sample_test)',
                       default="sample_test",
                       metavar='str')

ARGPARSER.add_argument('-dl_nb_threads',
                       help='number of threads to be used to download the .sra files (default: 4)',
                       default=4,
                       type=int,
                       metavar='int')

ARGPARSER.add_argument('-nb_cpus',
                       help='number of CPU to be used to extract the .sra files',
                       default=4,
                       type=int,
                       metavar='int')

ARGPARSER.add_argument('-max_nb_samples',
                       help='max number of samples downloaded (default None)',
                       default=0,
                       type=int,
                       metavar='int')

ARGPARSER.add_argument('-soft_id',
                       help='SRA ID used to download the corresponding .soft file (example: "GSE79457")',
                       default="",
                       type=str,
                       metavar='str')

ARGS = ARGPARSER.parse_known_args()[0]

############ Variables #################################################
# The name of the project (defining the name of the folder)
PROJECT_NAME = ARGS.project_name
# The absolute path where the project will be created
# and the SRA files downloaded and extracted
PATH_DATA = "/data/results/{0}".format(PROJECT_NAME)
# path toward the .soft file (with the corresponding ftp addresses for the .sra files)
PATH_SOFT = "{0}/{1}.soft".format(PATH_DATA, PROJECT_NAME)
# number of threads to use for downloading sra files
NB_THREADS = ARGS.dl_nb_threads
# number of CPU to be used to extract the .sra files
NB_CPU = ARGS.nb_cpus
# path to the fastq-dump software
FASTQ_DUMP = "fastq-dump"
# options to use to extract the sra (using fastq-dump)
# "--split-3 -B" is the default and it is strongly recommended to keep it
FASTQ_DUMP_OPTION = "--split-3 -B"
# define the maximum number of sra files to be downloaded
LIMIT = ARGS.max_nb_samples
# soft ID
SOFT_ID = ARGS.soft_id
########################################################################
--------------------------------------------------------------------------------
/garmire_SNV_calling/process_snv_calling_with_monovar.py:
--------------------------------------------------------------------------------
from config import PATH_OUTPUT
from config import OUTPUT_PATH_STAR
from config import MONOVAR_REP
from config import MONOVAR_SAMTOOLS
from config import REF_GENOME
from config import PYTHON
from config import NB_PROCESS_SNV

from os.path import isdir
from os import mkdir

from glob import glob

from random import sample

from os import remove
from os import popen

import re

from multiprocessing import Pool


######## LOCAL VARIABLES ############################
PATH_MONOVAR = '{0}/monovar/'.format(PATH_OUTPUT)
CHUNK_SIZE = 20
THREAD_NB = 3
#####################################################


if not isdir(PATH_MONOVAR):
    mkdir(PATH_MONOVAR)


def main():
    """ """
    create_list_file()
    launch_monovar()

def launch_monovar():
    """
    """
    cmd_list = []

    for fil in glob('{0}/monovar_input*.txt'.format(PATH_MONOVAR)):

        cmd = '{0} mpileup -BQ0 -d10000 -f {1} -b {6} '\
              '| {3} {4}/src/monovar.py -p 0.002 -a 0.2 -t 0.05 -f {1}'\
              ' -b {6} -m {5} -o {7}.vcf'\
              .format(MONOVAR_SAMTOOLS,
                      REF_GENOME,
                      PATH_MONOVAR,
                      PYTHON,
                      MONOVAR_REP,
                      NB_PROCESS_SNV,
                      fil, fil.rsplit('.', 1)[0])

        cmd_list.append(cmd)

    pool = Pool(THREAD_NB)
    pool.map(_multiprocessing_func, cmd_list)

def _multiprocessing_func(cmd):
    """ """
    print('###### command launched:\n{0}\n########'.format(cmd))
    fil = re.findall('-f (?P<file>.+?) -b', cmd)[0]

    popen(cmd).read()

    print('monovar finished for: {0}'.format(fil))

def create_list_file():
    """
    create the input files used by monovar, each containing a chunk of the input bam files
    """
    for fil in glob('{0}/monovar_input*'.format(PATH_MONOVAR)):
        remove(fil)

    file_list = set()

    for folder in glob('{0}/*'.format(OUTPUT_PATH_STAR)):
        if not isdir(folder):
            continue

        file_list.add('{0}/Aligned.sortedByCoord.out.bam'.format(folder))

    nb_file = 0

    chunk_size = CHUNK_SIZE

    while file_list:
        if len(file_list) < chunk_size:
            chunk_size = len(file_list)

        sample_list = sample(file_list, chunk_size)
        file_list = file_list.difference(sample_list)

        f_input = open('{0}/monovar_input_{1}.txt'.format(PATH_MONOVAR, nb_file), 'w')

        for fil in sample_list:
            f_input.write('{0}\n'.format(fil))

        nb_file += 1


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/garmire_SSrGE/generate_refgenome_index.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

""" Generate a homemade reference genome gtf index using python objects """

from collections import defaultdict
import re
from time import time
import cPickle

from os.path import isdir
from distutils.dir_util import mkpath

from garmire_SSrGE.config import INDEX_SAVE_PATH
from garmire_SSrGE.config import GTF_PATH


def main():
    t = time()
    print('loading index...')
    index_start, index_end = load_indexed_gene_annotations()
    position_index = create_position_indexes(index_start, index_end)
    print('done in {0} s'.format(time() - t))
    save_indexes(index_start, index_end, position_index)

    return True

def save_indexes(index_start,
                 index_end,
                 position_index,
                 save_path=INDEX_SAVE_PATH):
    """ """
    if not isdir(save_path):
        r = raw_input("{0} doesn't exist create it? (y/N)".format(save_path))
        if r != 'y':
            return

        mkpath(save_path)

    with open(save_path + 'index_start.pickle', 'w') as f:
        cPickle.dump(index_start, f)
    with open(save_path + 'index_end.pickle', 'w') as f:
        cPickle.dump(index_end, f)
    with open(save_path + 'position_index.pickle', 'w') as f:
        cPickle.dump(position_index, f)
    print('data saved')

def create_position_indexes(index_start, index_end):
    """
    create ordered lists of the gene start and end positions,
    one index per chromosome
    """
    position_index = defaultdict(defaultdict)
    for key in index_start:
        position_index['start'][key] = sorted(index_start[key].keys())
    for key in index_end:
        position_index['end'][key] = sorted(index_end[key].keys())

    return position_index

def load_indexed_gene_annotations(gtf_path=GTF_PATH):
    """
    load index of genes according to chromosomes annotations:
    chr1 unknown exon 11874 12227 . + . gene_id "DDX11L1"; gene_name "DDX11L1"; transcript_id "NR_046018"; tss_id "TSS16932";
    chr1 unknown exon 12613 12721 . + . gene_id "DDX11L1"; gene_name "DDX11L1"; transcript_id "NR_046018"; tss_id "TSS16932";
    """
    regex = re.compile('gene\_id "(?P<gene>.+)"\; gene')
    f = open(gtf_path, "r")
    index_start = defaultdict(dict)
    index_end = defaultdict(dict)

    for line in f:
        line = line.split('\t')

        if int(line[3]) not in index_start[line[0]]:
            index_start[line[0]][int(line[3])] = []

        if int(line[4]) not in index_end[line[0]]:
            index_end[line[0]][int(line[4])] = []


        index_start[line[0]][int(line[3])].append(
            (int(line[4]),
             regex.findall(line[8])[0]))
        index_end[line[0]][int(line[4])].append(
            (int(line[3]),
             regex.findall(line[8])[0]))

    return index_start, index_end


if __name__ == "__main__":
    main()
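Applied to the two GTF lines quoted in the docstring of `load_indexed_gene_annotations`, the resulting indexes look like this (a sketch):

```python
index_start, index_end = load_indexed_gene_annotations()
position_index = create_position_indexes(index_start, index_end)

# index_start['chr1'][11874] == [(12227, 'DDX11L1')]
# index_end['chr1'][12721] == [(12613, 'DDX11L1')]
# position_index['start']['chr1'] == [11874, 12613]
```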
--------------------------------------------------------------------------------
/garmire_SSrGE/load_data.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

"""load data """

from garmire_SSrGE.config import PROJECT_PATH
from garmire_SSrGE.config import SOFT_PATH

from garmire_SSrGE.generate_refgenome_index import INDEX_SAVE_PATH
from garmire_SSrGE.generate_refgenome_index import main as generate_refgenome

import cPickle
import re

from time import time
from os.path import isfile

from collections import defaultdict


ONLY_KNOWN_SNV = False  # if True, keep only SNVs with a known id (assumed default)


def load_indexes(path_indexes=INDEX_SAVE_PATH):
    t = time()

    if not isfile(path_indexes + 'index_start.pickle'):
        print('indexes not found. Creating indexes...')
        generate_refgenome()

    with open(path_indexes + 'index_start.pickle', 'r') as f:
        index_start = cPickle.load(f)
    with open(path_indexes + 'index_end.pickle', 'r') as f:
        index_end = cPickle.load(f)
    with open(path_indexes + 'position_index.pickle', 'r') as f:
        position_index = cPickle.load(f)

    print('gene position indexes loaded in {0} s'.format(time() - t))
    return index_start, index_end, position_index

def process_line_from_vcf_file(line):
    """ process one line from the vcf file"""

    if line[0] == '#':
        return
    line = line.split('\t')

    snv_id = None

    # process only passed SNVs
    if line[6] != 'PASS':
        return

    # take the annotation
    if line[2] != '.':
        snv_id = line[2]

    chrid, start = line[0], int(line[1])
    end = start
    return chrid, start, end, snv_id

def load_gsm_and_sample_names_from_soft(soft_path=SOFT_PATH):
    """
    load GSM and sample names from soft

    return:
        dict(GSM:sample name)
    """
    if not soft_path:
        return defaultdict(str)

    regex_gsm = re.compile("(?<=\^SAMPLE = )GSM[0-9]+")
    regex_name = re.compile("(?<=!Sample_title = ).+(?=\n)")

    if not isfile(soft_path):
        return {}

    read = open(soft_path, 'r').read()

    gsms = regex_gsm.findall(read)
    names = regex_name.findall(read)

    return defaultdict(str, zip(gsms, names))

def process_line_from_annotated_vcf_file(line):
    """ process one line from the annotated vcf file"""

    if line[0] == '#':
        return
    line = line.split('\t')

    # process only passed SNVs
    if line[6] != 'PASS':
        return

    if ONLY_KNOWN_SNV:
        # process only SNVs with a known id
        if line[2] == '.':
            return
    results = []
    annotations = line[7].split(';')[-1].split(',')

    for annotation in annotations:
        annotation = annotation.split('|')

        if len(annotation) < 7:
            continue
        if not annotation[1]:
            continue

        result = {'type': annotation[1],
                  'impact': annotation[2],
                  'gene impacted': annotation[3],
                  'feature type': annotation[5],
                  'biotype': annotation[7]
                  }
        results.append(result)
    chrid, start = line[0], int(line[1])
    return (annotation[3], start), results
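For reference, the two regexes in `load_gsm_and_sample_names_from_soft` behave as follows on a minimal, hypothetical .soft excerpt:

```python
import re

regex_gsm = re.compile("(?<=\^SAMPLE = )GSM[0-9]+")
regex_name = re.compile("(?<=!Sample_title = ).+(?=\n)")

# a two-line .soft excerpt (hypothetical content)
soft = "^SAMPLE = GSM123456\n!Sample_title = cell_A1\n"

print zip(regex_gsm.findall(soft), regex_name.findall(soft))
# -> [('GSM123456', 'cell_A1')]
```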
--------------------------------------------------------------------------------
/garmire_SNV_calling/compute_frequency_matrix.py:
--------------------------------------------------------------------------------
#! /usr/bin/python

from os.path import isfile
from os.path import isdir

from os import listdir

from os.path import getsize
from distutils.dir_util import mkpath

from garmire_SNV_calling.config import FEATURE_COUNT
from garmire_SNV_calling.config import ANNOTATION_PATH
from garmire_SNV_calling.config import MATRIX_OUTPUT_PATH as OUTPUT_PATH
from garmire_SNV_calling.config import OUTPUT_PATH_STAR as STAR_PATH
from garmire_SNV_calling.config import STAR_THREADS

from multiprocessing import Pool

from garmire_SNV_calling.bash_utils import exec_cmd


############ VARIABLE ################
DEFAULT_ALIGNER = 'STAR'

OUTPUT_FILENAME = {
    'STAR': 'Aligned.sortedByCoord.out.bam'
}

PATH_DICT = {
    'STAR': STAR_PATH
}
######################################


def main():
    if DEFAULT_ALIGNER not in OUTPUT_FILENAME.keys():
        raise Exception('{0} not a regular aligner!'\
                        .format(DEFAULT_ALIGNER))

    aligner_path = PATH_DICT[DEFAULT_ALIGNER]
    output_filename = OUTPUT_FILENAME[DEFAULT_ALIGNER]

    do_expression_profile(aligner_path, output_filename)


def do_expression_profile(aligner_path, output_filename):
    """
    compute expression matrix according to aligner path results
    and output_filename (ex: output.bam)
    """
    cmd_list = []

    for folder in listdir(aligner_path):

        if not isdir(aligner_path + folder):
            print('not a folder! continuing', folder)
            continue

        bam_file = "{0}/{1}/{2}"\
                   .format(aligner_path, folder, output_filename)

        if not isfile(bam_file):
            print('no bam file for {0}'.format(bam_file))
            continue

        out_folder = "{0}/{1}/{2}"\
                     .format(OUTPUT_PATH, DEFAULT_ALIGNER, folder)

        out_file = '{0}/{1}'.format(out_folder, "matrix_counts.txt")

        if isfile(out_file) and getsize(out_file):
            print('expression matrix already exists for: {0}'.format(out_folder))
            continue

        cmd_list.append((bam_file, out_folder))

    pool = Pool(STAR_THREADS)
    pool.map(_multiprocess_func, cmd_list)


def _multiprocess_func(inp):
    """ """
    bam_file, out_folder = inp
    bam_file_to_expression_matrix(bam_file, out_folder)


def bam_file_to_expression_matrix(
        bam_file,
        out_folder,
        feature_count=FEATURE_COUNT,
        annotation_path=ANNOTATION_PATH,
        stdout=None,
        matrix_name="matrix_counts.txt"):
    """ """
    if not isdir(out_folder):
        mkpath(out_folder)

    cmd = "{0} -pPBCM --primary -T 1 -a {1} -o {2}/{4}"\
          " {3}".format(feature_count,
                        annotation_path,
                        out_folder,
                        bam_file,
                        matrix_name)
    print('launching cmd: {0}\n'.format(cmd))
    try:
        exec_cmd(cmd, stdout)
    except Exception as e:
        print('exception with featureCount cmd: {0}\n'.format(cmd))
        print('exception: {0}\n'.format(e))

    assert(isfile('{0}/{1}'.format(out_folder, matrix_name)))


if __name__ == "__main__":
    main()
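`bam_file_to_expression_matrix` can also be called directly on a single sample (the paths below are placeholders):

```python
from garmire_SNV_calling.compute_frequency_matrix import bam_file_to_expression_matrix

bam_file_to_expression_matrix(
    bam_file='/data/results/test/star/GSM0000000/Aligned.sortedByCoord.out.bam',
    out_folder='/data/results/test/matrices/STAR/GSM0000000')
```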
--------------------------------------------------------------------------------
/garmire_SNV_calling/deploy_bismark.py:
--------------------------------------------------------------------------------
from multiprocessing import Pool

from os import popen
from subprocess import Popen
from subprocess import PIPE

from os import listdir
from os import mkdir
from os.path import isdir
from os.path import isfile
from os.path import getsize
from os.path import split as pathsplit

import re

from sys import stdout as STDOUT

from glob import glob

from time import sleep
from random import random
from sys import argv
from sys import stdout

from fnmatch import fnmatch

from distutils.dir_util import mkpath

from garmire_SNV_calling.config import FASTQ_PATH
from garmire_SNV_calling.config import PATH_OUTPUT
from garmire_SNV_calling.config import REF_GENOME
from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN


################ VARIABLE ##################################
BISMARK_SOFTWARE = '/home/opoirion/prog/Bismark/bismark'
OUTPUT_PATH = PATH_OUTPUT + '/bismark/'
THREADS = 4
PROCESS_THREADS = 4
BISMARK_OPTION = ''
REF_GENOME_DIR = pathsplit(REF_GENOME)[0]
############################################################


sleep(2 * random())
if not isdir(OUTPUT_PATH):
    mkpath(OUTPUT_PATH)


def main():
    pool = Pool(PROCESS_THREADS)
    # process_one_file(listdir(FASTQ_PATH)[0])
    pool.map(process_one_file, listdir(FASTQ_PATH))

def process_one_file(fil):
    """ """
    if isfile(FASTQ_PATH + fil):
        return

    if PATTERN and not fnmatch(fil, PATTERN):
        return

    print "====> file to be aligned:", fil

    if not isdir(OUTPUT_PATH + fil):
        mkdir(OUTPUT_PATH + fil)

    bam_file_name = glob(OUTPUT_PATH + fil + '/*.bam')

    if bam_file_name \
       and getsize(bam_file_name[0]):
        print 'bam file result already exists for:{0}\nskipping...'\
            .format(bam_file_name[0])
        return

    fastq_str = ""

    fastq_files = list(set(glob(FASTQ_PATH + fil + '/*.fastq')))
    print 'fastq files found: {0}'.format(fastq_files)

    if len(fastq_files) > 2:
        print 'too many fastq files!'
        return

    elif len(fastq_files) == 2:
        fastq_1 = [fastq for fastq in fastq_files
                   if re.match('.+_1\.fastq', fastq)]

        assert(fastq_1)

        fastq_1 = fastq_1[0]

        fastq_2 = [fastq for fastq in fastq_files
                   if re.match('.+_2\.fastq', fastq)]
        assert(fastq_2)

        fastq_2 = fastq_2[0]
        fastq_str = ' -1 {0} -2 {1} '.format(fastq_1, fastq_2)

    elif len(fastq_files) == 1:
        fastq_str = ' {0} '.format(fastq_files[0])

    stdout = open(OUTPUT_PATH + fil + "/log.out", 'w')

    if not fastq_str:
        print 'no fastq file found for:{0}!\nskipping'.format(fil)
        return

    cmd = "{0} -p {1} -o {2} --temp_dir {2} {3} --genome {4} {5} > {2}/stdlog.out" \
        .format(BISMARK_SOFTWARE,
                THREADS,
                OUTPUT_PATH + fil + "/",
                BISMARK_OPTION,
                REF_GENOME_DIR,
                fastq_str
                )

    _run_cmd(cmd, stdout)


def _run_cmd(cmd, stdout):
    """run cmd"""

    process = popen(cmd).read()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/test/test_snv.py:
--------------------------------------------------------------------------------

import unittest

from os import popen
from os import listdir

from commands import getstatusoutput

from os.path import isfile
from os.path import isdir

from garmire_SNV_calling.config import OUTPUT_ROOT

from garmire_SNV_calling.config import TYPE_VAR
from garmire_SNV_calling.config import FASTQ_PATH
from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN

from garmire_SNV_calling.config import JAVA
from garmire_SNV_calling.config import GATK_DIR
from garmire_SNV_calling.config import GATK_JAR
from garmire_SNV_calling.config import PICARD_DIR
from garmire_SNV_calling.config import PATH_STAR_SOFTWARE
from garmire_SNV_calling.config import SAMTOOLS
from garmire_SNV_calling.config import FREEBAYES

from fnmatch import fnmatch


class TestPackage(unittest.TestCase):
    """ """
    def test_output_root(self):
        """assert that OUTPUT_ROOT folder exists"""
        self.assertTrue(isdir(OUTPUT_ROOT))

    # def test_soft(self):
    #     """assert that .soft file exists"""
    #     self.assertTrue(isfile(SOFT_PATH))

    def test_ref_genome(self):
        """assert that ref genome file exists"""
        for typ in TYPE_VAR:
            self.assertTrue(isfile(TYPE_VAR[typ]['REF_GENOME']))

    # def test_annotation_path(self):
    #     """assert that STAR ref folder exists"""
    #     for typ in TYPE_VAR:
    #         self.assertTrue(isdir(pathsplit(
    #             TYPE_VAR[typ]['STAR_INDEX_PATH'])[0]))

    def test_gtf_file(self):
        """assert that GTF file exists"""
        for typ in TYPE_VAR:
            self.assertTrue(isfile(TYPE_VAR[typ]['ANNOTATION_PATH']))

    def test_dbsnp(self):
        """assert that dbsnp vcf file exists"""
        for typ in TYPE_VAR:
            self.assertTrue(isfile(TYPE_VAR[typ]['DBSNP']))

    def test_vcf_resources(self):
        """assert that the additional vcf files exist"""
        for typ in TYPE_VAR:
            for vcf in TYPE_VAR[typ]['VCF_RESOURCES']:
                self.assertTrue(isfile(vcf))

    def test_fastq_path(self):
        """assert that fastq path exists"""
        self.assertTrue(isdir(FASTQ_PATH))

    def test_fastq_path_not_empty(self):
        """assert that fastq path is not empty"""
        self.assertTrue(len(listdir(FASTQ_PATH)))

    def test_fastq_path_with_folders_with_fastqfile(self):
        """assert that fastq folder exists and that .fastq files are inside"""

        for fastq_folder in listdir(FASTQ_PATH):
            if isfile(FASTQ_PATH + fastq_folder):
                continue
            if PATTERN and not fnmatch(fastq_folder, PATTERN):
                continue

            folder = "{0}/{1}".format(FASTQ_PATH, fastq_folder)

            print 'testing if {0} is empty'.format(folder)
            self.assertTrue(filter(lambda fil: fnmatch(fil, '*.fastq'),
                                   listdir(folder)))

    def test_java(self):
        """assert that java is installed and >= 1.8"""
        res = getstatusoutput('{0} -version'.format(JAVA))[1]
        self.assertIsNotNone(res)

        version = res.split('"')[1].rsplit('.', 1)[0]
        self.assertTrue(float(version) >= 1.8)

    def test_GATK(self):
        """assert that GATK .jar file exists"""
        self.assertTrue(isfile('{0}/{1}'.format(GATK_DIR, GATK_JAR)))

    def test_freebayes(self):
        """assert that freebayes file exists"""
        self.assertTrue(isfile(FREEBAYES))

    def test_samtools(self):
        """assert that samtools file exists"""
        self.assertTrue(isfile(SAMTOOLS))

    def test_picard_tools(self):
        """assert that picard-tools .jar files exist"""
        self.assertTrue(isfile('{0}/picard.jar'.format(PICARD_DIR)))
        self.assertTrue(isfile('{0}/picard-lib.jar'.format(PICARD_DIR)))

    def test_STAR_aligner(self):
        """assert that STAR aligner bin exists and return version"""
        self.assertIsNotNone(popen('{0} --version'.format(PATH_STAR_SOFTWARE)))


if __name__ == "__main__":
    unittest.main()

--------------------------------------------------------------------------------
/garmire_SSrGE/linear_cross_validation.py:
--------------------------------------------------------------------------------
""" """

from sklearn.model_selection import KFold
from garmire_SSrGE.ssrge import SSrGE

from garmire_SSrGE.config import CROSS_VAL_NFOLD

import numpy as np


def debug():
    """
    #### DEBUG ####
    **** Test function ****
    """
    from garmire_SSrGE.examples import create_example_matrix_v1

    X, Y, W = create_example_matrix_v1()

    cross_val = LinearCrossVal(
        model='LASSO',
        SNV_mat=X,
        GE_mat=Y
    )

    path = cross_val.regularization_path('alpha', [0.01, 0.1, 0.2])

    return path


class LinearCrossVal():
    """
    Class to perform cross-validation
    """
    def __init__(self,
                 SNV_mat,
                 GE_mat,
                 n_folds=CROSS_VAL_NFOLD,
                 verbose=True,
                 **ssrge_params):
        """ """
        if GE_mat.shape[0] == SNV_mat.shape[0] and \
           GE_mat.shape[1] != SNV_mat.shape[1]:
            GE_mat = GE_mat.T

        self.SNV_mat = SNV_mat
        self.GE_mat = GE_mat
        self.verbose = verbose
        self.n_folds = n_folds

        self.ssrge_params = ssrge_params

        self.err_model_mean = None
        self.err_empty_mean = None
        self.nb_coefs_mean = None
        self.nb_model_mean = None
        self.intercept_mean = None
        self.regularization_value_list = None

    def regularization_path(self, param_name, value_list):
        """
        :param_name: str the name of the param to test
        :value_list: list(float)
        """
        self.err_model_mean = []
        self.err_empty_mean = []
        self.nb_coefs_mean = []
        self.nb_model_mean = []
        self.intercept_mean = []
        self.regularization_value_list = value_list

        for value in value_list:
            self.ssrge_params[param_name] = value

            (errs_models,
             errs_null_models,
             nb_coefs,
             nb_models,
             intercepts
             ) = self.fit()

            self.err_model_mean.append(errs_models)
            self.err_empty_mean.append(errs_null_models)
            self.nb_model_mean.append(nb_models)
            self.nb_coefs_mean.append(nb_coefs)
            self.intercept_mean.append(intercepts)

            if self.verbose:
                print('\nmean error model:', errs_models)
                print('mean error null model:', errs_null_models)
                print('mean number of models:', nb_models)
                print('mean number of eeSNVs:', nb_coefs)

        return self.err_model_mean

    def fit(self):
        """ """
        i = 0

        errs_models = []
        errs_null_models = []
        nb_coefs = []
        nb_models = []
        intercepts = []

        print('\n######## cross validation\n####parameters:{0}'\
              .format(self.ssrge_params))

        ssrge = SSrGE(**self.ssrge_params)

        kfold = KFold(n_splits=self.n_folds,
                      shuffle=True)

        for train, test in kfold.split(self.SNV_mat):
            i += 1
            print('\n## fold nb {0}'.format(i))

            X_train = self.SNV_mat[train]
            Y_train = self.GE_mat.T[train].T

            X_test = self.SNV_mat[test]
            Y_test = self.GE_mat.T[test].T

            ssrge.fit(X_train, Y_train)

            score, score_null = ssrge.score(X_test, Y_test)

            errs_models.append(score)
            errs_null_models.append(score_null)
            nb_coefs.append(len(ssrge.eeSNV_weight))
            nb_models.append(len(ssrge.intercepts))
            intercepts.append(np.mean(list(ssrge.intercepts.values())))

        return (np.mean(errs_models),
                np.mean(errs_null_models),
                np.mean(nb_coefs),
                np.mean(nb_models),
                np.mean(intercepts)
                )


if __name__ == '__main__':
    debug()

--------------------------------------------------------------------------------
/test/test_ssrge.py:
--------------------------------------------------------------------------------
import unittest


class TestPackage(unittest.TestCase):
    """ """
    def test_multiprocess(self):
        """ Test multiprocess procedure """
        from garmire_SSrGE.multiprocess_fitting import debug

        g_index, coefs, intercepts = debug()

        self.assertTrue(g_index)
        self.assertTrue(coefs)
        self.assertTrue(coefs[0])
        self.assertTrue(sum(intercepts))


    def test_ssrge(self):
        """test ssrge procedure"""
        from garmire_SSrGE.examples import create_example_matrix_v1
        from garmire_SSrGE.ssrge import SSrGE

        X, Y, W = create_example_matrix_v1()
        ssrge = SSrGE(alpha=0.01)

        ssrge.fit(X, Y)
        self.assertTrue(ssrge.eeSNV_weight)

        Xr = ssrge.transform(X)

        self.assertTrue(Xr.sum())
        self.assertTrue(Xr.shape[0] == X.shape[0])
        self.assertTrue(Xr.shape[1] < X.shape[1])

        snv_ranked = ssrge.rank_eeSNVs()

        self.assertTrue(snv_ranked)

        score = ssrge.score(X, Y)

        self.assertTrue(score[0] < score[1])

    def test_ssrge_elasticnet(self):
        """test ssrge procedure with elasticnet model"""
        from garmire_SSrGE.examples import create_example_matrix_v1
        from garmire_SSrGE.ssrge import SSrGE

        X, Y, W = create_example_matrix_v1()
        ssrge = SSrGE(alpha=0.01, model='ElasticNet')

        ssrge.fit(X, Y)
        self.assertTrue(ssrge.eeSNV_weight)
ssrge.transform(X)
55 | 
56 |         self.assertTrue(Xr.sum())
57 |         self.assertTrue(Xr.shape[0] == X.shape[0])
58 |         self.assertTrue(Xr.shape[1] <= X.shape[1])
59 | 
60 |         snv_ranked = ssrge.rank_eeSNVs()
61 | 
62 |         self.assertTrue(snv_ranked)
63 | 
64 |         score = ssrge.score(X, Y)
65 | 
66 |         self.assertTrue(score[0] < score[1])
67 | 
68 |     def test_ssrge_cnv(self):
69 |         """test ssrge procedure with cnv matrix"""
70 |         from garmire_SSrGE.examples import create_example_matrix_v3
71 |         from garmire_SSrGE.ssrge import SSrGE
72 | 
73 |         X, Y, C, W = create_example_matrix_v3()
74 |         ssrge = SSrGE(alpha=0.01)
75 | 
76 |         ssrge.fit(X, Y, C)
77 |         self.assertTrue(ssrge.eeSNV_weight)
78 | 
79 |         Xr = ssrge.transform(X)
80 | 
81 |         self.assertTrue(Xr.sum())
82 |         self.assertTrue(Xr.shape[0] == X.shape[0])
83 |         self.assertTrue(Xr.shape[1] < X.shape[1])
84 | 
85 |         snv_ranked = ssrge.rank_eeSNVs()
86 | 
87 |         self.assertTrue(snv_ranked)
88 | 
89 |         score = ssrge.score(X, Y)
90 | 
91 |         self.assertTrue(score[0] < score[1])
92 | 
93 |     def test_ssrge_rank_gene(self):
94 |         """test ssrge and rank genes and snvs"""
95 |         from garmire_SSrGE.examples import create_example_matrix_v2
96 |         from garmire_SSrGE.ssrge import SSrGE
97 | 
98 |         X, Y, gene_id_list, snv_id_list = create_example_matrix_v2()
99 |         ssrge = SSrGE(
100 |             snv_id_list=snv_id_list,
101 |             gene_id_list=gene_id_list,
102 |             nb_ranked_features=2,
103 |             alpha=0.01)
104 | 
105 |         ssrge.fit(X, Y)
106 |         self.assertTrue(ssrge.eeSNV_weight)
107 | 
108 |         Xr = ssrge.transform(X)
109 | 
110 |         self.assertTrue(Xr.sum())
111 |         self.assertTrue(Xr.shape[0] == X.shape[0])
112 |         self.assertTrue(Xr.shape[1] < X.shape[1])
113 | 
114 |         snv_ranked = ssrge.rank_eeSNVs()
115 | 
116 |         self.assertTrue(snv_ranked)
117 | 
118 |         self.assertTrue(len(ssrge.retained_snvs) == ssrge.nb_ranked_features)
119 |         self.assertTrue(len(ssrge.retained_genes) == ssrge.nb_ranked_features)
120 | 
121 |         score = ssrge.score(X, Y)
122 | 
123 |         self.assertTrue(score[0] < score[1])
124 | 
125 |         subgroup = ssrge.rank_features_for_a_subgroup(range(10))
126 | 
127 |         self.assertTrue(len(subgroup.gene_expr_distrib[gene_id_list[0]]) == 10)
128 |         self.assertTrue(subgroup.snv_weights_distrib)
129 |         self.assertTrue(subgroup.exp_snv_distrib_comp)
130 | 
131 |     def test_cross_validation(self):
132 |         """test cross validation procedure"""
133 | 
134 |         from garmire_SSrGE.linear_cross_validation import debug
135 | 
136 |         path = debug()
137 |         self.assertTrue(path)
138 | 
139 | 
140 | if __name__ == "__main__":
141 |     unittest.main()
142 | 
--------------------------------------------------------------------------------
/README_download_ncbi_rsa.md:
--------------------------------------------------------------------------------
1 | # Download SRA files from NCBI (GEO)
2 | 
3 | This module provides scripts to download and extract SRA files of high-throughput genomic data from NCBI (GEO portal), using an NCBI .soft file
4 | 
5 | 
6 | 
7 | # SRA project download using docker
8 | 
9 | ## Requirements
10 | * docker
11 | * root access (possibly required to run docker)
12 | * 13.8 GB of free disk space (docker image)
13 | 
14 | ## installation (local)
15 | 
16 | ```bash
17 | docker pull opoirion/ssrge
18 | mkdir <results_folder>/
19 | cd <results_folder>/
20 | PATHDATA=`pwd`
21 | ```
22 | 
23 | ## usage
24 | 
25 | The download part of the pipeline consists of 3 steps (the 4 alignment and SNV-calling steps are described in README_snv_calling.md):
26 | 
27 | ```bash
28 | # Download
29 | docker run --rm opoirion/ssrge download_soft_file -h
30 | docker run --rm opoirion/ssrge download_sra -h
31 | docker run --rm opoirion/ssrge extract_sra -h
32 | ```
33 | 
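All three commands read and write through a volume mounted on `/data/results/` inside the container, so they are typically run with `-v $PATHDATA:/data/results/:Z` (the `:Z` suffix is only needed on SELinux hosts and can be dropped otherwise). A quick sanity check that the image and the mount both work, reusing the `$PATHDATA` variable defined above:

```bash
docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_soft_file -h
```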
34 | ## example
35 | 
36 | Let's download and process 2 samples from GSE79457 under the project name test_n2
37 | 
38 | ```bash
39 | # download the soft file containing the metadata for GSE79457
40 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_soft_file -project_name test_n2 -soft_id GSE79457
41 | # download the sra files
42 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_sra -project_name test_n2 -max_nb_samples 2
43 | # extract the sra files
44 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge extract_sra -project_name test_n2
45 | # remove the sra files (optional)
46 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge rm_sra -project_name test_n2
47 | ```
48 | 
49 | # Installation from github (*not updated!!* => Use the docker image for now)
50 | 
51 | 
52 | ## Requirements
53 | * [python 2 (>=2.7)](https://www.python.org/download/releases/2.7.2/)
54 | * The only external software needed is [fastq-dump](http://ncbi.github.io/sra-tools/install_config.html), used to extract the .sra files. The path to the executable must be given in the config file or passed as an argument
55 | * A folder with the name of the project must be created, and the absolute path to that folder must be given in the config file or passed as an argument
56 | * The .soft file related to the project must be downloaded from the [NCBI GEO](http://www.ncbi.nlm.nih.gov/geo/) website (and put into the project folder, by default)
57 | * link to a dataset-description GEO webpage: [example](http://ftp.ncbi.nlm.nih.gov/geo/series/GSE85nnn/GSE85183/soft/)
58 | * An example soft file is also available in the ./example/ folder of the repository (the default folder)
59 | 
60 | ## configuration
61 | * all global variables can be set in the file ./garmire_download_ncbi_sra/config.py or passed as arguments
62 | * a description of the arguments can be displayed at any time by invoking the -h (or -H) option, or found in the config file:
63 | 
64 | ```text
65 | -PROJECT_NAME      The name of the project (defining the name of the folder)
66 | -PATH_DATA         The absolute path where the project will be created and the SRA files downloaded and extracted
67 | -PATH_SOFT         path to the .soft file (with the corresponding ftp addresses for the .sra files)
68 | -NB_THREADS        number of parallel download threads to use for the .sra files (default 2)
69 | -FASTQ_DUMP        path to the fastq-dump software
70 | -FASTQ_DUMP_OPTION options used to extract the .sra files (with fastq-dump); "--split-3 -B" is the default and it is strongly recommended to keep it
71 | -LIMIT             maximum number of .sra files to be downloaded (default None)
72 | ```
73 | 
74 | ## usage
75 | move to the folder of the git project (https://github.com/lanagarmire/SSrGE.git)
76 | 
77 | ```bash
78 | cd SSrGE
79 | ```
80 | 
81 | * Set the global variables in the config file (garmire_download_ncbi_sra/config.py) or pass them each time as arguments
82 | * [optional] Run the tests:
83 | 
84 | ```bash
85 | python ./test/test_download.py -v
86 | ```
87 | 
88 | * download and extract data (by default, download the .sra files from the example .soft file):
89 | 
90 | ```bash
91 | python garmire_download_ncbi_sra/download_data.py
92 | ```
93 | * download and extract data (with options passed on the command line):
94 | 
95 | ```bash
96 | python garmire_download_ncbi_sra/download_data.py -NB_THREADS 5 -PATH_SOFT tutut/... 
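# the flag names mirror the variables of the config file; e.g., a custom
# fastq-dump binary could be passed the same way (a sketch, assuming
# extract_data.py exposes the config variables as command-line flags):
# python garmire_download_ncbi_sra/extract_data.py -FASTQ_DUMP /usr/local/bin/fastq-dump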
97 | ``` 98 | * extract SRA file 99 | 100 | ```bash 101 | python garmire_download_ncbi_sra/extract_data.py 102 | ``` 103 | * remove SRA file 104 | 105 | ```bash 106 | python garmire_download_ncbi_sra/remove_sra.py 107 | ``` 108 | 109 | ## contact and credentials 110 | * Developer: Olivier Poirion (PhD) 111 | * contact: opoirion@hawaii.edu -------------------------------------------------------------------------------- /garmire_SNV_calling/parse_10x_bam_file_to_fastq_files.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | 3 | from os import mkdir 4 | from os.path import isdir 5 | from os.path import isfile 6 | 7 | from sys import stdout 8 | 9 | from os import popen 10 | 11 | from collections import defaultdict 12 | 13 | import cPickle 14 | 15 | from multiprocessing import Pool 16 | 17 | from glob import glob 18 | 19 | from os import popen 20 | 21 | from os import remove 22 | 23 | 24 | ######################## VARIABLE ############################ 25 | PATH_FASTQ = '/mnt/nas_rna2/opoirion/10x_data/fastq/' 26 | PATH_BAM = '/mnt/nas_rna2/opoirion/10x_data/neurons_900_possorted_genome_bam.bam' 27 | 28 | # maximum of reads for a cells 29 | MAX_READS = None 30 | # number of cells you want 31 | NB_CELLS = 1000 32 | FASTQ_THREAD = 10 33 | ############################################################### 34 | 35 | 36 | def main(): 37 | """ 38 | """ 39 | stats, cell_list = get_cell_stats() 40 | write_bam_files(cell_list) 41 | bam_to_fastq(cell_list) 42 | 43 | 44 | def bam_to_fastq(cell_list): 45 | """ 46 | """ 47 | print('converting all the bam files into fastq files...') 48 | pool = Pool(FASTQ_THREAD) 49 | pool.map(_bam_to_fastq, cell_list) 50 | 51 | def _bam_to_fastq(cell): 52 | """ 53 | """ 54 | bam_files = glob('{0}/{1}/*.bam'.format(PATH_FASTQ, cell)) 55 | 56 | if not bam_files: 57 | return 58 | 59 | for bam_file in bam_files: 60 | cmd = 'bamToFastq -i {0} -fq {1}/{2}/{2}.fastq'.format(bam_file, PATH_FASTQ, cell) 61 | popen(cmd).read() 62 | remove(bam_file) 63 | 64 | def write_bam_files(cell_list): 65 | """ 66 | """ 67 | print('\n#### SECOND PASS ####') 68 | cmd = "samtools idxstats {0} | awk -F '\t' '{{s+=$3+$4}}END{{print s}}'".format(PATH_BAM) 69 | nb_reads = int(popen(cmd).read().strip('\n')) 70 | 71 | if MAX_READS is None: 72 | max_reads = nb_reads 73 | else: 74 | max_reads = MAX_READS 75 | 76 | file_dict = {} 77 | 78 | f_raw = pysam.AlignmentFile(PATH_BAM, 'rb') 79 | header = f_raw.header 80 | 81 | i = 0 82 | 83 | while i < max_reads: 84 | try: 85 | reads = f_raw.next() 86 | except Exception: 87 | break 88 | 89 | try: 90 | bc_tags = reads.get_tag('CB') 91 | except KeyError: 92 | continue 93 | 94 | i += 1 95 | 96 | stdout.write('\r nb reads {0} / {1}'.format(i, nb_reads)) 97 | stdout.flush() 98 | 99 | if bc_tags not in cell_list: 100 | continue 101 | 102 | if bc_tags not in file_dict: 103 | folder = '{0}/{1}'.format(PATH_FASTQ, bc_tags) 104 | 105 | if not isdir(folder): 106 | mkdir(folder) 107 | 108 | file_dict[bc_tags] = pysam.AlignmentFile( 109 | '{0}/bc_tags.bam'.format(folder), 'wb', header=header) 110 | 111 | file_dict[bc_tags].write(reads) 112 | 113 | f_raw.close() 114 | 115 | def get_cell_stats(): 116 | """ 117 | """ 118 | path_pickle = '{0}/cell_stats.pickle'.format(PATH_FASTQ) 119 | 120 | if isfile(path_pickle): 121 | stats_dict = cPickle.load(open(path_pickle)) 122 | else: 123 | stats_dict = _get_cell_stats() 124 | cPickle.dump(stats_dict, open(path_pickle, 'w')) 125 | 126 | cells, count = zip(*sorted(stats_dict.items(), 
key=lambda x:x[1], reverse=True)[:NB_CELLS]) 127 | 128 | return stats_dict, set(cells) 129 | 130 | def _get_cell_stats(): 131 | """ 132 | """ 133 | print('#### FIRST PASS ####') 134 | cmd = "samtools idxstats {0} | awk -F '\t' '{{s+=$3+$4}}END{{print s}}'".format(PATH_BAM) 135 | nb_reads = int(popen(cmd).read().strip('\n')) 136 | 137 | if MAX_READS is None: 138 | max_reads = nb_reads 139 | else: 140 | max_reads = MAX_READS 141 | 142 | stats_dict = defaultdict(int) 143 | 144 | f_raw = pysam.AlignmentFile(PATH_BAM, 'rb') 145 | 146 | i = 0 147 | 148 | while i < max_reads: 149 | try: 150 | reads = f_raw.next() 151 | except Exception: 152 | break 153 | 154 | try: 155 | bc_tags = reads.get_tag('CB') 156 | except KeyError: 157 | continue 158 | 159 | stats_dict[bc_tags] += 1 160 | 161 | i += 1 162 | 163 | stdout.write('\r nb reads {0} / {1}'.format(i, nb_reads)) 164 | stdout.flush() 165 | 166 | f_raw.close() 167 | 168 | return stats_dict 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /garmire_SNV_calling/process_annotate_snv.py: -------------------------------------------------------------------------------- 1 | 2 | """ process one fastqc report""" 3 | 4 | from os import popen 5 | from os import listdir 6 | 7 | from os.path import isdir 8 | from os.path import isfile 9 | from subprocess import Popen 10 | 11 | from distutils.dir_util import mkpath 12 | from shutil import rmtree 13 | from shutil import copyfile 14 | from shutil import move 15 | from sys import stdout as STDOUT 16 | from sys import argv 17 | from random import randint 18 | from random import random 19 | from time import sleep 20 | from time import time 21 | from fnmatch import fnmatch 22 | 23 | from garmire_SNV_calling.config import OUTPUT_PATH_SNV 24 | from garmire_SNV_calling.config import SNPEFF 25 | from garmire_SNV_calling.config import JAVA 26 | from garmire_SNV_calling.config import SNPEFF_DB 27 | 28 | from garmire_SNV_calling.process_multiple_generic import MPI 29 | 30 | ############ VARIABLES ############################################ 31 | SRR_TO_PROCESS = "" # for debug purpose 32 | PROCESS_ID = randint(0, 1000000) 33 | INPUT_PATH = OUTPUT_PATH_SNV + '/data/' 34 | 35 | if "--specific_folder" in argv: 36 | SRR_TO_PROCESS = argv[ 37 | argv.index("--specific_folder") + 1] 38 | if "--process_id" in argv: 39 | PROCESS_ID = int(argv[ 40 | argv.index("--process_id") + 1]) 41 | if "--nb_threads" in argv: 42 | NB_THREADS = int(argv[ 43 | argv.index("--nb_threads") + 1]) 44 | else: 45 | NB_THREADS = None 46 | ################################################################### 47 | 48 | 49 | def main(): 50 | if NB_THREADS: 51 | input_list = listdir(INPUT_PATH) 52 | mpi = MPI(input_list=input_list, 53 | ProcessClass=ProcessAnnotateSNV, 54 | nb_processes=NB_THREADS) 55 | mpi.run() 56 | else: 57 | process_annotate_snv = ProcessAnnotateSNV(id=PROCESS_ID) 58 | process_annotate_snv.process(SRR_TO_PROCESS) 59 | 60 | class ProcessAnnotateSNV(): 61 | """ 62 | Process SNV annotation using snpEff software 63 | """ 64 | def __init__(self, 65 | path_to_data=OUTPUT_PATH_SNV, 66 | id="1", 67 | clean_tmp=True, 68 | ): 69 | self.path_to_data = path_to_data 70 | self.time_start = None 71 | self.id = str(id) 72 | self.stdout = None 73 | 74 | def process(self, srr_to_process=SRR_TO_PROCESS): 75 | """ 76 | process one fastq file using fastqc 77 | """ 78 | tmppath = self.path_to_data + "/tmp/" + self.id 79 | inputpath = self.path_to_data + "/data/" 80 | 
input_file = '{0}/{1}/snv_filtered.vcf'\ 81 | .format(inputpath, srr_to_process) 82 | tmp_file = '{0}/snv_filtered_annotated.vcf'\ 83 | .format(tmppath) 84 | output_file = '{0}/{1}/snv_filtered_annotated.vcf'\ 85 | .format(inputpath, srr_to_process) 86 | 87 | if not isdir(inputpath): 88 | print '{0} is not a folder!'.format( 89 | self.path_to_data + srr_to_process) 90 | return 91 | 92 | if not isfile(input_file): 93 | print 'no input file: {0}!'.format( 94 | input_file) 95 | return 96 | 97 | if isfile(output_file): 98 | print '{0} already exists!'.format( 99 | output_file) 100 | return 101 | 102 | sleep(random()) 103 | if not isdir(tmppath): 104 | mkpath(tmppath) 105 | 106 | popen("rm {0}/*".format(tmppath)).read() 107 | 108 | self.stdout = open(tmppath + '/stdout.log', 'w') 109 | 110 | cmd = "{0} -jar {1} eff -v {2} {3} -noStats > {4}"\ 111 | .format(JAVA, SNPEFF, SNPEFF_DB, input_file, tmp_file) 112 | 113 | self._exec_cmd(cmd) 114 | self._exec_cmd("mv {0} {1}"\ 115 | .format(tmp_file, output_file)) 116 | self._exec_cmd("mv {0}/stdout.log {1}/{2}/snv_annotation.log"\ 117 | .format(tmppath, inputpath, srr_to_process)) 118 | self.stdout.close() 119 | popen('rm -r {0}/*'.format(tmppath)) 120 | 121 | def _exec_cmd(self, cmd): 122 | """ execute cmd """ 123 | process = Popen(cmd, 124 | stdout=self.stdout, 125 | stderr=self.stdout, 126 | shell=True) 127 | 128 | process.communicate() 129 | if process.returncode: 130 | raise Exception('{0} raise non 0 return code!\n'\ 131 | .format(cmd)) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /garmire_SNV_calling/deploy_BSseeker.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | from multiprocessing import Pool 4 | 5 | from os import popen 6 | 7 | from os import listdir 8 | from os import mkdir 9 | from os.path import isdir 10 | from os.path import isfile 11 | from os.path import getsize 12 | from os.path import split as pathsplit 13 | 14 | import re 15 | 16 | from glob import glob 17 | 18 | from time import sleep 19 | from random import random 20 | 21 | from fnmatch import fnmatch 22 | 23 | from distutils.dir_util import mkpath 24 | 25 | from garmire_SNV_calling.config import FASTQ_PATH 26 | from garmire_SNV_calling.config import PATH_OUTPUT 27 | from garmire_SNV_calling.config import REF_GENOME 28 | from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN 29 | from garmire_SNV_calling.config import BSSEEKER2_REP 30 | from garmire_SNV_calling.config import PYTHON 31 | from garmire_SNV_calling.config import BOWTIE_REP 32 | from garmire_SNV_calling.config import DO_TRIMGALORE 33 | from garmire_SNV_calling.config import TRIMGALORE_REP 34 | 35 | 36 | ################ VARIABLE ################################## 37 | 38 | OUTPUT_PATH = PATH_OUTPUT + '/BSseeker/' 39 | PROCESS_THREADS = 2 40 | BISMARK_OPTION = '' 41 | REF_GENOME_PATH = pathsplit(REF_GENOME)[0] 42 | ############################################################ 43 | 44 | 45 | sleep(2 * random()) 46 | if not isdir(OUTPUT_PATH): 47 | mkpath(OUTPUT_PATH) 48 | 49 | 50 | def main(): 51 | pool = Pool(PROCESS_THREADS) 52 | pool.map(process_one_file, listdir(FASTQ_PATH)) 53 | 54 | def process_one_file(fil): 55 | """ """ 56 | print(fil) 57 | if isfile(FASTQ_PATH + fil): 58 | return False 59 | 60 | if PATTERN and not fnmatch(fil, PATTERN): 61 | return False 62 | 63 | print("====> file to be aligned:", fil) 64 | 65 | if not isdir(OUTPUT_PATH 
+ fil):
66 |         mkdir(OUTPUT_PATH + fil)
67 | 
68 |     bam_file_name = glob(OUTPUT_PATH + fil + '/*.bam')
69 | 
70 |     if bam_file_name \
71 |        and getsize(bam_file_name[0]):
72 |         print('bam file result already exists for:{0}\nskipping...'\
73 |               .format(bam_file_name[0]))
74 |         return False
75 | 
76 |     fastq_str = ""
77 | 
78 |     fastq_files = list(set(glob(FASTQ_PATH + fil + '/*.fastq')))
79 |     print('fastq files found: {0}'.format(fastq_files))
80 | 
81 |     stdout = open(OUTPUT_PATH + fil + "/log.out", 'w')
82 | 
83 |     if len(fastq_files) > 2:
84 |         print('too many fastq files!')
85 |         return False
86 | 
87 |     elif len(fastq_files) == 2:
88 |         fastq_1 = [fastq for fastq in fastq_files
89 |                    if re.match('.+_1\.fastq', fastq)]
90 | 
91 |         assert(fastq_1)
92 | 
93 |         fastq_1 = fastq_1[0]
94 | 
95 |         fastq_2 = [fastq for fastq in fastq_files
96 |                    if re.match('.+_2\.fastq', fastq)]
97 |         assert(fastq_2)
98 | 
99 |         fastq_2 = fastq_2[0]
100 | 
101 |         if DO_TRIMGALORE:
102 |             cmd_trim = "{0}/trim_galore {1} {2} --paired --no_report_file -o {3}".format(
103 |                 TRIMGALORE_REP, fastq_1, fastq_2, FASTQ_PATH + fil)
104 |             _run_cmd(cmd_trim, stdout)
105 | 
106 |             fastq_1 = '{0}_val_1.fq'.format(fastq_1.rsplit('.', 1)[0])
107 |             fastq_2 = '{0}_val_2.fq'.format(fastq_2.rsplit('.', 1)[0])
108 | 
109 |         fastq_str = ' -1 {0} -2 {1} '.format(fastq_1, fastq_2)
110 | 
111 |     elif len(fastq_files) == 1:
112 |         fastq_file = fastq_files[0]  # defined outside the if: it is needed even when DO_TRIMGALORE is False
113 |         if DO_TRIMGALORE:
114 |             cmd_trim = "{0}/trim_galore {1} --no_report_file -o {2}".format(
115 |                 TRIMGALORE_REP, fastq_file, FASTQ_PATH + fil)
116 |             _run_cmd(cmd_trim, stdout)
117 |             fastq_file = '{0}_trimmed.fq'.format(fastq_file.rsplit('.', 1)[0])
118 | 
119 |         fastq_str = ' -i {0} '.format(fastq_file)
120 | 
121 |     if not fastq_str:
122 |         print('no fastq file found for:{0}!\nskipping'.format(fil))
123 |         return False
124 | 
125 |     cmd = "{0} {1}/bs_seeker2-align.py -g {2}"\
126 |           " --aligner=bowtie2 -p {3} --db {4} -r {5}"\
127 |           .format(PYTHON,
128 |                   BSSEEKER2_REP,
129 |                   REF_GENOME,
130 |                   BOWTIE_REP,
131 |                   REF_GENOME_PATH,
132 |                   fastq_str
133 |           )
134 | 
135 |     _run_cmd(cmd, stdout)
136 |     _run_cmd('mv {0}/*.bam {1}/; mv {0}/*log {1}/ '.format(
137 |         FASTQ_PATH + fil, OUTPUT_PATH + fil), stdout)
138 | 
139 |     return True
140 | 
141 | 
142 | def _run_cmd(cmd, *args):
143 |     """run cmd (NB: the stdout handle passed by callers is currently ignored)"""
144 |     popen(cmd).read()
145 | 
146 | 
147 | if __name__ == "__main__":
148 |     main()
149 | 
--------------------------------------------------------------------------------
/garmire_SNV_calling/process_fastqc_report.py:
--------------------------------------------------------------------------------
1 | 
2 | """ process one fastqc report"""
3 | 
4 | from os import popen
5 | from os import listdir
6 | 
7 | from os.path import isdir
8 | from os.path import isfile
9 | from os.path import getsize
10 | from subprocess import Popen
11 | 
12 | from distutils.dir_util import mkpath
13 | from shutil import rmtree
14 | from shutil import copyfile
15 | from shutil import move
16 | from sys import stdout as STDOUT
17 | from sys import argv
18 | from random import randint
19 | from random import random
20 | from time import sleep
21 | from time import time
22 | from fnmatch import fnmatch
23 | 
24 | from garmire_SNV_calling.config import FASTQC
25 | from garmire_SNV_calling.config import PATH_OUTPUT
26 | from garmire_SNV_calling.config import FASTQ_PATH
27 | 
28 | from garmire_SNV_calling.process_multiple_generic import MPI
29 | 
30 | ############ VARIABLES ############################################
31 | SRR_TO_PROCESS = "" # for debug purpose
32 | PROCESS_ID = randint(0, 
1000000) 33 | 34 | if "--specific_folder" in argv: 35 | SRR_TO_PROCESS = argv[ 36 | argv.index("--specific_folder") + 1] 37 | if "--process_id" in argv: 38 | PROCESS_ID = int(argv[ 39 | argv.index("--process_id") + 1]) 40 | if "--nb_threads" in argv: 41 | NB_THREADS = int(argv[ 42 | argv.index("--nb_threads") + 1]) 43 | else: 44 | NB_THREADS = None 45 | ################################################################### 46 | 47 | 48 | def main(): 49 | if NB_THREADS: 50 | input_list = listdir(FASTQ_PATH) 51 | mpi = MPI(input_list=input_list, 52 | ProcessClass=ProcessFastqC, 53 | nb_processes=NB_THREADS) 54 | mpi.run() 55 | else: 56 | process_fastqc = ProcessFastqC(id=PROCESS_ID) 57 | process_fastqc.process(SRR_TO_PROCESS) 58 | 59 | class ProcessFastqC(): 60 | """ Process Fastqc report""" 61 | def __init__(self, 62 | path_to_data=PATH_OUTPUT, 63 | id="1", 64 | clean_tmp=True, 65 | ): 66 | self.output_path = PATH_OUTPUT + '/fastqc/' 67 | self.path_to_data = path_to_data 68 | self.time_start = None 69 | self.id = str(id) 70 | self.stdout = None 71 | 72 | def process(self, srr_to_process=SRR_TO_PROCESS): 73 | """ 74 | process one fastq file using fastqc 75 | """ 76 | tmppath = self.output_path + "/tmp/" + self.id 77 | outpath = self.output_path + "/data/" 78 | 79 | if not isdir(FASTQ_PATH + srr_to_process): 80 | print '{0} is not a folder!'.format( 81 | FASTQ_PATH + srr_to_process) 82 | return 83 | 84 | if isdir("{1}/{0}_fastqc"\ 85 | .format(srr_to_process, outpath)): 86 | print '{0} output already exists'.format( 87 | "{1}/{0}_fastqc"\ 88 | .format(srr_to_process, outpath)) 89 | return 90 | 91 | sleep(random()) 92 | if not isdir(tmppath): 93 | mkpath(tmppath) 94 | if not isdir(outpath): 95 | mkpath(outpath) 96 | 97 | popen("rm {0}/*".format(tmppath)).read() 98 | path_fastq = "" 99 | 100 | for fil in listdir(FASTQ_PATH + srr_to_process): 101 | if fnmatch(fil, '*.fastq'): 102 | path_fastq = '{0}/{1}/{2}'.format(FASTQ_PATH, 103 | srr_to_process, 104 | fil) 105 | fil = fil.rsplit('.', 1)[0] 106 | break 107 | if not path_fastq: 108 | print 'No fastq file for :{0}'.format(path_fastq) 109 | return 110 | 111 | self.stdout = open(tmppath + '/stdout.log', 'w') 112 | 113 | cmd = "{0} {1} -o {2} -d {2} --extract"\ 114 | .format(FASTQC, path_fastq, tmppath) 115 | 116 | self._exec_cmd(cmd) 117 | self._exec_cmd("mv {0}/{1}_fastqc {3}/{2}_fastqc"\ 118 | .format(tmppath, fil, srr_to_process, outpath)) 119 | self.stdout.close() 120 | popen('rm -r {0}/*'.format(tmppath)) 121 | 122 | def _exec_cmd(self, cmd): 123 | """ execute cmd """ 124 | process = Popen(cmd, 125 | stdout=self.stdout, 126 | stderr=self.stdout, 127 | shell=True) 128 | 129 | process.communicate() 130 | if process.returncode: 131 | raise Exception('{0} raise non 0 return code!\n'\ 132 | .format(cmd)) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | -------------------------------------------------------------------------------- /garmire_SNV_calling/process_multiple_snv.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python
2 | 
3 | """ process multiple bam files with the SNV callers"""
4 | 
5 | from multiprocessing import Process
6 | from multiprocessing import Queue
7 | 
8 | from time import sleep
9 | from os import listdir
10 | from os.path import isfile
11 | from os.path import isdir
12 | from shutil import rmtree as rmdir
13 | 
14 | from garmire_SNV_calling.process_snv_GATK import ProcessGATKSNV
15 | from garmire_SNV_calling.process_freebayes import ProcessFreebayesCaller
16 | 
17 | from garmire_SNV_calling.config import PATH_OUTPUT
18 | 
19 | from garmire_SNV_calling.config import OUTPUT_PATH_GATK
20 | from garmire_SNV_calling.config import OUTPUT_PATH_FREEBAYES
21 | 
22 | from sys import argv
23 | 
24 | 
25 | ######## VARIABLE ##############################
26 | CLEANING_MODE = True
27 | 
28 | 
29 | if '--freebayes' in argv or '--do_both_callers' in argv:
30 |     SNVCLASS = ProcessFreebayesCaller
31 |     OUTPUT_PATH_SNV = OUTPUT_PATH_FREEBAYES
32 |     print('freebayes SNV caller used')  # messages were swapped between the two branches
33 | else:
34 |     SNVCLASS = ProcessGATKSNV
35 |     print('GATK SNV caller used. To use freebayes, add the --freebayes option')
36 |     OUTPUT_PATH_SNV = OUTPUT_PATH_GATK
37 | 
38 | if '--limit' in argv:
39 |     LIMIT = int(argv[argv.index('--limit') + 1])
40 | else:
41 |     LIMIT = None
42 | 
43 | if "--nb_processes" in argv:
44 |     NB_PROCESS = int(argv[  # int() instead of eval(): safer and sufficient here
45 |         argv.index("--nb_processes") + 1])
46 | else:
47 |     from garmire_SNV_calling.config import NB_PROCESS_SNV as NB_PROCESS
48 | ################################################
49 | 
50 | 
51 | def main():
52 |     print("launching SNV calling on {0} processes"\
53 |           .format(NB_PROCESS))
54 | 
55 |     mp_analysis = Mp_Analysis()
56 |     mp_analysis.run()
57 | 
58 | 
59 | class Mp_Analysis():
60 |     def __init__(self):
61 |         """ """
62 | 
63 |         self.mp_queue = Queue()
64 | 
65 |         output_star = listdir(PATH_OUTPUT + "star/")
66 | 
67 |         if LIMIT:
68 |             output_star = output_star[:LIMIT]
69 | 
70 |         for fil in output_star:
71 |             if not isfile(PATH_OUTPUT + "star/" + fil + \
72 |                           "/Aligned.sortedByCoord.out.bam"):
73 |                 print('no star bam file for {0} skipping'.format(fil))
74 | 
75 |                 if isdir(PATH_OUTPUT + "star/" + fil) and CLEANING_MODE:
76 |                     rmdir(PATH_OUTPUT + "star/" + fil)
77 |                 continue
78 | 
79 |             if isfile("{0}/data/{1}/snv_filtered.vcf"\
80 |                       .format(OUTPUT_PATH_SNV, fil)):
81 |                 print('VCF file output already exists for {0} skipping...'\
82 |                       .format(fil))
83 |                 continue
84 | 
85 |             print("file to be processed:", fil)
86 |             self.mp_queue.put(fil)
87 | 
88 |         print("\n #### now launching multiprocessing analysis #### \n")
89 | 
90 |         self.processes = [TrSNVMultiprocessing(self.mp_queue, id=i)
91 |                           for i in range(NB_PROCESS)]
92 |     def _run(self):
93 |         for p in self.processes:
94 |             p.start()
95 | 
96 |         while self.mp_queue.qsize():
97 |             for p in self.processes:
98 |                 if p.exitcode:
99 |                     raise KeyboardInterrupt
100 |             sleep(1)
101 | 
102 |     def run(self):
103 |         try:
104 |             self._run()
105 | 
106 |         except KeyboardInterrupt:
107 |             for p in self.processes:
108 |                 p.terminate()
109 | 
110 | 
111 | class TrSNVMultiprocessing(Process):
112 |     """
113 |     Launch and control several instances of the SNV process
114 |     """
115 |     def __init__(self, input_queue, id):
116 |         self.input_queue = input_queue
117 |         self.id = id
118 |         Process.__init__(self)
119 |         self.process_snv = SNVCLASS(id=self.id)
120 | 
121 |     def run(self):
122 |         while self.input_queue.qsize():
123 |             try:
124 |                 patient = self.input_queue.get(True, 0.2)
125 |             except Exception as e:
126 |                 print("exception:{0}".format(e))
127 |                 continue
128 |             else:
129 |                 print("processing for file {0} with id {1}"\
130 |                       .format(patient, self.id))
131 | 
132 |                 if '--do_both_callers' in argv:
133 |                     error = self.process_snv.process_ALL_callers(patient)
134 |                 else:
135 |                     error = self.process_snv.process(patient)
136 | 
137 |                 if error is not None:
138 |                     print('error {1} found for patient: {0}'.format(patient, error))
139 | 
140 | 
141 | if __name__ == "__main__":
142 |     main()
143 | 
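# Example invocations of this script (a sketch: the flags are the ones parsed
# above, everything else comes from the config file):
#
#   python garmire_SNV_calling/process_multiple_snv.py                     # GATK caller
#   python garmire_SNV_calling/process_multiple_snv.py --freebayes        # freebayes caller
#   python garmire_SNV_calling/process_multiple_snv.py --do_both_callers  # run both callers
#   python garmire_SNV_calling/process_multiple_snv.py --limit 10 --nb_processes 4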
--------------------------------------------------------------------------------
/garmire_download_ncbi_sra/download_data.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | 
3 | """
4 | download data from NCBI according to GEO accession number
5 | """
6 | 
7 | from os.path import isdir
8 | from os.path import isfile
9 | from os import popen
10 | 
11 | from os import mkdir
12 | 
13 | import urllib2
14 | 
15 | from distutils.dir_util import mkpath
16 | 
17 | import json
18 | 
19 | from garmire_download_ncbi_sra.config import PATH_DATA
20 | from garmire_download_ncbi_sra.config import NB_THREADS
21 | from garmire_download_ncbi_sra.config import LIMIT
22 | 
23 | from garmire_SNV_calling.bash_utils import exec_cmd
24 | 
25 | from urllib2 import URLError
26 | 
27 | import re
28 | 
29 | from multiprocessing.pool import ThreadPool
30 | 
31 | from time import sleep
32 | 
33 | 
34 | ############ VARIABLES ############
35 | PATH_SEQ = PATH_DATA + '/fastq/'
36 | 
37 | if not isdir(PATH_SEQ):
38 |     mkdir(PATH_SEQ)
39 | ###################################
40 | 
41 | 
42 | def main():
43 |     download_data()
44 | 
45 | def _download_old(url):
46 |     """(legacy, unused) download one sra file via the ByExp ftp layout"""
47 |     gsm, address = url
48 | 
49 |     try:
50 |         srx = address.rsplit('/', 1)[-1]
51 |         url = urllib2.urlopen(address).read().split()
52 |         srr = url[-1]
53 | 
54 |         srr_url = "{0}/{1}/{1}.sra".format(address, srr)
55 |         f_name = "{0}{1}__{2}__{3}.sra".format(PATH_SEQ,
56 |                                                gsm,
57 |                                                srx,
58 |                                                srr)
59 |     except Exception as e:
60 |         print('error with SRX {0}!!!'.format(address))
61 |         return "{1} {0}".format(str(e), address)
62 | 
63 |     if isfile(f_name):
64 |         print("{0} already exists continue...".format(f_name))
65 |         return "{0} already exists continue...".format(f_name)
66 | 
67 |     try:
68 |         print('downloading {0} to {1}...'.format(srr_url, f_name))
69 |         popen("wget -O {0} {1} --no-verbose".format(
70 |             f_name,
71 |             srr_url)).read()
72 |         print('{0} successfully downloaded'.format(f_name))
73 |         return
74 | 
75 |     except Exception as e:
76 |         print('error while downloading {0}!!!'.format(address))
77 |         return "{1} {0}\n".format(str(e), address)
78 | 
79 | 
80 | 
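# For reference, a sketch of the metadata.json structure consumed by
# download_data()/get_urls() below (written by download_soft_file.py; the GSM
# id and field values here are hypothetical):
#
# {
#     "GSM2052015": {
#         "GSE": "GSM2052015",
#         "organism_ch1": ["Homo sapiens"],
#         "organism_code": "HUMAN",
#         "SRA": ["https://www.ncbi.nlm.nih.gov/sra?term=SRX1628532"]
#     }
# }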
81 | def _download(data, verbose=True):
82 |     """ """
83 |     gsm, url_address = data
84 | 
85 |     waiting_list = [10, 20, 30]
86 | 
87 |     f_name = "{0}/{1}.sra".format(PATH_SEQ, gsm)
88 | 
89 |     if isfile('{0}/{1}.download_successfull.log'.format(PATH_SEQ, gsm)):  # per-sample marker (a single shared log file would skip every sample after the first)
90 |         msg = 'file {0} already downloaded. skipping...'.format(f_name)
91 |         print(msg)
92 |         return msg
93 |     while True:
94 |         try:
95 |             url = urllib2.urlopen(url_address).read()
96 |         except URLError as e:
97 |             if waiting_list:
98 |                 sleep_time = waiting_list.pop()
99 |                 print('error when downloading: {1} sleeping {0} s...'.format(sleep_time, e))
100 |                 sleep(sleep_time)
101 |             else:
102 |                 raise e
103 |         else:
104 |             break
105 | 
106 |     srr = re.findall('run=(?P<srr>SRR[0-9]+)', url)[0]  # named group restored: '(?PSRR...)' is a regex syntax error
107 | 
108 |     srr_url = "ftp://ftp-trace.ncbi.nlm.nih.gov"\
109 |               "/sra/sra-instant/reads/ByRun/sra/SRR/{0}/{1}/{1}.sra".format(
110 |                   srr[0:6], srr)
111 | 
112 |     print('downloading: {0}'.format(srr_url))
113 | 
114 |     if verbose:
115 |         verb = ''
116 |     else:
117 |         verb = '--no-verbose'
118 |     cmd = "wget {2} -O {0} {1} ".format(f_name, srr_url, verb)
119 |     print('launching: {0}'.format(cmd))
120 | 
121 |     exec_cmd(cmd)
122 | 
123 |     msg = '{0} successfully downloaded'.format(f_name)
124 |     print(msg)
125 | 
126 |     f_log = open('{0}/{1}.download_successfull.log'.format(PATH_SEQ, gsm), 'w')
127 |     f_log.write('download complete')
128 | 
129 |     return msg
130 | 
131 | def download_data():
132 |     """download dataset from ncbi """
133 | 
134 |     urls = get_urls()
135 | 
136 |     if LIMIT:
137 |         urls = urls[:LIMIT]
138 | 
139 |     if not isdir(PATH_SEQ):
140 |         mkpath(PATH_SEQ)
141 | 
142 |     f_error = open(PATH_DATA + "/error_log.txt", "w")
143 | 
144 |     thread_pool = ThreadPool(processes=NB_THREADS)
145 | 
146 |     res = thread_pool.map(_download, urls)
147 | 
148 |     print("######## errors found:")
149 |     for err in res:
150 |         if err:
151 |             print(err)
152 |             f_error.write('{0}\n'.format(err))
153 | 
154 | def get_urls():
155 |     """
156 |     get download addresses as GSM id according to the following template:
157 |     169. TuMP2-10b
158 |     Organism: Mus musculus
159 |     Source name: mouse pancreatic tumor
160 |     Platform: GPL15907 Series: GSE51372
161 |     FTP download: SRA SRX364871
162 |     ftp://ftp-trace.ncbi.nlm.nih.gov/
163 |     sra/sra-instant/reads/ByExp/sra/SRX/SRX364/SRX364871/
164 |     Sample Accession: GSM1243834 ID: 301243834
165 |     """
166 |     f_meta = open('{0}/metadata.json'.format(PATH_DATA))
167 |     metadata = json.load(f_meta)
168 | 
169 |     gsms, urls = [], []
170 | 
171 |     for sample in metadata:
172 |         if 'SRA' in metadata[sample]:
173 |             gsms.append(sample)
174 |             urls.append(metadata[sample]['SRA'][0] if isinstance(metadata[sample]['SRA'], list) else metadata[sample]['SRA'])  # 'SRA' is stored as a list in metadata.json
175 | 
176 |     return zip(gsms, urls)
177 | 
178 | if __name__ == "__main__":
179 |     main()
180 | 
--------------------------------------------------------------------------------
/garmire_SNV_calling/deploy_star.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from fnmatch import fnmatch
3 | 
4 | from os import listdir
5 | from os import mkdir
6 | from os.path import isdir
7 | from os.path import isfile
8 | from os.path import getsize
9 | 
10 | from garmire_SNV_calling.bash_utils import exec_cmd
11 | 
12 | from time import sleep
13 | from random import random
14 | 
15 | from os import popen
16 | 
17 | from distutils.dir_util import mkpath
18 | 
19 | from garmire_SNV_calling.config import PATH_STAR_SOFTWARE \
20 |      as PATH_SOFTWARE
21 | 
22 | from garmire_SNV_calling.config import STAR_THREADS as THREADS
23 | 
24 | from garmire_SNV_calling.bash_utils import printf
25 | 
26 | 
27 | ############ VARIABLES ############################################
28 | 
29 | from garmire_SNV_calling.config import SPECIFIC_FILENAME_PATTERN as PATTERN
30 | from garmire_SNV_calling.config import FASTQ_PATH
31 | from garmire_SNV_calling.config import STAR_INDEX_PATH
32 | from garmire_SNV_calling.config import 
STAR_INDEX_READ_LENGTH 33 | 34 | from garmire_SNV_calling.config import OUTPUT_PATH_STAR \ 35 | as OUTPUT_PATH 36 | 37 | from garmire_SNV_calling.config import SIMULATED_REF_GENOME 38 | 39 | ################################################################### 40 | 41 | 42 | def star_analysis( 43 | output_path=OUTPUT_PATH, 44 | fastq_path=FASTQ_PATH, 45 | pattern=PATTERN, 46 | star_index_path=STAR_INDEX_PATH, 47 | star_index_read_length=STAR_INDEX_READ_LENGTH, 48 | simulated_ref_genome=SIMULATED_REF_GENOME, 49 | path_software=PATH_SOFTWARE, 50 | threads=THREADS, 51 | cufflinks_compatibility=None, 52 | custom_star_index_name=True, 53 | stdout=None, 54 | printf=printf): 55 | """ 56 | """ 57 | sleep(2 * random()) 58 | 59 | options = '' 60 | 61 | if cufflinks_compatibility: 62 | options = '--outSAMstrandField intronMotif'\ 63 | ' --outFilterIntronMotifs RemoveNoncanonical' 64 | 65 | if not isdir(output_path): 66 | mkpath(output_path) 67 | 68 | for fil in listdir(fastq_path): 69 | if isfile(fastq_path + '/' + fil): 70 | continue 71 | 72 | if pattern and not fnmatch(fil, pattern): 73 | continue 74 | 75 | printf("====> file to be aligned: {0}".format(fil)) 76 | 77 | if not isdir(output_path + fil): 78 | mkdir(output_path + fil) 79 | 80 | if isfile(output_path + fil + "/Aligned.sortedByCoord.out.bam") \ 81 | and getsize(output_path + fil + "/Aligned.sortedByCoord.out.bam"): 82 | printf('bam file result alreay exists for:{0}\nskipping...'\ 83 | .format(fil)) 84 | continue 85 | 86 | fastq_str = "" 87 | 88 | for fastq_fil in sorted(listdir(fastq_path + '/' + fil)): 89 | if fnmatch(fastq_fil, "*.fastq"): 90 | fastq_str += "{0}/{1}/{2} ".format(fastq_path, fil, fastq_fil) 91 | 92 | if not fastq_str: 93 | printf('no fastq file found for:{0}!\nskipping'.format(fil)) 94 | continue 95 | 96 | if custom_star_index_name: 97 | star_index_path_ready = "{0}READ{1}".format(star_index_path.rstrip('/'), 98 | star_index_read_length) 99 | 100 | if simulated_ref_genome: 101 | star_index_path_ready = star_index_path_ready.rstrip('/') \ 102 | + 'SIM{0}/'.format(simulated_ref_genome) 103 | else: 104 | star_index_path_ready = star_index_path 105 | 106 | tmp_path = '{0}/_STARtmp'.format(output_path + '/' + fil + "/") 107 | 108 | if isdir(tmp_path): 109 | exec_cmd('rm -r {0}'.format(tmp_path), stdout) 110 | 111 | cmd = "{0} --readFilesIn {1} --runThreadN {2}"\ 112 | " --twopassMode Basic --outSAMtype BAM SortedByCoordinate" \ 113 | " --outTmpDir {5} --outFileNamePrefix {3} --genomeDir {4} {6}"\ 114 | .format(path_software, 115 | fastq_str, 116 | threads, 117 | output_path + '/' + fil + "/", 118 | star_index_path_ready, 119 | tmp_path, 120 | options 121 | ) 122 | 123 | printf('star cmd to be launched:{0}'.format(cmd)) 124 | exec_cmd(cmd, stdout) 125 | 126 | def check_star_folder(new_star_path): 127 | """ 128 | """ 129 | if isfile('{0}/star_aligned_successfull.log'.format(new_star_path)): 130 | return '#### STAR already aligned successfully in: {0}'.format(new_star_path) 131 | 132 | def clean_star_folder(path_star_results): 133 | """ 134 | """ 135 | path_bam = '{0}/Aligned.sortedByCoord.out.bam'.format(path_star_results) 136 | path_log_final = '{0}/Log.final.out'.format(path_star_results) 137 | path_sj = '{0}/SJ.out.tab'.format(path_star_results) 138 | 139 | assert(isfile(path_bam) and isfile(path_log_final) and isfile(path_sj)) 140 | 141 | popen('rm -r {0}/_STAR*'.format(path_star_results)).read() 142 | popen('rm -r {0}/Log.out'.format(path_star_results)).read() 143 | popen('rm -r 
{0}/Log.progress.out'.format(path_star_results)).read() 144 | 145 | f_log = open('{0}/star_aligned_successfull.log'.format(path_star_results), 'w') 146 | f_log.write('STAR successfull') 147 | 148 | return 149 | 150 | 151 | if __name__ == "__main__": 152 | star_analysis() 153 | -------------------------------------------------------------------------------- /garmire_download_ncbi_sra/download_soft_file.py: -------------------------------------------------------------------------------- 1 | from garmire_SNV_calling.bash_utils import exec_cmd 2 | 3 | from garmire_download_ncbi_sra.config import PATH_DATA 4 | from garmire_download_ncbi_sra.config import PROJECT_NAME 5 | from garmire_download_ncbi_sra.config import SOFT_ID 6 | 7 | from glob import glob 8 | 9 | from collections import Counter 10 | 11 | from os import mkdir 12 | 13 | from os.path import isfile 14 | 15 | import re 16 | 17 | from collections import defaultdict 18 | 19 | from os.path import isdir 20 | 21 | from datetime import datetime 22 | import json 23 | 24 | 25 | def main(): 26 | """ """ 27 | if not isdir(PATH_DATA): 28 | mkdir(PATH_DATA) 29 | 30 | download_and_process_soft(SOFT_ID) 31 | 32 | 33 | def download_and_process_soft(gse, erase=False): 34 | """ 35 | """ 36 | if not erase: 37 | if glob('{0}/{1}*'.format(PATH_DATA, gse)): 38 | print('soft file seems existing for: {0}'.format(gse)) 39 | return 40 | 41 | print('downloadin: {0}...'.format(gse)) 42 | 43 | address = 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/{0}nnn/{1}/soft/{1}_family.soft.gz'.format( 44 | gse[:-3], 45 | gse 46 | ) 47 | 48 | exec_cmd('wget {0} -O {1}/{2}.soft.gz'.format(address, PATH_DATA, PROJECT_NAME)) 49 | exec_cmd('gzip -d {0}/{1}.soft.gz'.format(PATH_DATA, PROJECT_NAME)) 50 | 51 | read_soft('{0}/{1}.soft'.format(PATH_DATA, PROJECT_NAME)) 52 | 53 | def read_soft(soft_file): 54 | """ 55 | """ 56 | gse_dict = extract_gsm_from_soft(soft_file) 57 | 58 | if not gse_dict: 59 | print('soft_file:{0} empty!'.format(soft_file)) 60 | return 61 | 62 | n_samples = len(gse_dict) 63 | 64 | organism = Counter([gse_dict[gse]['organism_code'] for gse in gse_dict]) 65 | 66 | organism = sorted(organism.items(), key=lambda x:x[1], reverse=True)[0][0] 67 | organism = organism.split()[0] 68 | 69 | f_stat = open('{0}/statistics.json'.format(PATH_DATA), 'w') 70 | f_meta = open('{0}/metadata.json'.format(PATH_DATA), 'w') 71 | 72 | f_meta.write(json.dumps(gse_dict, indent=2)) 73 | 74 | f_stat.write(json.dumps({ 75 | 'organism':organism, 76 | "nb_samples": n_samples, 77 | "soft ID": SOFT_ID 78 | }, indent=2)) 79 | 80 | print("organism: {0}".format(organism)) 81 | print("number of samples: {0}".format(n_samples)) 82 | 83 | 84 | def extract_gsm_from_soft( 85 | soft_file, 86 | flatten_gsm=False, 87 | remove_not_sra=True): 88 | """ 89 | """ 90 | gse_dict = {} 91 | 92 | assert(isfile(soft_file)) 93 | 94 | f_soft = open(soft_file) 95 | line = f_soft.readline() 96 | 97 | while line: 98 | if line.count('^SAMPLE'): 99 | data = defaultdict(list) 100 | gse = line.strip('\n').split(' = ', 1)[1].strip() 101 | data['GSE'] = gse 102 | 103 | line = f_soft.readline() 104 | 105 | while line and line[0] == '!': 106 | key, value = line.split(' = ', 1) 107 | 108 | key = key.strip('! 
')
109 |                 key = key.replace('/', '_')
110 |                 value = value.strip('\n ')
111 | 
112 |                 if key[:7] == 'Sample_':
113 |                     key = key[7:]
114 | 
115 |                 if value[:6] == 'ftp://':
116 |                     data['ftp'].append(value)
117 | 
118 |                 sra = re.findall('https://www.ncbi.nlm.nih.gov/sra\?term=SRX[0-9]+', value)  # '?' escaped: unescaped it is a regex quantifier and the literal url never matches
119 | 
120 |                 geo_organism = data['organism_ch1']
121 | 
122 |                 if geo_organism:
123 |                     geo_organism = geo_organism[0]
124 | 
125 |                     if geo_organism == 'Homo sapiens' or geo_organism == 'Homo':
126 |                         data['organism_code'] = 'HUMAN'
127 |                     elif geo_organism == 'Mus musculus' or geo_organism == 'Mus':
128 |                         data['organism_code'] = 'MOUSE'
129 | 
130 |                 if sra:
131 |                     data['SRA'].append(sra[0])
132 | 
133 |                 data[key].append(value)
134 | 
135 |                 line = f_soft.readline()
136 | 
137 |             if 'relation' in data:
138 |                 for relation in data['relation']:
139 |                     key, value = relation.split(': ')
140 |                     key, value = key.strip(), value.strip()  # the stripped values were previously discarded
141 |                     data[key] = value
142 | 
143 |             if flatten_gsm:
144 |                 for key in data:
145 |                     data[key] = check_value(key, data[key])
146 | 
147 |                     if len(data[key]) == 1:
148 |                         data[key] = data[key][0]
149 | 
150 |             if 'SRA' in data or not remove_not_sra:
151 |                 gse_dict[gse] = data
152 | 
153 |         else:
154 |             line = f_soft.readline()
155 | 
156 |     return gse_dict
157 | 
158 | 
159 | def check_value(key, values):
160 |     """
161 |     """
162 |     is_list = True
163 | 
164 |     if not isinstance(values, list):
165 |         is_list = False
166 |         values = [values]
167 | 
168 |     values = map(format_value, values)
169 | 
170 |     if key.count('zip') and not isinstance(values[0], int):
171 |         values = map(lambda x: 0, values)
172 | 
173 |     if key.count('phone') and not isinstance(values[0], int):
174 |         values = map(lambda x: 0, values)
175 | 
176 |     if not is_list:
177 |         values = values[0]
178 | 
179 |     return values
180 | 
181 | def format_value(value):
182 |     """
183 |     """
184 |     if value.isdigit():
185 |         value = int(value)
186 |     elif re.findall('[A-Z][a-z][a-z] [0-9]{2} [0-9]{4}', value):
187 |         value = re.findall('[A-Z][a-z][a-z] [0-9]{2} [0-9]{4}', value)[0]
188 |         value = datetime.strptime(value, '%b %d %Y')
189 | 
190 |     return value
191 | 
192 | 
193 | if __name__ == '__main__':
194 |     main()
195 | 
--------------------------------------------------------------------------------
/README_snv_calling.md:
--------------------------------------------------------------------------------
1 | # SNV computation pipeline
2 | 
3 | This module aligns reads from FASTQ files and infers SNVs from RNA-seq datasets. The pipeline largely follows the [GATK variant calling good practices](http://gatkforums.broadinstitute.org/wdl/discussion/3891/calling-variants-in-rnaseq). It can also optionally infer raw gene expression, annotate SNVs and perform quality control (QC) checks. 
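For orientation, the per-sample stages that the SNV-calling step chains together (inferred from the wrapper method names in `process_snv_GATK.py` and `process_freebayes.py`; the exact GATK tool names are the ones recommended by the GATK RNA-seq best practices) are roughly:

```text
STAR 2-pass alignment
  -> picard AddOrReplaceReadGroups + MarkDuplicates
  -> GATK SplitNCigarReads
  -> GATK indel realignment (RealignerTargetCreator + IndelRealigner)
  -> GATK base quality recalibration (BaseRecalibrator + PrintReads)
  -> variant calling (GATK HaplotypeCaller and/or freebayes)
  -> filtering -> snv_filtered.vcf
```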
4 | 5 | * GATK reference: 6 | * [From FastQ data to high confidence variant calls: the Genome Analysis Toolkit best practices pipeline.](http://www.ncbi.nlm.nih.gov/pubmed/25431634) 7 | 8 | * Pipeline schema: 9 | ![Pipeline schema:](./img/workflow.png) 10 | 11 | 12 | ## 13 | 14 | 15 | # STAR alignment and SNV calling from scratch using docker 16 | 17 | ## Requirements 18 | * docker 19 | * possible root access 20 | * 13.8 GB of free memory (docker image) + memory for STAR indexes (usually 20 GB per index) and downloaded data 21 | 22 | ## installation (local) 23 | 24 | ```bash 25 | docker pull opoirion/ssrge 26 | mkdir // 27 | cd // 28 | PATHDATA=`pwd` 29 | ``` 30 | 31 | ## usage 32 | 33 | The pipeline consists of 4 steps for aligning and calling SNVs: 34 | 35 | ```bash 36 | # align and SNV calling 37 | docker run --rm opoirion/ssrge star_index -h 38 | docker run --rm opoirion/ssrge process_star -h 39 | docker run --rm opoirion/ssrge feature_counts -h 40 | docker run --rm opoirion/ssrge process_snv -h 41 | 42 | ``` 43 | 44 | ## example 45 | 46 | Let's download and process 2 samples from GSE79457 in a project name test_n2 47 | 48 | ```bash 49 | # download of the soft file containing the metadata for GSE79457 (see download section) 50 | ## all these data can also be obtained using other alternative workflows 51 | # here you need to precise which read length to use for creating a STAR index and which ref organism (MOUSE/HUMAN) 52 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge star_index -project_name test_n2 -read_length 100 -cell_type HUMAN 53 | # STAR alignment 54 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge process_star -project_name test_n2 -read_length 100 -cell_type HUMAN 55 | # sample-> gene count matrix 56 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge feature_counts -project_name test_n2 57 | #SNV inference 58 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge process_snv -project_name test_n2 -cell_type HUMAN 59 | ``` 60 | 61 | # Installation from github (*not updated!!* => Use the docker image for now) 62 | 63 | ## Requirements 64 | * The pipeline requires that the following programs are installed: 65 | * Linux/ Unix (not tested) working environment 66 | * [python 2 (>=2.7)](https://www.python.org/download/releases/2.7.2/) 67 | * [STAR Aligner](https://github.com/alexdobin/STAR) 68 | * [GATK](https://software.broadinstitute.org/gatk/download/) 69 | * [picard-tools](https://broadinstitute.github.io/picard/) 70 | * [Java (>=1.8)](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) 71 | * [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc) \[OPTIONAL\] 72 | * [featureCounts](http://subread.sourceforge.net/) \[OPTIONAL\] 73 | * [snpEff](http://snpeff.sourceforge.net/) \[OPTIONAL\] 74 | * Appropriate snpEff database should be downloaded and installed (see config.py). (It can be done using snpEff command line, see documentation) 75 | 76 | * For each sample, FASTQ files must be inside a specific folder. Also, all the FASTQ folders must be inside a specific folder. (see config.py file) 77 | * reference genome (.fa file) and gene annotations file (.gtf) must be provided (see config.py file) 78 | * Reference variant files must be also provided for the SNV calling procedure (see config.py file). 
79 | * \[HUMAN\]: 80 | * dbsnp can be downloaded here: [ftp://ftp.ncbi.nih.gov/snp/organisms/](ftp://ftp.ncbi.nih.gov/snp/organisms/) 81 | * additional reference SNV resources can be downloaded here: [ftp://ftp.broadinstitute.org/bundle/2.8/hg19](ftp://ftp.broadinstitute.org/bundle/2.8/hg19) 82 | * \[MOUSE\]: 83 | * Mouse reference variant and indel databases can be downloaded here: [ftp://ftp-mouse.sanger.ac.uk/REL-1303- SNPs_Indels-GRCm38/](ftp://ftp-mouse.sanger.ac.uk/REL-1303- SNPs_Indels-GRCm38/). However, vcf files should probably be resorted toward the mouse reference genome using the sequence dictionnary. 84 | 85 | ## configuration 86 | 87 | move to folder of the git project (https://github.com/lanagarmire/SSrGE.git) 88 | 89 | ```bash 90 | cd SSrGE 91 | ``` 92 | 93 | All the environment variables should be set into the ./garmire_SNV_calling//config.py file 94 | 95 | ## usage 96 | 97 | * Once all the environment variables are defined, one should run the test scripts: 98 | 99 | * [optional] Running all the tests: 100 | * 101 | 102 | ```bash 103 | python test/test_snv.py -v 104 | python test/test_snv_optional.py -v # test optionnal features described above 105 | ``` 106 | 107 | * create a STAR index for the used reference genome and the read length used: 108 | 109 | ```bash 110 | python garmire_SNV_calling/generate_STAR_genome_index.py 111 | ``` 112 | 113 | * Align the reads 114 | 115 | ```bash 116 | python garmire_SNV_calling/deploy_star.py 117 | ``` 118 | 119 | * infer SNVs 120 | 121 | ```bash 122 | python garmire_SNV_calling/process_multiple_snv.py 123 | ``` 124 | 125 | * Check STAR overall quality (generate a csv file with the percentage of unique reads mapped for each sample in OUTPUT_PATH) 126 | 127 | ```bash 128 | python garmire_SNV_calling/check_star_overall_quality.py 129 | ``` 130 | 131 | * generate a fastqc report for each sample \[argument --nb_threads: number of processes in parallel\] 132 | 133 | ```bash 134 | python garmire_SNV_calling/process_fastqc_report.py --nb_threads 135 | ``` 136 | 137 | * Use the FastQC report to generate a csv file in OUTPUT_PATH reporting, for each sample, if the [duplicated test](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/8%20Duplicate%20Sequences.html) of fastqc is passed. 138 | 139 | ```bash 140 | python garmire_SNV_calling/check_fastqc_stats.py 141 | ``` 142 | 143 | * Generate gene expression matrices (raw count) 144 | 145 | ```bash 146 | python garmire_SNV_calling/compute_frequency_matrix.py 147 | ``` 148 | 149 | * Annotate SNV: generate new .vcf files with SNV annotations. \[argument --nb_threads: number of processes in parallel\] 150 | 151 | ```bash 152 | python garmire_SNV_calling/process_annotate_snv.py --nb_threads 153 | ``` 154 | 155 | ## contact and credentials 156 | * Developer: Olivier Poirion (PhD) 157 | * contact: opoirion@hawaii.edu -------------------------------------------------------------------------------- /garmire_SSrGE/extract_data.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | from collections import defaultdict 3 | from collections import Counter 4 | 5 | from bisect import bisect 6 | 7 | import numpy as np 8 | 9 | from garmire_SSrGE.config import EXPRESSION_MATRIX_FOLDER_PATH 10 | from garmire_SSrGE.config import GENE_MATRIX_NAME 11 | from garmire_SSrGE.config import VCF_FOLDER_PATH 12 | from garmire_SSrGE.config import VCF_NAME 13 | 14 | from garmire_SSrGE.load_data import process_line_from_vcf_file 15 | from garmire_SSrGE.load_data import process_line_from_annotated_vcf_file 16 | from garmire_SSrGE.load_data import load_indexes 17 | 18 | 19 | def debug(): 20 | """ DEBUG """ 21 | ExtractData() 22 | 23 | 24 | class ExtractData(): 25 | """ """ 26 | def __init__( 27 | self, 28 | expression_matrix_folder_path=EXPRESSION_MATRIX_FOLDER_PATH, 29 | gene_matrix_name=GENE_MATRIX_NAME, 30 | vcf_folder_path=VCF_FOLDER_PATH, 31 | vcf_name=VCF_NAME): 32 | """ """ 33 | self.expression_matrix_folder_path = expression_matrix_folder_path 34 | self.gene_matrix_name = gene_matrix_name 35 | self.vcf_folder_path = vcf_folder_path 36 | self.vcf_name = vcf_name 37 | 38 | self.index = None 39 | self.position_index = None 40 | self.index_start = None 41 | self.index_end = None 42 | self.snv_id_dict = defaultdict(str) 43 | self._snvs_index = {} 44 | 45 | self.average_expression = defaultdict(list) 46 | 47 | def _load_indexes(self): 48 | """ """ 49 | if isinstance(self.index_end, type(None)): 50 | (self.index_start, 51 | self.index_end, 52 | self.position_index) = load_indexes() 53 | 54 | def _load_annotate_snv_from_vcf(self, snv_path): 55 | """load annotated snv """ 56 | f_snv = open(snv_path, 'r') 57 | 58 | result = defaultdict(list) 59 | 60 | for line in f_snv: 61 | res = process_line_from_annotated_vcf_file(line) 62 | if res: 63 | snvid, snvinfolist = res 64 | result[snvid] = snvinfolist 65 | return result 66 | 67 | def load_snv_from_vcf_file(self, vcf_path): 68 | """ 69 | load snv from a vcf file 70 | 71 | input: 72 | :vcf_path: path to the vcf file 73 | """ 74 | self._load_indexes() 75 | 76 | f_snv = open(vcf_path, 'r') 77 | 78 | wrong_count = 0 79 | good_count = 0 80 | 81 | result = Counter() 82 | 83 | for line in f_snv: 84 | res = process_line_from_vcf_file(line) 85 | 86 | if not res: 87 | continue 88 | 89 | chrid, start, end, snv_id = res 90 | 91 | if chrid not in self.position_index['start']: 92 | continue 93 | 94 | ref_start = bisect(self.position_index['start'][chrid], start) 95 | ref_start_list = self.position_index['start'][chrid][max(ref_start-10, 0): 96 | ref_start] 97 | ref_end = bisect(self.position_index['end'][chrid], end) 98 | ref_end_list = self.position_index['end'][chrid][ref_end: 99 | ref_end+10] 100 | ref_start_from_end = set([en[0] for e in ref_end_list 101 | for en in self.index_end[chrid][e]]) 102 | genes_hit_by_snv = ref_start_from_end\ 103 | .intersection(ref_start_list) 104 | 105 | if genes_hit_by_snv: 106 | 107 | for gene_begin in genes_hit_by_snv: 108 | for gene_end_tuple in self.index_start[chrid][gene_begin]: 109 | gene_end = gene_end_tuple[0] 110 | 111 | if not ((gene_begin < start) and (end < gene_end) ): 112 | wrong_count += 1 113 | continue 114 | 115 | good_count += 1 116 | gene_id = gene_end_tuple[1] 117 | snv_name = (gene_id, start) 118 | 119 | if snv_id: 120 | self.snv_id_dict[(gene_id, start)] = snv_id 121 | 122 | result[snv_name] = 1.0 123 | 124 | snv_index = (chrid, start) 125 | 126 | self._snvs_index[snv_index] = snv_name 127 | 128 | return result 129 | 130 | def load_snv_from_cell(self, folder): 131 | """ 132 | Return 
SNV found as a dict: 133 | Counter(snv_id: 1) 134 | 135 | input: 136 | :folder: str id of the sample 137 | """ 138 | f_path = "{0}/{1}/{2}".format( 139 | self.vcf_folder_path, folder, self.vcf_name) 140 | 141 | return self.load_snv_from_vcf_file(f_path) 142 | 143 | def load_expression_profile_from_cell(self, folder): 144 | """ 145 | Return cell log FPKM as a dict: 146 | Counter(gene_id: expr_profile) 147 | 148 | input: 149 | :folder: str id of the sample 150 | """ 151 | f_path = "{0}/{1}/{2}".format( 152 | self.expression_matrix_folder_path, folder, self.gene_matrix_name) 153 | 154 | fpkm_dict = self.load_expression_profile_from_file(f_path) 155 | 156 | for gid in fpkm_dict: 157 | fpkm_dict[gid] = np.log(1.0 + fpkm_dict[gid]) 158 | 159 | return fpkm_dict 160 | 161 | def get_average_expression_dict(self): 162 | """ """ 163 | for gid in self.average_expression: 164 | 165 | self.average_expression[gid] = np.mean( 166 | self.average_expression[gid]) 167 | 168 | return self.average_expression 169 | 170 | def load_expression_profile_from_file(self, f_path): 171 | """ 172 | load expression profile from file 173 | input: 174 | :f_path: path to the matrix file 175 | """ 176 | count_array = defaultdict(list) 177 | 178 | res = Counter() 179 | 180 | f_expr = open(f_path, 'r') 181 | f_expr.readline() 182 | f_expr.readline() 183 | 184 | tot_nb_read = 0 185 | 186 | for line in f_expr: 187 | line = line.strip('\n').split('\t') 188 | value = float(line[-1]) 189 | g_start = float(line[2].split(';', 1)[0]) 190 | g_end = float(line[3].split(';', 1)[0]) 191 | gene_id = line[0] 192 | 193 | if value: 194 | tot_nb_read += value 195 | pre_fpkm = value / (g_end - g_start) 196 | res[gene_id] = pre_fpkm 197 | 198 | else: 199 | count_array[gene_id].append(value) 200 | 201 | for gid in res: 202 | res[gid] *= 10**9 / tot_nb_read 203 | count_array[gid].append(res[gid]) 204 | 205 | self.average_expression[gid].append(res[gid]) 206 | 207 | return res 208 | 209 | def get_snv_id_dict(self): 210 | """ """ 211 | return defaultdict(str, self.snv_id_dict) 212 | 213 | 214 | if __name__ == "__main__": 215 | debug() 216 | -------------------------------------------------------------------------------- /garmire_SNV_calling/process_freebayes.py: -------------------------------------------------------------------------------- 1 | from config import SAMTOOLS 2 | from config import PYTHON 3 | 4 | from sys import argv 5 | from os.path import isfile 6 | 7 | from time import time 8 | 9 | from garmire_SNV_calling.process_snv_GATK import ProcessGATKSNV 10 | from garmire_SNV_calling.process_snv_GATK import PICARD_DIR 11 | from garmire_SNV_calling.process_snv_GATK import PLATEFORM 12 | from garmire_SNV_calling.process_snv_GATK import ORGANISM 13 | from garmire_SNV_calling.process_snv_GATK import REF_GENOME 14 | from garmire_SNV_calling.process_snv_GATK import DBSNP 15 | from garmire_SNV_calling.process_snv_GATK import VCF_RESOURCES 16 | from garmire_SNV_calling.process_snv_GATK import PROCESS_ID 17 | 18 | from garmire_SNV_calling.config import PATH_OPOSSUM 19 | from garmire_SNV_calling.config import PATH_FREEBAYES 20 | 21 | from garmire_SNV_calling.config import OUTPUT_PATH_GATK 22 | 23 | 24 | if "--do_both_callers" in argv: 25 | DO_BOTH_CALLERS = True 26 | else: 27 | DO_BOTH_CALLERS = False 28 | 29 | if "--path_to_data" in argv: 30 | PATH_TO_DATA = argv[ 31 | argv.index("--path_to_data") + 1] 32 | PATH_OUTPUT = PATH_TO_DATA + '/freebayes/' 33 | else: 34 | from garmire_SNV_calling.config import PATH_OUTPUT 35 | from garmire_SNV_calling.config 
import OUTPUT_PATH_FREEBAYES 36 | 37 | 38 | def main(): 39 | """ """ 40 | process_freebayes = ProcessFreebayesCaller(id=PROCESS_ID) 41 | if DO_BOTH_CALLERS: 42 | process_freebayes.process_ALL_callers() 43 | else: 44 | process_freebayes.process() 45 | 46 | 47 | class ProcessFreebayesCaller(ProcessGATKSNV): 48 | """ """ 49 | def __init__(self, 50 | output_path=OUTPUT_PATH_FREEBAYES, 51 | path_to_data=PATH_OUTPUT, 52 | picard_dir=PICARD_DIR, 53 | plateform=PLATEFORM, 54 | organism=ORGANISM, 55 | path_freebayes=PATH_FREEBAYES, 56 | ref_genome=REF_GENOME, 57 | samtools=SAMTOOLS, 58 | dbsnp=DBSNP, 59 | vcf_resources=VCF_RESOURCES, 60 | output_path_gatk=OUTPUT_PATH_GATK, 61 | respath_gatk=None, 62 | **kwargs): 63 | """ """ 64 | self.output_path_gatk = output_path_gatk 65 | self.path_freebayes =path_freebayes 66 | self.samtools = samtools 67 | 68 | ProcessGATKSNV.__init__( 69 | self, 70 | output_path=output_path, 71 | path_to_data=path_to_data, 72 | picard_dir=picard_dir, 73 | plateform=plateform, 74 | organism=organism, 75 | ref_genome=ref_genome, 76 | dbsnp=dbsnp, 77 | vcf_resources=vcf_resources, 78 | **kwargs) 79 | 80 | self.respath_gatk = respath_gatk 81 | 82 | def process(self, srr_to_process=None): 83 | """ 84 | """ 85 | if srr_to_process: 86 | self.srr_to_process = srr_to_process 87 | 88 | msg = self._init_process() 89 | 90 | if msg: 91 | print(msg) 92 | self.stdout.write(msg) 93 | return 94 | 95 | self._launch_picard_readgroups() 96 | self._launch_picard_markduplicates() 97 | self._launch_gatk_cigar() 98 | self._launch_gatk_realigner_target_creator() 99 | self._launch_gatk_realigner_indel() 100 | self._launch_gatk_base_recalibrator() 101 | self._launch_gatk_print_reads() 102 | self._process_freebayes('recal.bam') 103 | self._finish_process() 104 | self._rm_tmp_file() 105 | 106 | def process_ALL_callers(self, srr_to_process=None): 107 | """ 108 | """ 109 | if srr_to_process: 110 | self.srr_to_process = srr_to_process 111 | 112 | msg = self._init_process() 113 | 114 | if msg: 115 | print(msg) 116 | self.stdout.write(msg) 117 | return 118 | 119 | self._init_process_gatk() 120 | self._launch_picard_readgroups() 121 | self._launch_picard_markduplicates() 122 | self._launch_gatk_cigar() 123 | self._launch_gatk_realigner_target_creator() 124 | self._launch_gatk_realigner_indel() 125 | self._launch_gatk_base_recalibrator() 126 | self._launch_gatk_print_reads() 127 | self._process_freebayes('recal.bam') 128 | self._launch_gatk_variant_calling() 129 | self._launch_gatk_variant_filtering() 130 | 131 | self._finish_process(ext="_GATK", out="_GATK") 132 | self._finish_process(ext="_freebayes", out="_freebayes") 133 | self._rm_tmp_file() 134 | 135 | def _init_process_gatk(self): 136 | """ 137 | """ 138 | if not self.respath_gatk: 139 | self.respath_gatk = self.output_path_gatk + \ 140 | "/data/" + self.srr_to_process 141 | 142 | def _process_samtools_calmd(self, bam_input="Aligned.sortedByCoord.out.bam"): 143 | """ 144 | """ 145 | if self.check_if_output_exists( 146 | "{0}/md.bam".format(self.tmppath)): 147 | return 148 | 149 | self._run_cmd( 150 | 'echo "\n\n######## LAUNCHING SAMTOOLS CALMD ########\n"') 151 | 152 | cmd = "{0} calmd -b {1}/{2} {3} > {1}/md.bam".format( 153 | self.samtools, 154 | self.tmppath, 155 | bam_input, 156 | self.ref_genome) 157 | 158 | self._run_cmd(cmd) 159 | 160 | cmd = "{0} index {1}/md.bam".format( 161 | self.samtools, 162 | self.tmppath) 163 | 164 | self._run_cmd(cmd) 165 | 166 | def _process_opossum(self, bam_input="md.bam"): 167 | """ 168 | " --SoftClipsExist True 
--KeepMismatches True " \
169 |         """
170 |         if self.check_if_output_exists(
171 |                 "{0}/clean.bam".format(self.tmppath)):  # Opossum writes its output to clean.bam
172 |             return
173 | 
174 |         self._run_cmd(
175 |             'echo "\n\n######## LAUNCHING opossum ########\n"')
176 | 
177 |         cmd = "{0} {1}/Opossum.py --BamFile {2}/{3} " \
178 |               " --OutFile {2}/clean.bam ".format(
179 |                   PYTHON,
180 |                   PATH_OPOSSUM,
181 |                   self.tmppath,
182 |                   bam_input
183 |               )
184 | 
185 |         self._run_cmd(cmd)
186 | 
187 |     def _process_freebayes(self, bam_input="clean.bam"):
188 |         """
189 |         """
190 |         if self.check_if_output_exists(
191 |                 "{0}/snv_filtered_freebayes.vcf".format(self.tmppath)):
192 |             return
193 | 
194 |         self._run_cmd(
195 |             'echo "\n\n######## LAUNCHING freebayes ########\n"')
196 | 
197 |         start_time = time()
198 | 
199 |         cmd = "{0} -f {1} {2}/{3} > {2}/snv_filtered_freebayes.vcf".format(
200 |             self.path_freebayes,
201 |             self.ref_genome,
202 |             self.tmppath,
203 |             bam_input
204 |         )
205 | 
206 |         self._run_cmd(cmd)
207 | 
208 |         self._run_cmd(
209 |             'echo "\n## freebayes done in {0} s##\n"'.format(time() - start_time))
210 | 
211 |         assert(isfile("{0}/snv_filtered_freebayes.vcf".format(self.tmppath)))
212 | 
213 | if __name__ == '__main__':
214 |     main()
215 | 
--------------------------------------------------------------------------------
/garmire_SSrGE/extract_matrices_from_dataset.py:
--------------------------------------------------------------------------------
1 | from garmire_SSrGE.extract_data import ExtractData
2 | from garmire_SSrGE.load_data import load_gsm_and_sample_names_from_soft
3 | 
4 | from garmire_SSrGE.config import EXPRESSION_MATRIX_FOLDER_PATH
5 | from garmire_SSrGE.config import VCF_FOLDER_PATH
6 | from garmire_SSrGE.config import GENE_MATRIX_NAME
7 | from garmire_SSrGE.config import VCF_NAME
8 | 
9 | from sklearn.feature_extraction import DictVectorizer
10 | 
11 | from os import listdir
12 | 
13 | from sys import stdout
14 | 
15 | from tabulate import tabulate
16 | 
17 | from collections import Counter
18 | 
19 | 
20 | def debug():
21 |     """ DEBUG """
22 |     extract_matrix = ExtractMatrix()
23 | 
24 |     SNV_mat = extract_matrix.extract_SNV_mat()
25 |     GE_mat = extract_matrix.extract_GE_mat()
26 | 
27 | 
28 | class ExtractMatrix():
29 |     """
30 |     class to extract SNV_mat and GE_mat from an existing dataset
31 | 
32 |     Project variables must be defined in the config file (config.py):
33 | 
34 |     PROJECT_PATH
35 |         # path toward the project folder
36 |     GTF_PATH
37 |         # gtf file of the reference genome
38 |         # (the corresponding index must first be generated
39 |         #  with generate_refgenome_index.py)
40 |     EXPRESSION_MATRIX_FOLDER_PATH
41 |         # Path of the folders containing the gene expression matrices
42 |     GENE_MATRIX_NAME
43 |         # name of the gene expression matrix file
44 |     VCF_FOLDER_PATH
45 |         # Path of the folders containing the vcf files
46 |     VCF_NAME
47 |         # name of the vcf files
48 |     """
49 | 
50 |     def __init__(self,
51 |                  min_shared_snv=None,
52 |                  min_gene_expr=None,
53 |                  min_average_gene_expr=2,
54 |                  vcf_folder_path=VCF_FOLDER_PATH,
55 |                  expression_matrix_folder_path=EXPRESSION_MATRIX_FOLDER_PATH,
56 |                  gene_matrix_name=GENE_MATRIX_NAME,
57 |                  vcf_name=VCF_NAME,
58 |                  limit=None):
59 |         """
60 |         :min_shared_snv: int    min number of cells sharing a given snv
61 |         :min_gene_expr: float   min gene expression value for a given cell
62 |         :min_average_gene_expr: float   min gene expression value,
63 |                                 averaged across cells
64 |         :vcf_folder_path: path to vcf folders (one folder per single cell)
65 |         :expression_matrix_folder_path: path to expression matrices folders (one folder per single cell)
66 |         :gene_matrix_name: name of
the gene expression file for each SC folder 67 | :vcf_name: name of the .vcf file for each SC folder 68 | """ 69 | self.vcf_folder_path = vcf_folder_path 70 | self.expression_matrix_folder_path = expression_matrix_folder_path 71 | self.gene_matrix_name = gene_matrix_name 72 | self.vcf_name = vcf_name 73 | 74 | self.min_shared_snv = min_shared_snv 75 | self.min_gene_expr = min_gene_expr 76 | self.min_average_gene_expr = min_average_gene_expr 77 | 78 | samples_with_vcf = set() 79 | samples_with_ge_mat = set() 80 | 81 | if self.vcf_folder_path: 82 | samples_with_vcf = set(listdir(self.vcf_folder_path)) 83 | 84 | if self.expression_matrix_folder_path: 85 | samples_with_ge_mat = set(listdir(self.expression_matrix_folder_path)) 86 | 87 | if samples_with_vcf and samples_with_ge_mat: 88 | self.samples = list(samples_with_vcf.intersection(samples_with_ge_mat)) 89 | else: 90 | self.samples = list(samples_with_vcf.union(samples_with_ge_mat)) 91 | 92 | if limit: 93 | self.samples = self.samples[:limit] 94 | 95 | self.samples_snv_dict = {} 96 | self.samples_ge_dict = {} 97 | 98 | self.extract_data = ExtractData( 99 | vcf_folder_path=self.vcf_folder_path, 100 | expression_matrix_folder_path=self.expression_matrix_folder_path, 101 | gene_matrix_name=self.gene_matrix_name, 102 | vcf_name=self.vcf_name) 103 | 104 | self.gsm_to_name = load_gsm_and_sample_names_from_soft() 105 | self.names = [] 106 | 107 | for sample in self.samples: 108 | gsm = sample.split('_')[0] 109 | name = self.gsm_to_name[gsm] if gsm in self.gsm_to_name else gsm 110 | self.names.append(name) 111 | 112 | def get_samples_list(self): 113 | """ """ 114 | return self.samples 115 | 116 | def extract_SNV_mat(self): 117 | """ 118 | construct SNV binary matrix (n_samples x n_SNVs), 119 | using the project variables described into the config.py file 120 | 121 | return: 122 | :SNV_mat: Matrix (n_samples x n_SNVs) 123 | """ 124 | 125 | if not self.vcf_folder_path: 126 | return None 127 | 128 | i = 0 129 | 130 | for sample in self.samples: 131 | self.samples_snv_dict[sample] = self.extract_data.\ 132 | load_snv_from_cell(sample) 133 | i += 1 134 | stdout.write('\r{0} / {1} VCF files readed'.format(i, len(self.samples))) 135 | stdout.flush() 136 | 137 | average_snvs = Counter() 138 | 139 | for sample in self.samples_snv_dict: 140 | average_snvs += self.samples_snv_dict[sample] 141 | 142 | if self.min_shared_snv: 143 | for sample in self.samples_snv_dict: 144 | for snv in self.samples_snv_dict[sample].keys(): 145 | if average_snvs[snv] < self.min_shared_snv: 146 | self.samples_snv_dict[sample].pop(snv) 147 | 148 | tab = [] 149 | 150 | for sample, name in zip(self.samples, self.names): 151 | tab.append((sample, 152 | name, 153 | len(self.samples_snv_dict[sample]))) 154 | 155 | print('\n', tabulate(tab, headers=['sample', 'name', 'Number of SNVs'])) 156 | 157 | vectorizer = DictVectorizer() 158 | 159 | f_matrix = vectorizer.fit_transform([self.samples_snv_dict[sample] 160 | for sample in self.samples]) 161 | self.snv_index = vectorizer.vocabulary_ 162 | 163 | print('number of SNVs in the dataset:', len(self.snv_index)) 164 | 165 | return f_matrix 166 | 167 | def extract_GE_mat(self): 168 | """ 169 | construct GE matrix (n_genes x n_samples), 170 | using the project variables described into the config.py file 171 | 172 | return: 173 | :GE_mat: Matrix (n_genes x n_samples) 174 | """ 175 | 176 | if not self.expression_matrix_folder_path: 177 | return None 178 | 179 | i = 0 180 | 181 | for sample, name in zip(self.samples, self.names): 182 | 
self.samples_ge_dict[sample] = self.extract_data.\ 183 | load_expression_profile_from_cell(sample) 184 | i += 1 185 | stdout.write('\r{0} / {1} expression files readed'.format(i, len(self.samples))) 186 | stdout.flush() 187 | 188 | average_expr = self.extract_data.get_average_expression_dict() 189 | 190 | tab = [] 191 | 192 | if self.min_gene_expr or self.min_average_gene_expr: 193 | for sample in self.samples_ge_dict: 194 | for gene in self.samples_ge_dict[sample].keys(): 195 | 196 | if self.min_average_gene_expr \ 197 | and average_expr[gene] < self.min_average_gene_expr: 198 | self.samples_ge_dict[sample].pop(gene) 199 | 200 | if self.min_gene_expr \ 201 | and self.samples_ge_dict[sample][gene] < self.min_gene_expr: 202 | self.samples_ge_dict[sample].pop(gene, None) 203 | 204 | for sample, name in zip(self.samples, self.names): 205 | tab.append((sample, 206 | name, 207 | len(self.samples_ge_dict[sample]))) 208 | 209 | print('\n', tabulate(tab, headers=['sample', 'name', 'Number of genes'])) 210 | 211 | vectorizer = DictVectorizer() 212 | f_matrix = vectorizer.fit_transform([self.samples_ge_dict[sample] 213 | for sample in self.samples]) 214 | self.ge_index = vectorizer.vocabulary_ 215 | 216 | print('number of genes in the dataset:', len(self.ge_index)) 217 | 218 | return f_matrix.T 219 | 220 | 221 | if __name__ == "__main__": 222 | debug() 223 | -------------------------------------------------------------------------------- /example/jones_pancreatic_cancer.soft: -------------------------------------------------------------------------------- 1 | ^DATABASE = GeoMiame 2 | !Database_name = Gene Expression Omnibus (GEO) 3 | !Database_institute = NCBI NLM NIH 4 | !Database_web_link = http://www.ncbi.nlm.nih.gov/geo 5 | !Database_email = geo@ncbi.nlm.nih.gov 6 | ^SERIES = GSE85183 7 | !Series_title = Selective single cell isolation for genomics using microraft arrays 8 | !Series_geo_accession = GSE85183 9 | !Series_status = Public on Aug 05 2016 10 | !Series_submission_date = Aug 04 2016 11 | !Series_last_update_date = Aug 05 2016 12 | !Series_summary = Genomic methods are used increasingly to interrogate the individual cells that compose specific tissues. However, current methods for single cell isolation struggle to phenotypically differentiate specific cells in a heterogeneous population and rely primarily on the use of fluorescent markers. Many cellular phenotypes of interest are too complex to be measured by this approach, making it difficult to connect genotype and phenotype at the level of individual cells. Here we demonstrate that microraft arrays, which are arrays containing thousands of individual cell culture sites, can be used to select single cells based on a variety of phenotypes, such as cell surface markers, cell proliferation and drug response. We then show that a common genomic procedure, RNA-seq, can be readily adapted to the single cells isolated from these rafts. We show that data generated using microrafts and our modified RNA-seq protocol compared favorably with the Fluidigm C1. We then used microraft arrays to select pancreatic cancer cells that proliferate in spite of cytotoxic drug treatment. Our single cell RNA-seq data identified several expected and novel gene expression changes associated with early drug resistance. 
13 | !Series_overall_design = 120 samples including cells isolated using microrafts and the Fluidigm C1 14 | !Series_type = Expression profiling by high throughput sequencing 15 | !Series_contributor = Joshua,D,Welch 16 | !Series_contributor = Corbin,D,Jones 17 | !Series_sample_id = GSM2259781 18 | !Series_sample_id = GSM2259782 19 | !Series_sample_id = GSM2259783 20 | !Series_contact_name = Corbin,D.,Jones 21 | !Series_contact_email = cdjones@email.unc.edu 22 | !Series_contact_institute = The University of North Carolina at Chapel Hill 23 | !Series_contact_address = 3159 Genome Sciences Building 24 | !Series_contact_city = Chapel Hill 25 | !Series_contact_state = NC 26 | !Series_contact_zip/postal_code = 27599 27 | !Series_contact_country = USA 28 | !Series_supplementary_file = ftp://ftp.ncbi.nlm.nih.gov/pub/geo/DATA/supplementary/series/GSE85183/GSE85183_expression_levels.txt.gz 29 | !Series_supplementary_file = ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/SRP080/SRP080915 30 | !Series_platform_id = GPL15520 31 | !Series_platform_taxid = 9606 32 | !Series_sample_taxid = 9606 33 | !Series_relation = BioProject: http://www.ncbi.nlm.nih.gov/bioproject/PRJNA336476 34 | !Series_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=SRP080915 35 | ^PLATFORM = GPL15520 36 | !Platform_title = Illumina MiSeq (Homo sapiens) 37 | !Platform_geo_accession = GPL15520 38 | !Platform_status = Public on May 02 2012 39 | !Platform_submission_date = May 02 2012 40 | !Platform_last_update_date = Aug 05 2016 41 | !Platform_technology = high-throughput sequencing 42 | !Platform_distribution = virtual 43 | !Platform_organism = Homo sapiens 44 | !Platform_taxid = 9606 45 | !Platform_contact_name = ,,GEO 46 | !Platform_contact_country = USA 47 | !Platform_data_row_count = 0 48 | ^SAMPLE = GSM2259781 49 | !Sample_title = mch2-1_TAAGGCG-TATCCTC_L001_RNA-seq 50 | !Sample_geo_accession = GSM2259781 51 | !Sample_status = Public on Aug 05 2016 52 | !Sample_submission_date = Aug 04 2016 53 | !Sample_last_update_date = Aug 05 2016 54 | !Sample_type = SRA 55 | !Sample_channel_count = 1 56 | !Sample_source_name_ch1 = CFPAC-1_None_Bulk 57 | !Sample_organism_ch1 = Homo sapiens 58 | !Sample_taxid_ch1 = 9606 59 | !Sample_characteristics_ch1 = cell line: CFPAC-1 60 | !Sample_characteristics_ch1 = treated with: None 61 | !Sample_characteristics_ch1 = isolation: Bulk 62 | !Sample_characteristics_ch1 = num. 
cells: ~10000 63 | !Sample_characteristics_ch1 = proliferative?: Unknown 64 | !Sample_molecule_ch1 = total RNA 65 | !Sample_extract_protocol_ch1 = ClonTech SMARTer kit 66 | !Sample_extract_protocol_ch1 = Nextera XT 67 | !Sample_description = RNA 68 | !Sample_data_processing = Read alignment using MapSplice 2 69 | !Sample_data_processing = Gene expression quantification using RSEM 70 | !Sample_data_processing = Genome_build: hg19 71 | !Sample_data_processing = Supplementary_files_format_and_content: Gene expression quantification (FPKMs) 72 | !Sample_platform_id = GPL15520 73 | !Sample_contact_name = Corbin,D.,Jones 74 | !Sample_contact_email = cdjones@email.unc.edu 75 | !Sample_contact_institute = The University of North Carolina at Chapel Hill 76 | !Sample_contact_address = 3159 Genome Sciences Building 77 | !Sample_contact_city = Chapel Hill 78 | !Sample_contact_state = NC 79 | !Sample_contact_zip/postal_code = 27599 80 | !Sample_contact_country = USA 81 | !Sample_instrument_model = Illumina MiSeq 82 | !Sample_library_selection = cDNA 83 | !Sample_library_source = transcriptomic 84 | !Sample_library_strategy = RNA-Seq 85 | !Sample_relation = BioSample: http://www.ncbi.nlm.nih.gov/biosample/SAMN05511259 86 | !Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=SRX1999927 87 | !Sample_supplementary_file_1 = ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX199/SRX1999927 88 | !Sample_series_id = GSE85183 89 | !Sample_data_row_count = 0 90 | ^SAMPLE = GSM2259782 91 | !Sample_title = MCH2-2_AGGCAGA-AAGGAGT_L001_RNA-seq 92 | !Sample_geo_accession = GSM2259782 93 | !Sample_status = Public on Aug 05 2016 94 | !Sample_submission_date = Aug 04 2016 95 | !Sample_last_update_date = Aug 05 2016 96 | !Sample_type = SRA 97 | !Sample_channel_count = 1 98 | !Sample_source_name_ch1 = CFPAC-1_None_Bulk 99 | !Sample_organism_ch1 = Homo sapiens 100 | !Sample_taxid_ch1 = 9606 101 | !Sample_characteristics_ch1 = cell line: CFPAC-1 102 | !Sample_characteristics_ch1 = treated with: None 103 | !Sample_characteristics_ch1 = isolation: Bulk 104 | !Sample_characteristics_ch1 = num. 
cells: ~10000 105 | !Sample_characteristics_ch1 = proliferative?: Unknown 106 | !Sample_molecule_ch1 = total RNA 107 | !Sample_extract_protocol_ch1 = ClonTech SMARTer kit 108 | !Sample_extract_protocol_ch1 = Nextera XT 109 | !Sample_description = RNA 110 | !Sample_data_processing = Read alignment using MapSplice 2 111 | !Sample_data_processing = Gene expression quantification using RSEM 112 | !Sample_data_processing = Genome_build: hg19 113 | !Sample_data_processing = Supplementary_files_format_and_content: Gene expression quantification (FPKMs) 114 | !Sample_platform_id = GPL15520 115 | !Sample_contact_name = Corbin,D.,Jones 116 | !Sample_contact_email = cdjones@email.unc.edu 117 | !Sample_contact_institute = The University of North Carolina at Chapel Hill 118 | !Sample_contact_address = 3159 Genome Sciences Building 119 | !Sample_contact_city = Chapel Hill 120 | !Sample_contact_state = NC 121 | !Sample_contact_zip/postal_code = 27599 122 | !Sample_contact_country = USA 123 | !Sample_instrument_model = Illumina MiSeq 124 | !Sample_library_selection = cDNA 125 | !Sample_library_source = transcriptomic 126 | !Sample_library_strategy = RNA-Seq 127 | !Sample_relation = BioSample: http://www.ncbi.nlm.nih.gov/biosample/SAMN05511258 128 | !Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=SRX1999928 129 | !Sample_supplementary_file_1 = ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX199/SRX1999928 130 | !Sample_series_id = GSE85183 131 | !Sample_data_row_count = 0 132 | ^SAMPLE = GSM2259783 133 | !Sample_title = RAFTSE10_CGTACTA-GCGTAAG_L001_RNA-seq 134 | !Sample_geo_accession = GSM2259783 135 | !Sample_status = Public on Aug 05 2016 136 | !Sample_submission_date = Aug 04 2016 137 | !Sample_last_update_date = Aug 05 2016 138 | !Sample_type = SRA 139 | !Sample_channel_count = 1 140 | !Sample_source_name_ch1 = CFPAC-1_None_Microraft 141 | !Sample_organism_ch1 = Homo sapiens 142 | !Sample_taxid_ch1 = 9606 143 | !Sample_characteristics_ch1 = cell line: CFPAC-1 144 | !Sample_characteristics_ch1 = treated with: None 145 | !Sample_characteristics_ch1 = isolation: Microraft 146 | !Sample_characteristics_ch1 = num. 
cells: 0
147 | !Sample_characteristics_ch1 = proliferative?: Unknown
148 | !Sample_molecule_ch1 = total RNA
149 | !Sample_extract_protocol_ch1 = ClonTech SMARTer kit
150 | !Sample_extract_protocol_ch1 = Nextera XT
151 | !Sample_description = RNA
152 | !Sample_data_processing = Read alignment using MapSplice 2
153 | !Sample_data_processing = Gene expression quantification using RSEM
154 | !Sample_data_processing = Genome_build: hg19
155 | !Sample_data_processing = Supplementary_files_format_and_content: Gene expression quantification (FPKMs)
156 | !Sample_platform_id = GPL15520
157 | !Sample_contact_name = Corbin,D.,Jones
158 | !Sample_contact_email = cdjones@email.unc.edu
159 | !Sample_contact_institute = The University of North Carolina at Chapel Hill
160 | !Sample_contact_address = 3159 Genome Sciences Building
161 | !Sample_contact_city = Chapel Hill
162 | !Sample_contact_state = NC
163 | !Sample_contact_zip/postal_code = 27599
164 | !Sample_contact_country = USA
165 | !Sample_instrument_model = Illumina MiSeq
166 | !Sample_library_selection = cDNA
167 | !Sample_library_source = transcriptomic
168 | !Sample_library_strategy = RNA-Seq
169 | !Sample_relation = BioSample: http://www.ncbi.nlm.nih.gov/biosample/SAMN05511257
170 | !Sample_relation = SRA: http://www.ncbi.nlm.nih.gov/sra?term=SRX1999929
171 | !Sample_supplementary_file_1 = ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX199/SRX1999929
172 | !Sample_series_id = GSE85183
173 | !Sample_data_row_count = 0
174 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SSrGE procedure
2 | 
3 | This procedure fits sparse linear models using a binary matrix (n_samples x n_SNV) as the feature matrix and a gene expression matrix (n_genes x n_samples) as the response. The procedure infers a sparse linear model (LASSO by default) for each gene (row of the second matrix) and keeps the non-zero inferred coefficients.
4 | 
5 | This procedure can be used for dimension reduction/feature selection or for feature ranking. It is based on the Scikit-Learn library and is easy to re-implement. However, the package also parallelizes the fitting procedures, implements a cross-validation procedure and computes eeSNV and gene rankings.
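The core of the procedure is simple to re-implement: one sparse regression per gene, with the SNV matrix as features. A minimal sketch of the idea using scikit-learn's Lasso (illustrative only; the function below is not part of the package API):

```python
import numpy as np
from sklearn.linear_model import Lasso

def fit_eeSNVs_sketch(SNV_mat, GE_mat, alpha=0.1):
    """SNV_mat: (n_samples x n_SNVs), binary; GE_mat: (n_genes x n_samples)"""
    eeSNV_weights = np.zeros(SNV_mat.shape[1])

    for gene_expr in GE_mat:  # one sparse model per gene (row of GE_mat)
        model = Lasso(alpha=alpha)
        model.fit(SNV_mat, gene_expr)
        eeSNV_weights += np.abs(model.coef_)  # accumulate absolute coefficients

    return eeSNV_weights  # SNVs with a non-zero total weight are the eeSNVs
```

(The actual `SSrGE` class wraps this loop with multiprocessing, a per-model time limit and the ranking utilities described below.)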
6 | 
7 | SSrGE can be used as a stand-alone procedure to reduce any SNV matrix (row: single-cell, col: SNV (binary)), using a gene expression matrix (row: gene expression (float), col: single-cell). However, we have developed two additional modules, included in this package, that can be used to download and process RNA-seq data:
8 | * [download_ncbi_data](https://github.com/lanagarmire/SSrGE/blob/master/README_download_ncbi_rsa.md): download and extract .sra files from NCBI
9 | * [SNV_calling](https://github.com/lanagarmire/SSrGE/blob/master/README_snv_calling.md): align reads/infer SNVs and infer gene expression matrices from .fastq files.
10 | 
11 | Alternatively, we compiled the download, alignment, and SNV calling pipelines into a docker container: opoirion/ssrge (see below).
12 | 
13 | 
14 | ## installation (local)
15 | 
16 | ```bash
17 | git clone https://github.com/lanagarmire/SSrGE.git
18 | cd SSrGE
19 | pip2 install -r requirements.txt --user # python 2.7.X must be used
20 | ```
21 | 
22 | ## Requirements
23 | * Linux working environment
24 | * [python 2 (>=2.7)](https://www.python.org/download/releases/2.7.2/)
25 | * Python libraries (automatically installed with the pip install command):
26 |   * Numpy
27 |   * Scipy
28 |   * [Scikit-learn](http://scikit-learn.org/) (version = 0.18)
29 |   * tabulate
30 | 
31 | ## usage
32 | * test SSrGE is functional:
33 | ```bash
34 | python2 test/test_ssrge.py -v
35 | ```
36 | 
37 | * Instantiate and fit SSrGE:
38 | 
39 | SSrGE should be used as a python package; below is a usage example.
40 | SSrGE takes as input two matrices: a SNV matrix X (n_cells x n_SNVs) and a gene expression matrix (n_cells x n_Genes).
41 | In the original study, we encoded X with the following procedure:
42 | if a given snv (s) is present in a given cell (c), then X_c,s = 1.
43 | However, any type of encoding or continuous values can be used (for example, one can use X_c,s = 1 for a 1/1 genotype and 0.5 for a 0/1 genotype).
44 | 
45 | ```python
46 | from garmire_SSrGE.ssrge import SSrGE
47 | from garmire_SSrGE.examples import create_example_matrix_v1 # create example matrices
48 | 
49 | 
50 | help(SSrGE) # See the different functions and specific variables
51 | help(create_example_matrix_v1)
52 | 
53 | X, Y, W = create_example_matrix_v1()
54 | 
55 | ssrge = SSrGE()
56 | 
57 | ssrge.fit(X, Y)
58 | 
59 | score_models, score_null_models = ssrge.score(X, Y)
60 | 
61 | X_r = ssrge.transform(X)
62 | 
63 | print X_r.shape, X.shape
64 | 
65 | ranked_feature = ssrge.rank_eeSNVs()
66 | 
67 | ssrge_ES = SSrGE(model='ElasticNet', alpha=0.1, l1_ratio=0.5) # Fitting using sklearn ElasticNet instead
68 | ssrge_ES.fit(X, Y)
69 | 
70 | ```
71 | 
72 | * Add CNV matrix:
73 | 
74 | The fit method can take an additional CNV matrix of shape (n_cells x n_genes), describing the CNV level for each gene.
75 | 
76 | ```python
77 | from garmire_SSrGE.examples import create_example_matrix_v3
78 | 
79 | X, Y, C, W = create_example_matrix_v3()
80 | 
81 | help(ssrge.fit) # see the specific documentation of the fit method
82 | ssrge.fit(X, Y, C)
83 | ```
84 | 
85 | * Rank eeSNVs:
86 | 
87 | ```python
88 | ranked_feature = ssrge.rank_eeSNVs()
89 | ```
90 | 
91 | * Performing cross-validation:
92 | 
93 | ```python
94 | from garmire_SSrGE.linear_cross_validation import LinearCrossVal
95 | 
96 | help(LinearCrossVal)
97 | 
98 | X, Y, W = create_example_matrix_v1()
99 | 
100 | cross_val = LinearCrossVal(
101 |     model='LASSO',
102 |     SNV_mat=X,
103 |     GE_mat=Y
104 | )
105 | 
106 | path = cross_val.regularization_path('alpha', [0.01, 0.1, 0.2])
107 | ```
108 | 
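Note that the model is selected by name: besides `'LASSO'` and `'ElasticNet'` (shown above), `ssrge.py` also recognizes `'LassoLars'`, `'OMP'` and `'LassoCV'` (the latter selects alpha internally by cross-validation, controlled by the `n_alphas` argument); alternatively, any scikit-learn-like regressor class can be passed directly as `model`, together with its parameters in `model_params`. For example:

```python
ssrge_cv = SSrGE(model='LassoCV', n_alphas=50) # alpha chosen internally by CV
ssrge_cv.fit(X, Y)
```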
109 | ## Use K top-ranked eeSNVs
110 | 
111 | Instead of relying on the regularization parameter (alpha) to select the number of eeSNVs, the `nb_ranked_features` argument can be specified to obtain a fixed number of eeSNVs (assuming that nb_ranked_features is lower than the number of eeSNVs obtained with the specified alpha).
112 | 
113 | ```python
114 | ssrge_topk = SSrGE(nb_ranked_features=2)
115 | X_r_2 = ssrge_topk.fit_transform(X, Y)
116 | 
117 | print X_r_2.shape # (100, 2)
118 | 
119 | ```
120 | 
121 | ## Ranking genes using eeSNVs and providing SNV ids
122 | 
123 | In order to rank genes with eeSNVs, the SSrGE instance must be instantiated with the SNV id and gene id lists:
124 | 
125 | * the gene id order should correspond to the gene order of the gene expression matrix
126 | * a SNV id should be a tuple containing the id of the gene harboring the given SNV and a user-defined SNV id (genome position for example).
127 | 
128 | ```python
129 | gene_id_list_example = ['KRAS', 'HLA-A', 'SPARC']
130 | snv_id_list_example = [('KRAS', 10220), ('KRAS', 10520), ('SPARC', 220)]
131 | 
132 | 
133 | ## real example
134 | from garmire_SSrGE.examples import create_example_matrix_v2
135 | 
136 | X, Y, gene_id_list, snv_id_list = create_example_matrix_v2()
137 | 
138 | ssrge = SSrGE(
139 |     snv_id_list=snv_id_list,
140 |     gene_id_list=gene_id_list,
141 |     nb_ranked_features=2,
142 |     alpha=0.01)
143 | 
144 | ssrge.fit(X, Y)
145 | 
146 | print ssrge.rank_genes()
147 | 
148 | ```
149 | 
150 | ## Analyzing a subgroup
151 | 
152 | Extract the specific eeSNVs and impacted genes of a given subgroup. A given eeSNV is specific to a subgroup if it is significantly more present amongst the cells of that subgroup:
153 | 
154 | ```python
155 | 
156 | # Define the first 6 cells of X as a subgroup
157 | subgroup = ssrge.rank_features_for_a_subgroup([0, 1, 2, 3, 4, 5])
158 | 
159 | print subgroup.ranked_genes
160 | print subgroup.ranked_eeSNVs
161 | 
162 | print subgroup.significant_genes
163 | print subgroup.significant_eeSNVs
164 | 
165 | ```
166 | 
167 | ## create SNV and GE matrices from .VCF files and gene expression files
168 | 
169 | It is possible to create an SNV matrix from preexisting .vcf files and, similarly, a gene expression matrix from expression files.
170 | 
171 | Each cell must have a distinct .vcf file with a unique name (e.g. snv_filtered.vcf) inside a unique folder specific to the cell, named after the cell:
172 | 
173 | * example:
174 | 
175 | ```bash
176 | 
177 | data
178 | |-- GSM2259781__SRX1999927__SRR3999457
179 | |   |-- snv_filtered.vcf
180 | |   `-- stdout.log
181 | `-- GSM2259782__SRX1999928__SRR3999458
182 |     |-- snv_filtered.vcf
183 |     `-- stdout.log
184 | 
185 | ```
186 | 
187 | (stdout.log is not used; it was created by the previous analysis)
188 | 
189 | and similarly for the gene expression files (matrix_counts.txt):
190 | 
191 | ```bash
192 | 
193 | STAR
194 | |-- GSM2259781__SRX1999927__SRR3999457
195 | |   |-- matrix_counts.txt
196 | |   `-- matrix_counts.txt.summary
197 | `-- GSM2259782__SRX1999928__SRR3999458
198 |     |-- matrix_counts.txt
199 |     `-- matrix_counts.txt.summary
200 | 
201 | ```
202 | 
203 | (matrix_counts.txt.summary is not used; it was created by the previous analysis)
204 | 
205 | * The supported expression file format is the following:
206 | 
207 | ```bash
208 | 
209 | #gene_name    chromosomes    starting position    ending position    additional columns    gene expression
210 | MIR6859-3    chr1;chr15;chr16    17369;102513727;67052    17436;102513794;67119    ...    200
211 | ```
212 | 
213 | * variables (paths and file names) specific to GE and SNV matrix extraction can be defined in the config file: garmire_SSrGE/config.py
214 | * First, a GTF index must be created:
215 | 
216 | ```bash
217 | python2 ./garmire_SSrGE/generate_refgenome_index.py
218 | ```
219 | 
220 | * Once the index is generated, the matrices can be generated easily:
221 | 
222 | 
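For example, from python (this mirrors the `debug()` helper shipped in garmire_SSrGE/extract_matrices_from_dataset.py; the paths are taken from the config file):

```python
from garmire_SSrGE.extract_matrices_from_dataset import ExtractMatrix

extract_matrix = ExtractMatrix() # uses the paths defined in garmire_SSrGE/config.py

SNV_mat = extract_matrix.extract_SNV_mat() # (n_samples x n_SNVs), binary
GE_mat = extract_matrix.extract_GE_mat()   # (n_genes x n_samples), log expression
```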
223 | # SRA project download, STAR alignment and SNV calling from scratch using docker
224 | 
225 | ## Requirements
226 | * docker
227 | * possible root access
228 | * 13.8 GB of free disk space (docker image) + space for the STAR indexes (usually 20 GB per index) and the downloaded data
229 | 
230 | ## installation (local)
231 | 
232 | ```bash
233 | docker pull opoirion/ssrge
234 | mkdir <data folder> # folder that will hold the downloaded and processed data
235 | cd <data folder>
236 | PATHDATA=`pwd`
237 | ```
238 | 
239 | ## usage
240 | 
241 | The pipeline consists of 3 steps for downloading the data and 4 steps for aligning and calling SNVs:
242 | 
243 | ```bash
244 | # Download
245 | docker run --rm opoirion/ssrge download_soft_file -h
246 | docker run --rm opoirion/ssrge download_sra -h
247 | docker run --rm opoirion/ssrge extract_sra -h
248 | # align and SNV calling
249 | docker run --rm opoirion/ssrge star_index -h
250 | docker run --rm opoirion/ssrge process_star -h
251 | docker run --rm opoirion/ssrge feature_counts -h
252 | docker run --rm opoirion/ssrge process_snv -h
253 | 
254 | ```
255 | 
256 | ## example
257 | 
258 | Let's download and process 2 samples from GSE79457 under the project name test_n2:
259 | 
260 | ```bash
261 | # download the soft file containing the metadata for GSE79457
262 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_soft_file -project_name test_n2 -soft_id GSE79457
263 | # download the sra files
264 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge download_sra -project_name test_n2 -max_nb_samples 2
265 | # extract the sra files
266 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge extract_sra -project_name test_n2
267 | # rm the sra files (optional)
268 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge rm_sra -project_name test_n2
269 | ## all these data can also be obtained using other, alternative workflows
270 | # here you need to specify which read length to use for creating a STAR index and which reference organism (MOUSE/HUMAN)
271 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge star_index -project_name test_n2 -read_length 100 -cell_type HUMAN
272 | # STAR alignment
273 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge process_star -project_name test_n2 -read_length 100 -cell_type HUMAN
274 | # sample -> gene count matrix
275 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge feature_counts -project_name test_n2
276 | # SNV inference
277 | docker run --rm -v $PATHDATA:/data/results/:Z opoirion/ssrge process_snv -project_name test_n2 -cell_type HUMAN
278 | ```
279 | 
280 | 
281 | ## contact and credentials
282 | * Developer: Olivier Poirion (PhD)
283 | * contact: opoirion@hawaii.edu
--------------------------------------------------------------------------------
/garmire_SSrGE/multiprocess_fitting.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Queue
2 | from multiprocessing import Process
3 | 
4 | from contextlib import contextmanager
5 | import signal
6 | 
7 | import numpy as np
8 | 
9 | from time import sleep
10 | from sys import stdout
11 | 
12 | from garmire_SSrGE.config import MIN_OBS_FOR_REGRESS
13 | from
garmire_SSrGE.config import TIME_LIMIT 14 | 15 | from collections import Counter 16 | 17 | from numpy import hstack 18 | from scipy.sparse import hstack as shstack 19 | from scipy.sparse import issparse 20 | 21 | 22 | import warnings 23 | 24 | from time import time 25 | 26 | 27 | class TimeoutException(Exception): pass 28 | 29 | 30 | @contextmanager 31 | def time_limit(seconds): 32 | warnings.catch_warnings() 33 | warnings.simplefilter("ignore") 34 | 35 | def signal_handler(signum, frame): 36 | raise TimeoutException("Timed out!") 37 | 38 | signal.signal(signal.SIGALRM, signal_handler) 39 | signal.alarm(seconds) 40 | 41 | try: 42 | yield 43 | finally: 44 | signal.alarm(0) 45 | 46 | 47 | def debug(): 48 | """ 49 | #### DEBUG #### 50 | **** Test function **** 51 | """ 52 | from garmire_SSrGE.examples import create_example_matrix_v3 53 | from sklearn.linear_model import Lasso 54 | 55 | 56 | X, Y, C, W = create_example_matrix_v3() 57 | 58 | multi_test = BatchFitting(I_mat=X, 59 | O_mat=Y.T, 60 | CNV_mat=C, 61 | model=Lasso, 62 | model_params={'alpha': 0.01}, 63 | nb_processes=1, 64 | only_nonzero=False, 65 | min_obs_for_regress=0, 66 | cis_model=None) 67 | g_index, coefs, intercepts = multi_test.run() 68 | 69 | return g_index, coefs, intercepts 70 | 71 | class MultiProcessFitting(Process): 72 | def __init__(self, 73 | input_queue, 74 | output_queue, 75 | model, 76 | model_params, 77 | matrix, 78 | process_id, 79 | CNV_mat=None, 80 | time_limit=TIME_LIMIT, 81 | min_obs_for_regress=MIN_OBS_FOR_REGRESS, 82 | only_nonzero=False, 83 | cis_model=None): 84 | """ """ 85 | Process.__init__(self) 86 | self.input_queue = input_queue 87 | self.output_queue = output_queue 88 | self.model = model 89 | self.model_params = model_params 90 | self.matrix = matrix 91 | self.CNV_mat = CNV_mat 92 | self.process_id = process_id 93 | self.only_nonzero = only_nonzero 94 | self.cis_model = cis_model 95 | self.time_limit = time_limit 96 | self.min_obs_for_regress = min_obs_for_regress 97 | 98 | def run(self): 99 | """ """ 100 | model = self.model(**self.model_params) 101 | 102 | while not self.input_queue.empty(): 103 | try: 104 | gene_i, y, data = self.input_queue.get(True, 0.1) 105 | except Exception as e: 106 | continue 107 | 108 | index = None 109 | 110 | if self.only_nonzero: 111 | matrix, y, index = self._clean_matrix(y) 112 | else: 113 | matrix = self.matrix 114 | 115 | if self.cis_model: 116 | matrix = self._matrix_to_cis_model(matrix, gene_i) 117 | 118 | if self.CNV_mat is not None: 119 | matrix = self._add_cnv(matrix, y, gene_i, index) 120 | 121 | if y.shape[0] > self.min_obs_for_regress and \ 122 | not isinstance(matrix, type(None)): 123 | try: 124 | with time_limit(self.time_limit): 125 | model.fit(X=matrix, y=y, **data) 126 | except Exception as e: 127 | intercept = np.nan 128 | coefs = np.empty(self.matrix.shape[1]) 129 | coefs[:] = np.nan 130 | print('\n exception found for linear model:{0}\n skipping'\ 131 | .format(e)) 132 | else: 133 | if self.cis_model: 134 | coefs = np.zeros(self.matrix.shape[1]) 135 | coefs[self.cis_model[gene_i]] = model.coef_ 136 | else: 137 | coefs = model.coef_ 138 | 139 | intercept = model.intercept_ 140 | 141 | else: 142 | intercept = np.nan 143 | coefs = np.empty(self.matrix.shape[1]) 144 | coefs[:] = np.nan 145 | 146 | coefs = Counter({i:np.abs(coefs[i]) 147 | for i in np.nonzero(np.nan_to_num(coefs))[0]}) 148 | 149 | while True: 150 | try: 151 | self.output_queue.put((gene_i, coefs, intercept), timeout=0.1) 152 | except Exception as e: 153 | continue 154 | else: 155 | 
break 156 | 157 | def _matrix_to_cis_model(self, matrix, gene_i): 158 | """ """ 159 | if not self.cis_model[gene_i]: 160 | return None 161 | return matrix.T[self.cis_model[gene_i]].T 162 | 163 | def _clean_matrix(self, y): 164 | """ """ 165 | index = np.nonzero(y)[0] 166 | return self.matrix[index], y[index], index 167 | 168 | def _add_cnv(self, matrix, y, gene_i, index): 169 | """ """ 170 | stack = shstack if issparse(matrix) else hstack 171 | CNV_mat = self.CNV_mat[index] 172 | 173 | return stack([matrix, CNV_mat.T[gene_i].T.reshape((matrix.shape[0], 1))]) 174 | 175 | 176 | class BatchFitting(): 177 | """ """ 178 | def __init__( 179 | self, 180 | I_mat, 181 | O_mat, 182 | model, 183 | model_params, 184 | CNV_mat=None, 185 | nb_processes=1, 186 | time_limit=TIME_LIMIT, 187 | min_obs_for_regress=MIN_OBS_FOR_REGRESS, 188 | add_y_index=False, 189 | only_nonzero=False, 190 | cis_model=None, 191 | ): 192 | self.I_mat = I_mat 193 | self.O_mat = O_mat 194 | self.CNV_mat = CNV_mat 195 | self.model = model 196 | self.model_params = model_params 197 | self.nb_processes = nb_processes 198 | self.add_y_index = add_y_index 199 | self.only_nonzero = only_nonzero 200 | self.cis_model = cis_model 201 | self.time_limit = time_limit 202 | self.min_obs_for_regress = min_obs_for_regress 203 | 204 | def run(self): 205 | """ run batch fitting """ 206 | 207 | res = self._run() 208 | 209 | if isinstance(res, Exception): 210 | raise res 211 | 212 | return res 213 | 214 | def _kill_processes(self): 215 | """ """ 216 | for process in self.processes_list: 217 | process.terminate() 218 | 219 | def _get_qsize(self, output_queue): 220 | """ """ 221 | while True: 222 | try: 223 | with time_limit(self.time_limit): 224 | try: 225 | out_qsize = output_queue.qsize() 226 | except NotImplementedError: 227 | out_qsize = None 228 | break 229 | except Exception as e: 230 | print('exception was found for qsize:', e) 231 | continue 232 | 233 | return out_qsize 234 | 235 | def _run(self): 236 | """custom unordered multiprocessing""" 237 | input_queue = Queue() 238 | output_queue = Queue() 239 | self.processes_list = [] 240 | res_list = [] 241 | qsize = self.O_mat.shape[0] 242 | i = 0 243 | 244 | for y in self.O_mat: 245 | data = {} 246 | if self.add_y_index: 247 | data['y_index'] = i 248 | input_queue.put((i, y, data)) 249 | i += 1 250 | 251 | for i in range(self.nb_processes): 252 | self.processes_list.append( 253 | MultiProcessFitting( 254 | input_queue=input_queue, 255 | output_queue=output_queue, 256 | model=self.model, 257 | model_params=self.model_params, 258 | matrix=self.I_mat, 259 | CNV_mat=self.CNV_mat, 260 | process_id=i, 261 | time_limit=self.time_limit, 262 | only_nonzero=self.only_nonzero, 263 | min_obs_for_regress=self.min_obs_for_regress, 264 | cis_model=self.cis_model) 265 | ) 266 | 267 | for process in self.processes_list: 268 | process.start() 269 | 270 | terminate = False 271 | 272 | j = 0 273 | prog = ['/', '-', '\\', '|'] 274 | 275 | while True: 276 | for process in self.processes_list: 277 | if process.exitcode: 278 | print('error with process with id: {0} terminating'\ 279 | .format(process.process_id)) 280 | 281 | terminate = True 282 | break 283 | 284 | if terminate: 285 | break 286 | 287 | out_qsize = self._get_qsize(output_queue) 288 | 289 | if out_qsize is not None: 290 | stdout.write('\r{0} / {1} models done {2}'\ 291 | .format(out_qsize, qsize, prog[j])) 292 | else: 293 | stdout.write('\r Multithreqding queue not implemented for this OS'\ 294 | ' cannot give an estimation of the models computed') 295 
| 296 | if input_queue.empty(): 297 | sleep(self.time_limit) 298 | break 299 | 300 | stdout.flush() 301 | 302 | j += 1 303 | 304 | if j == 4: 305 | j = 0 306 | 307 | if out_qsize >= qsize or out_qsize == True: 308 | break 309 | 310 | sleep(0.5) 311 | 312 | if terminate: 313 | print('one of the process raised an exception'\ 314 | '\n killing process...') 315 | self._kill_processes() 316 | 317 | return Exception('process not finished correctly!') 318 | 319 | print('\n') 320 | 321 | for i in range(qsize): 322 | res_list.append(output_queue.get()) 323 | stdout.write('\r{0} / {1} results loaded'\ 324 | .format(i + 1, qsize)) 325 | stdout.flush() 326 | 327 | del output_queue 328 | del input_queue 329 | 330 | self._kill_processes() 331 | 332 | return zip(*res_list) 333 | 334 | 335 | if __name__ == "__main__": 336 | debug() 337 | -------------------------------------------------------------------------------- /garmire_SNV_calling/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | config file for SNV calling pipeline 3 | 4 | """ 5 | 6 | from os.path import split as pathsplit 7 | from argparse import ArgumentParser 8 | 9 | ARGPARSER = ArgumentParser(description='Argument for the SNV pipeline', 10 | prefix_chars='-') 11 | 12 | ARGPARSER.add_argument('-project_name', 13 | help='name of the project folder and where to find the fastq files (default: sample_test)', 14 | default="sample_test", 15 | metavar='str') 16 | 17 | ARGPARSER.add_argument('-cell_type', 18 | help=' (HUMAN/MOUSE) default: HUMAN', 19 | default="HUMAN", 20 | metavar='str') 21 | 22 | ARGPARSER.add_argument('-read_length', 23 | help=' star index read length (default: 51)', 24 | default=51, 25 | type=int, 26 | metavar='int') 27 | 28 | ARGPARSER.add_argument('-star_nb_threads', 29 | help=' number of threads for STAR analysis (default 12)', 30 | default=12, 31 | type=int, 32 | metavar='int') 33 | 34 | ARGPARSER.add_argument('-snv_nb_threads', 35 | help=' number of SNV calling pipelines executed in parallel (default 3)', 36 | default=3, 37 | type=int, 38 | metavar='int') 39 | 40 | ARGS = ARGPARSER.parse_known_args()[0] 41 | 42 | # Project name. Used to create folder 43 | PROJECT_NAME = ARGS.project_name 44 | # type of the dataset (human or mouse). Used to select reference genomes 45 | CELL_TYPE = ARGS.cell_type 46 | # valid sequencing machine for picard tools: 47 | # ILLUMINA, SLX, SOLEXA, SOLID, 454, LS454, COMPLETE, PACBIO, 48 | # IONTORRE NT, CAPILLARY, HELICOS, UNKNOWN 49 | PLATEFORM = 'ILLUMINA' 50 | # Read length used to create star index for reference genome 51 | STAR_INDEX_READ_LENGTH = ARGS.read_length 52 | 53 | ############ FOLDER ARCHITECTURE #################################### 54 | #Alias to define the GLOBAL_DATA_ROOT, OUTPUT_ROOT and PROG_ROOT 55 | # (could be overloaded using reference paths) 56 | USER = 'opoirion' 57 | # Alias to define the root folder for reference data 58 | # (could be overloaded using reference paths) 59 | GLOBAL_DATA_ROOT = '/data/' 60 | # Alias to define the output folder 61 | OUTPUT_ROOT = '/data/results/' 62 | # Alias to define the folder containing softwares. 
63 | # (could be overloaded using reference paths)
64 | PROG_ROOT = '/prog/'
65 | # Absolute path for the .soft file (dataset description) from NCBI
66 | SOFT_PATH = "{0}/{1}/{1}.soft".format(GLOBAL_DATA_ROOT, PROJECT_NAME)
67 | ######################################################################
68 | 
69 | ############ STANDARD VARIABLES ######################################
70 | TYPE_VAR = {
71 |     'HUMAN': {
72 |         # gtf file containing annotated human genes
73 |         'ANNOTATION_PATH': "/data/Illumina_hg19/Annotation/genes.gtf",
74 |         # folder which will contain the STAR index for the human genome
75 |         'STAR_INDEX_PATH': "{0}/Illumina_hg19/Sequences/STARindex".format(OUTPUT_ROOT),
76 |         # folder which will contain the BSSEQ index for the human genome
77 |         'BSSEQ_INDEX_PATH': "{0}/Illumina_hg19/Sequences/BSSEQindex".format(OUTPUT_ROOT),
78 |         # human reference fasta (.fa) file
79 |         'REF_GENOME': "/data/Illumina_hg19/Sequences/WholeGenomeFasta/genome.fa",
80 |         # Reference human genome used
81 |         'ORGANISM': 'hg19',
82 |         # reference variant database used. The latest version can be downloaded from:
83 |         # ftp://ftp.ncbi.nih.gov/snp/organisms/ (human_9606_b{version}_p2)
84 |         'DBSNP': "/data/Illumina_hg19/vcf/dbsnp_138.hg19.reduced.vcf",
85 |         'VCF_RESOURCES': [
86 |             # Other reference variant resources.
87 |             # Can be downloaded from ftp://ftp.broadinstitute.org/bundle/2.8/hg19
88 |             # "/data/hg19/vcf/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf",
89 |             # Indel variant reference database
90 |             # can be downloaded from ftp://ftp.broadinstitute.org/bundle/2.8/hg19
91 |             # "/data/hg19/vcf/1000G_phase1.indels.hg19.sites.vcf",
92 |         ]
93 |     },
94 |     'MOUSE': {
95 |         # gtf file containing annotated mouse genes
96 |         'ANNOTATION_PATH': "/data/Mus_musculus/UCSC/mm10/Annotation/genes.gtf",
97 |         # folder which will contain the STAR index for the mouse genome
98 |         'STAR_INDEX_PATH': "{0}/Mus_musculus/UCSC/mm10/Sequence/STARindex".format(OUTPUT_ROOT),
99 |         # folder which will contain the BS-SEQ index for the mouse genome
100 |         'BSSEQ_INDEX_PATH': "{0}/Mus_musculus/UCSC/mm10/Sequence/BSSEQindex".format(OUTPUT_ROOT),
101 |         # Mouse reference fasta (.fa) file
102 |         'REF_GENOME': "/data/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa",
103 |         # Reference mouse genome used
104 |         'ORGANISM': 'mm10',
105 |         # reference variant database used. This version can be downloaded from:
106 |         # ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/
107 |         'DBSNP': "/data/Mus_musculus/UCSC/mm10/vcf/mgp.v3.snps.rsIDdbSNPv137_ordered.reduced.vcf",
108 |         # reference indel variant database used. This version can be downloaded from:
109 |         # ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/
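# NOTE: the organism is selected at runtime with the -cell_type flag
# (CELL_TYPE = ARGS.cell_type above); a new reference genome can be added by
# registering another TYPE_VAR entry with the same keys. Sketch only; the
# paths below are hypothetical placeholders, not shipped defaults:
#
# TYPE_VAR['HUMAN_HG38'] = {
#     'ANNOTATION_PATH': '/data/hg38/Annotation/genes.gtf',
#     'STAR_INDEX_PATH': '{0}/hg38/Sequence/STARindex'.format(OUTPUT_ROOT),
#     'BSSEQ_INDEX_PATH': '{0}/hg38/Sequence/BSSEQindex'.format(OUTPUT_ROOT),
#     'REF_GENOME': '/data/hg38/Sequence/WholeGenomeFasta/genome.fa',
#     'ORGANISM': 'hg38',
#     'DBSNP': '/data/hg38/vcf/dbsnp.vcf',
#     'VCF_RESOURCES': [],
# }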
110 |         # (mouse VCF files must be sorted according to the sequence dictionary of the mouse reference genome, using the SortVcf tool from picard-tools)
111 |         'VCF_RESOURCES': [
112 |             # "/data/mm10/vcf/mgp.v3.indels.rsIDdbSNPv137_ordered.vcf"
113 |         ]
114 |     }
115 | }
116 | ######################################################################
117 | 
118 | ############ MOUSE / HUMAN ###########################################
119 | REF_GENOME = TYPE_VAR[CELL_TYPE]['REF_GENOME']
120 | ANNOTATION_PATH = TYPE_VAR[CELL_TYPE]['ANNOTATION_PATH']
121 | STAR_INDEX_PATH = TYPE_VAR[CELL_TYPE]['STAR_INDEX_PATH']
122 | ORGANISM = TYPE_VAR[CELL_TYPE]['ORGANISM']
123 | DBSNP = TYPE_VAR[CELL_TYPE]['DBSNP']
124 | VCF_RESOURCES = TYPE_VAR[CELL_TYPE]['VCF_RESOURCES']
125 | BSSEQ_INDEX_PATH = TYPE_VAR[CELL_TYPE]['BSSEQ_INDEX_PATH']
126 | ######################################################################
127 | 
128 | ############# DATASET ################################################
129 | # Absolute path for fastq files.
130 | # Fastq files must be organised as one folder per SRX experiment
131 | FASTQ_PATH = "{0}/{1}/fastq/".format(OUTPUT_ROOT, PROJECT_NAME)
132 | # output path
133 | PATH_OUTPUT = "{0}/{1}/".format(OUTPUT_ROOT, PROJECT_NAME)
134 | # specific string pattern that a folder name must match
135 | SPECIFIC_FILENAME_PATTERN = ""
136 | ######################################################################
137 | 
138 | ################ RRBS reads specific PREPROCESSING ###################
139 | # Aligner used (STAR for gene expression reads;
140 | # bismark / BS-seeker2 for RRBS read alignment)
141 | USED_ALIGNER = 'STAR'
142 | # Are the reads from the bisulfite pipeline for SNV calling?
143 | ARE_READS_BISULFITE = False
144 | # specific trimming preprocessing for RRBS reads
145 | DO_TRIMGALORE = True
146 | ######################################################################
147 | 
148 | ######################## MONOVAR #####################################
149 | MONOVAR_REP = '{0}/monovar/'.format(PROG_ROOT)
150 | MONOVAR_SAMTOOLS = '{0}/external/samtools/samtools'.format(MONOVAR_REP)
151 | ######################################################################
152 | 
153 | ############# SOFTWARE ###############################################
154 | # Available java version.
Must be > 1.8 155 | JAVA = "java" 156 | # Max memory used by Java 157 | JAVA_MEM = "-Xmx110g" 158 | # GATK folder where can be found GATK software 159 | GATK_DIR = "{0}/GATK/".format(PROG_ROOT) 160 | # GATK jar name 161 | GATK_JAR = "GenomeAnalysisTK.jar" 162 | # picard-tools software 163 | PICARD_DIR = "{0}/picard-tools-2.1.1/".format(PROG_ROOT) 164 | # Perl 165 | PERL = 'perl' 166 | # python 167 | PYTHON = 'python' 168 | #BOWTIE ALIGNER (for BSSEEKER and bismark) 169 | BOWTIE_REP = '/usr/bin/' 170 | # software for RRBS bisulfite reads preprocessing 171 | TRIMGALORE_REP = '{0}/TrimGalore/'.format(PROG_ROOT) 172 | # BSseeker2 software to call methylation reads 173 | BSSEEKER2_REP = '{0}/BSseeker2/'.format(PROG_ROOT) 174 | # BS-Snper (SNP calling for bisulfite reads) 175 | BSSNPER = '{0}/BS-Snper/BS-Snper.pl'.format(PROG_ROOT) 176 | # bismark software for RRBS alignment 177 | BISMARK_SOFTWARE = '{0}/Bismark/bismark'.format(PROG_ROOT) 178 | # STAR aligner software 179 | PATH_STAR_SOFTWARE = "{0}/STAR/bin/Linux_x86_64_static/STAR"\ 180 | .format(PROG_ROOT) 181 | # fastqc software [OPTIONAL] 182 | FASTQC = "fastqc" 183 | # snpEff software (vcf annotation) [OPTIONAL] 184 | SNPEFF = '{0}/snpEff/snpEff.jar'.format(PROG_ROOT) 185 | # required snpEff databases (vcf annotation) [OPTIONAL] 186 | SNPEFF_DICT = {'MOUSE': 'GRCm38.82', 187 | 'HUMAN': 'GRCh37.75'} 188 | SNPEFF_DB = SNPEFF_DICT[CELL_TYPE] 189 | # SAMtools 190 | SAMTOOLS = '{0}/samtools-1.5/bin/samtools'.format(PROG_ROOT) 191 | ###################################################################### 192 | 193 | ############# STAR ################################################# 194 | # Number of threads used when using STAR aligner 195 | STAR_THREADS = ARGS.star_nb_threads 196 | # output path for STAR results 197 | OUTPUT_PATH_STAR = PATH_OUTPUT + "/star/" 198 | ##################################################################### 199 | 200 | ############ SNV CALLING PIPELINE ################################### 201 | # output path for SNVs inferred 202 | OUTPUT_PATH_GATK = PATH_OUTPUT + '/snv_pipeline_GATK/' 203 | # Number of SNV calling processes launched in parallel 204 | NB_PROCESS_SNV = ARGS.snv_nb_threads 205 | #################################################################### 206 | 207 | ########### FREEBAYES SNV CALLING PIPELINE ########################## 208 | OUTPUT_PATH_FREEBAYES = PATH_OUTPUT + '/snv_pipeline_freebayes/' 209 | PATH_OPOSSUM = '{0}/Opossum/'.format(PROG_ROOT) 210 | PATH_FREEBAYES = '{0}/freebayes/bin/freebayes'.format(PROG_ROOT) 211 | #################################################################### 212 | 213 | ############ COMPUTE DISTANCE MATRIX [OPTIONAL] ############################## 214 | # software to infer gene expressions count with raw count for each single cells 215 | # [OPTIONAL] 216 | FEATURE_COUNT = "featureCounts" 217 | # path for gene expression matrices [OPTIONAL] 218 | MATRIX_OUTPUT_PATH = "{0}/{1}/expression_profile/"\ 219 | .format(OUTPUT_ROOT, PROJECT_NAME) 220 | ############################################################################### 221 | 222 | 223 | ######################## SNV SIMULATION ####################################### 224 | SIMULATED_REF_GENOME = None 225 | 226 | if SIMULATED_REF_GENOME: 227 | MUTATION_FILE = '{0}/Simulated{1}Mut/sim_snv.bed'.format( 228 | pathsplit(pathsplit(REF_GENOME)[0])[0], SIMULATED_REF_GENOME) 229 | SEQUENCES_PATH = pathsplit(pathsplit(REF_GENOME)[0])[0] 230 | SIM_GENOME_DIR = '{0}/Simulated{1}Mut/'.format(SEQUENCES_PATH, SIMULATED_REF_GENOME) 231 
| REF_GENOME_ORIGINAL = REF_GENOME[:] 232 | REF_GENOME = '{0}/genome.fa'.format(SIM_GENOME_DIR) 233 | else: 234 | MUTATION_FILE = None 235 | SEQUENCES_PATH = None 236 | SIM_GENOME_DIR = None 237 | REF_GENOME_ORIGINAL = None 238 | ############################################################################### 239 | -------------------------------------------------------------------------------- /garmire_SSrGE/ssrge.py: -------------------------------------------------------------------------------- 1 | from garmire_SSrGE.multiprocess_fitting import BatchFitting 2 | 3 | from garmire_SSrGE.config import TIME_LIMIT 4 | from garmire_SSrGE.config import MIN_OBS_FOR_REGRESS 5 | from garmire_SSrGE.config import NB_THREADS 6 | 7 | from sklearn.linear_model import Lasso 8 | from sklearn.linear_model import ElasticNet 9 | from sklearn.linear_model import OrthogonalMatchingPursuit 10 | from sklearn.linear_model import LassoLars 11 | from sklearn.linear_model import LassoCV 12 | 13 | from sklearn.metrics import median_absolute_error 14 | 15 | from scipy.stats import fisher_exact 16 | 17 | from collections import Counter 18 | from collections import defaultdict 19 | 20 | from scipy.sparse import issparse 21 | 22 | from warnings import warn 23 | 24 | from sys import stdout 25 | 26 | import numpy as np 27 | 28 | 29 | def debug(): 30 | """ 31 | #### DEBUG #### 32 | 33 | **** Test function **** 34 | 35 | """ 36 | from garmire_SSrGE.examples import create_example_matrix_v4 37 | 38 | X, Y, C, ge_list, s_list = create_example_matrix_v4() 39 | 40 | ssrge = SSrGE(snv_id_list=s_list, 41 | gene_id_list=ge_list, 42 | nb_ranked_features=3, 43 | alpha=0.01) 44 | 45 | ssrge.fit_transform(X, Y, C) 46 | ssrge.score(X, Y) 47 | 48 | print(ssrge.retained_snvs) 49 | print(ssrge.retained_genes) 50 | 51 | ssrge = SSrGE(nb_ranked_features=2, 52 | alpha=0.01) 53 | 54 | ssrge.fit_transform(X, Y, C) 55 | 56 | ssrge.score(X,Y) 57 | print(ssrge.retained_snvs) 58 | print(ssrge.retained_genes) 59 | 60 | 61 | class SSrGE(): 62 | """ 63 | Class to perform the SSrGE (Sparse SNV inference to reflect Gene Expression) 64 | """ 65 | def __init__( 66 | self, 67 | snv_id_list=[], 68 | gene_id_list=[], 69 | nb_ranked_features=None, 70 | time_limit=TIME_LIMIT, 71 | min_obs_for_regress=MIN_OBS_FOR_REGRESS, 72 | nb_threads=NB_THREADS, 73 | model='LASSO', 74 | model_params=None, 75 | alpha=0.1, 76 | n_alphas=50, 77 | l1_ratio=0.5, 78 | verbose=True, 79 | **kwargs): 80 | """ 81 | input: 82 | :gene_id_list: list of genes ids 83 | :snv_id_list: list(tuple) the gene ids corresponds 84 | to the gene where the given 85 | svn is found 86 | :nb_ranked_features: int top ranked features (snvs and genes) to keep 87 | :n_alphas: number of alphas to use if model == LassoCV (see sklearn doc for LassoCV) 88 | """ 89 | self.retained_genes = [] 90 | self.retained_snvs = [] 91 | self._do_rank_genes = False 92 | self._snv_ids_given = False 93 | self.snv_index = None 94 | self.gene_index = None 95 | self.snv_id_dict = None 96 | self.gene_id_dict = None 97 | 98 | self._cnv_used = None 99 | self.cnv_score = defaultdict(float) 100 | 101 | self.nb_ranked_features = nb_ranked_features 102 | 103 | if list(snv_id_list): 104 | try: 105 | assert(all(len(snv) == 2 for snv in snv_id_list)) 106 | except Exception: 107 | warn('snv_id_list given but not conform and cannot be used.'\ 108 | 'to rank gene_id_list.'\ 109 | '\ncorrect format: :snv_id_list: list(tuple) ') 110 | else: 111 | self._do_rank_genes = True 112 | self._snv_ids_given = True 113 | 114 | 
self._create_dicts(snv_id_list, gene_id_list) 115 | 116 | self.snvs_ranked = [] # list of tupe (snv, score) 117 | self.genes_ranked = [] # list of tupe (gene, score) 118 | 119 | self.gene_weights = None 120 | 121 | self.time_limit = time_limit 122 | self.min_obs_for_regress = min_obs_for_regress 123 | self.nb_threads = nb_threads 124 | 125 | self.eeSNV_weight = None # total eeSNV absolute weight 126 | self.SNV_mat = None # fitted SNV_mat 127 | self.GE_mat = None # fitted GE_mat 128 | self.SNV_mat_shape = None # dim of the fitted SNV_mat 129 | self.GE_mat_shape = None # dim of the GE_mat used as predicat 130 | self.eeSNV_index = None # eeSNV index 131 | self.intercepts = None # Intercepts for non null model: {index gene: intercept value} 132 | self.coefs = None # coefs for non null model: {index gene: coefs dict} 133 | self.verbose = verbose # whether to print ranking results into the terminal 134 | self.eeSNV_CIS_score = defaultdict(float) 135 | self.gene_CIS_score = defaultdict(float) 136 | self._model_type = None 137 | self.alpha = alpha 138 | self.n_alphas = n_alphas 139 | 140 | if model == 'LASSO': 141 | self._model_type = model 142 | self.model = Lasso 143 | self.model_params = { 144 | 'alpha': alpha, 145 | 'max_iter': 1000, 146 | } 147 | 148 | elif model == 'OMP': 149 | self._model_type = model 150 | self.model = OrthogonalMatchingPursuit 151 | self.model_params = {'n_nonzero_coefs': alpha} 152 | 153 | elif model == 'LassoLars': 154 | self._model_type = model 155 | self.model = LassoLars 156 | self.model_params = { 157 | 'alpha': alpha, 158 | 'max_iter': 1000,} 159 | 160 | elif model == 'LassoCV': 161 | self._model_type = model 162 | self.model = LassoCV 163 | self.model_params = { 164 | 'n_alphas': n_alphas} 165 | 166 | elif model == 'ElasticNet': 167 | self.model = ElasticNet 168 | self.model_params = { 169 | 'alpha': alpha, 170 | 'l1_ratio': l1_ratio, 171 | 'max_iter': 1000, 172 | } 173 | else: 174 | self.model = model 175 | self.model_params = model_params 176 | 177 | def _create_dicts(self, snv_id_list, gene_id_list): 178 | """ """ 179 | self.snv_index = dict(enumerate(snv_id_list)) 180 | self.gene_index = dict(enumerate(gene_id_list)) 181 | 182 | self.snv_id_dict = {name: pos 183 | for pos, name in self.snv_index.items()} 184 | self.gene_id_dict = defaultdict(str, {name: pos 185 | for pos, name in self.gene_index.items()}) 186 | 187 | def fit(self, SNV_mat, GE_mat, CNV_mat=None, to_dense=False): 188 | """ 189 | infer eeSNV by fitting sparse linear models using SNV as features 190 | and gene expression as objectives 191 | 192 | input: 193 | :SNV_mat: (n_samples x n_SNVs) matrix (binary). 
187 | def fit(self, SNV_mat, GE_mat, CNV_mat=None, to_dense=False): 188 | """ 189 | infer eeSNVs by fitting sparse linear models using SNVs as features 190 | and gene expression as objectives 191 | 192 | input: 193 | :SNV_mat: (n_samples x n_SNVs) matrix (binary). Matrix can be sparse 194 | :GE_mat: (n_genes x n_samples) matrix (float values); an (n_samples x n_genes) matrix is transposed automatically 195 | :to_dense: Bool; if True SNV_mat is converted to a dense array 196 | 197 | return: 198 | None; the fitted models and rankings are stored on the instance. 199 | 200 | use transform / fit_transform to obtain the reduced matrix: 201 | :eeSNV_mat: (n_samples x n_eeSNVs) matrix (binary) (n_eeSNVs < n_SNVs) 202 | """ 203 | if GE_mat.shape[0] == SNV_mat.shape[0] and \ 204 | GE_mat.shape[1] != SNV_mat.shape[1]: 205 | GE_mat = GE_mat.T 206 | 207 | self.SNV_mat_shape = SNV_mat.shape 208 | self.GE_mat_shape = GE_mat.shape 209 | 210 | if self._model_type == 'OMP' and self.alpha and \ 211 | 0.0 < self.alpha < 1.0: 212 | self.model_params['n_nonzero_coefs'] = int(np.floor( 213 | SNV_mat.shape[0] * self.alpha)) 214 | 215 | if not self._snv_ids_given: 216 | self._create_dicts(range(self.SNV_mat_shape[1]), 217 | range(self.GE_mat_shape[0])) 218 | 219 | assert(self.SNV_mat_shape[0] == self.GE_mat_shape[1]) 220 | # i.e. the SNV matrix rows and the GE matrix columns both index the samples 221 | 222 | if issparse(GE_mat): 223 | GE_mat = GE_mat.todense() 224 | 225 | if (to_dense or \ 226 | self._model_type == 'OMP' or \ 227 | self._model_type == 'LassoLars' ) \ 228 | and issparse(SNV_mat): 229 | SNV_mat = SNV_mat.todense() 230 | 231 | if isinstance(GE_mat, np.matrix): 232 | GE_mat = np.array(GE_mat) 233 | 234 | self._cnv_used = CNV_mat is not None 235 | 236 | g_index, coefs, intercepts = BatchFitting( 237 | I_mat=SNV_mat, 238 | O_mat=GE_mat, 239 | CNV_mat=CNV_mat, 240 | model=self.model, 241 | model_params=self.model_params, 242 | nb_processes=self.nb_threads, 243 | time_limit=self.time_limit, 244 | min_obs_for_regress=self.min_obs_for_regress, 245 | only_nonzero=True).run() 246 | 247 | self._process_computed_coefs(coefs, g_index, intercepts) 248 | self._rank_eeSNVs() 249 | 250 | if self._do_rank_genes: 251 | self._rank_genes() 252 | 253 | self.select_top_ranked_features() 254 | 255 | self.SNV_mat = SNV_mat 256 | self.GE_mat = GE_mat 257 | 258 | def select_top_ranked_features(self, nb_ranked_features=None): 259 | """keep only the nb_ranked_features top-ranked genes and snvs""" 260 | if not nb_ranked_features: 261 | nb_ranked_features = self.nb_ranked_features 262 | 263 | self.retained_genes = [gene for gene, score in 264 | self.genes_ranked[:nb_ranked_features]] 265 | self.retained_snvs = [snv for snv, score in 266 | self.snvs_ranked[:nb_ranked_features]] 267 | 268 | def transform(self, SNV_mat): 269 | """ 270 | extract the retained eeSNV columns from SNV_mat 271 | input: 272 | :SNV_mat: Matrix (len(samples), len(SNV)) 273 | return: 274 | :eeSNV_mat: Matrix (len(samples), len(eeSNV)) 275 | """ 276 | return SNV_mat.T[[self.snv_id_dict[snv] for snv in self.retained_snvs]].T 277 | 278 | def fit_transform(self, SNV_mat, GE_mat, CNV_mat=None, to_dense=False): 279 | """ 280 | Combination of fit and transform functions 281 | """ 282 | self.fit(SNV_mat, GE_mat, CNV_mat=CNV_mat, to_dense=to_dense) 283 | return self.transform(SNV_mat) 284 | 285 | def _process_computed_coefs(self, coefs, g_index, intercepts): 286 | """ 287 | instantiate weight coefs and eeSNV indexes 288 | 289 | input: 290 | :coefs: list of {snv index: coef} dicts, one per fitted gene 291 | """ 292 | if self.verbose: 293 | print('\nprocess computed coefs....') 294 | 295 | self.eeSNV_index = list(set([key for coef in coefs for key in coef.keys()])) 296 | self.eeSNV_index = {self.eeSNV_index[i]: i for i in range(len(self.eeSNV_index))} 297 | 298 | self.eeSNV_weight = defaultdict(float) 299 | self.eeSNV_CIS_score = defaultdict(float) 300 | 301 | self.intercepts = {} 302 | self.coefs = {} 303 | 304 | i = 0 305 | length = len(coefs)
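# the aggregation loop below merges the per-gene models: eeSNV_weight accumulates
# each SNV's coefficient across all gene models, cnv_score captures the optional
# CNV column, and eeSNV_CIS_score tracks the weight contributed by SNVs located
# in the very gene they predict (normalized at the end of the method)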
306 | 307 | for counter, gene, intercept in zip(coefs, g_index, intercepts): 308 | gene_name = self.gene_index[gene] if self.gene_index else gene 309 | 310 | for key, count in counter.items(): 311 | if self._cnv_used and key == self.SNV_mat_shape[1]: 312 | self.cnv_score[gene_name] = count 313 | continue 314 | 315 | self.eeSNV_weight[key] += count 316 | 317 | if self._snv_ids_given: 318 | genename, pos = self.snv_index[key] 319 | 320 | if genename == self.gene_index[gene]: 321 | self.eeSNV_CIS_score[self.snv_index[key]] += count 322 | 323 | if counter: 324 | self.intercepts[gene] = intercept 325 | self.coefs[gene] = counter 326 | 327 | if self.cnv_score[gene_name]: 328 | self.coefs[gene].pop(self.SNV_mat_shape[1]) 329 | 330 | i += 1 331 | 332 | stdout.write('\r {0:.2f} / 100'.format(100.0 * i / length)) 333 | stdout.flush() 334 | 335 | for snv in self.eeSNV_CIS_score: 336 | self.eeSNV_CIS_score[snv] /= self.eeSNV_weight[self.snv_id_dict[snv]] 337 | 338 | print('\n') 339 | 340 | def score(self, SNV_mat, GE_mat): 341 | """ 342 | Return the mean of the median absolute error of the GE predictions, using 343 | GE_mat as target and SNV_mat as features. SNV_mat and GE_mat should have the 344 | same number of SNVs and genes as the fitted models, respectively 345 | 346 | input: 347 | :SNV_mat: (n_samples x n_SNVs) matrix (binary). Matrix can be sparse 348 | :GE_mat: (n_GE x n_samples) matrix (float value) 349 | 350 | return: 351 | :err_models: float mean of the median absolute errors for the models 352 | :err_null_models: float mean of the median absolute errors for the null models (only intercepts) 353 | """ 354 | if GE_mat.shape[0] == SNV_mat.shape[0] and \ 355 | GE_mat.shape[1] != SNV_mat.shape[1]: 356 | GE_mat = GE_mat.T 357 | 358 | assert(SNV_mat.shape[1] == self.SNV_mat_shape[1]) 359 | assert(GE_mat.shape[0] == self.GE_mat_shape[0]) 360 | 361 | errs_model = [] 362 | errs_null_model = [] 363 | 364 | if issparse(GE_mat): 365 | GE_mat = GE_mat.todense() 366 | 367 | if isinstance(GE_mat, np.matrix): 368 | GE_mat = np.array(GE_mat) 369 | 370 | if issparse(SNV_mat): 371 | SNV_mat = SNV_mat.todense() 372 | 373 | if isinstance(SNV_mat, np.matrix): 374 | SNV_mat = np.array(SNV_mat) 375 | 376 | for non_null_gene in self.coefs: 377 | non_zero = np.nonzero(GE_mat[non_null_gene])[0] 378 | 379 | if not len(non_zero): 380 | continue 381 | 382 | Y_test = GE_mat[non_null_gene][non_zero] 383 | coef = np.zeros(self.SNV_mat_shape[1]) 384 | coef[list(self.coefs[non_null_gene].keys())] = [self.coefs[non_null_gene][k] 385 | for k in self.coefs[non_null_gene]] 386 | Y_inferred = np.asarray(SNV_mat[non_zero] * np.matrix(coef).T).T[0] \ 387 | + self.intercepts[non_null_gene] 388 | 389 | Y_null_inferred = np.ones(Y_test.shape[0]) * self.intercepts[non_null_gene] 390 | 391 | score = median_absolute_error(Y_inferred, Y_test) 392 | score_null = median_absolute_error(Y_null_inferred, Y_test) 393 | 394 | errs_model.append(score) 395 | errs_null_model.append(score_null) 396 | 397 | return np.mean(errs_model), np.mean(errs_null_model) 398 | 399 | def rank_eeSNVs(self): 400 | """ 401 | return the eeSNVs ranked according to their inferred coefs 402 | """ 403 | return self.snvs_ranked 404 | 405 | def _rank_eeSNVs(self): 406 | """ 407 | rank eeSNVs according to their inferred coefs 408 | """ 409 | 410 | self.snvs_ranked = [] 411 | 412 | ranked_snv = sorted(self.eeSNV_weight.items(), 413 | key=lambda x: x[1], 414 | reverse=True) 415 | 416 | for snv_i, score in ranked_snv: 417 | self.snvs_ranked.append((self.snv_index[snv_i], score)) 418 | 419 | return self.snvs_ranked 420 | 421 | def rank_genes(self): 422 | """ 423 |
return the genes ranked according to their inferred coefs 424 | """ 425 | return self.genes_ranked 426 | 427 | def _rank_genes(self): 428 | """ 429 | rank genes according to the inferred coefs of the eeSNVs located within them 430 | """ 431 | self.gene_weights = defaultdict(float) 432 | self.gene_CIS_score = defaultdict(float) 433 | 434 | for snv_i, score in self.eeSNV_weight.items(): 435 | snv = self.snv_index[snv_i] 436 | gene, pos = snv 437 | self.gene_weights[gene] += score 438 | 439 | if self._snv_ids_given: 440 | self.gene_CIS_score[gene] += self.eeSNV_CIS_score[snv] * score 441 | 442 | for gene in self.gene_CIS_score: 443 | self.gene_CIS_score[gene] /= self.gene_weights[gene] 444 | 445 | self.genes_ranked = sorted(self.gene_weights.items(), 446 | key=lambda x: x[1], 447 | reverse=True) 448 | return self.genes_ranked 449 | 450 | def rank_features_for_a_subgroup(self, sample_id_list): 451 | """ 452 | Rank the eeSNVs and the genes for a given subgroup of samples 453 | 454 | input: 455 | :sample_id_list: ids of the samples of interest 456 | example: [1, 5, 10] => group with samples 1, 5 and 10 457 | output: 458 | :SubGroupData: data container with features specific to the subgroup 459 | """ 460 | gene_weights_list = defaultdict(Counter) 461 | snv_weights_list = defaultdict(Counter) 462 | exp_gene_weights_list = defaultdict(Counter) 463 | exp_snv_weights_list = defaultdict(Counter) 464 | 465 | sample_id_comp = list(set( 466 | range(self.SNV_mat.shape[0])).difference(sample_id_list)) 467 | 468 | SNV_mat_sub = self.SNV_mat[sample_id_list].todense() 469 | SNV_mat_comp = self.SNV_mat[sample_id_comp].todense() 470 | GE_mat_sub = self.GE_mat.T[sample_id_list] 471 | GE_mat_comp = self.GE_mat.T[sample_id_comp] 472 | 473 | for key in self.eeSNV_weight: 474 | SNV_mat_sub.T[key] *= self.eeSNV_weight[key] 475 | 476 | subgroup = SubGroupData() 477 | 478 | for index, gene in self.gene_index.items(): 479 | subgroup.gene_expr_distrib[gene] = GE_mat_sub.T[index] 480 | 481 | for snv_i, score in self.eeSNV_weight.items(): 482 | snv = self.snv_index[snv_i] 483 | gene, pos = snv 484 | gene_i = self.gene_id_dict[gene] 485 | 486 | for cell_i in range(SNV_mat_sub.shape[0]): 487 | snv_weights_list[cell_i][snv] = SNV_mat_sub[cell_i, snv_i] 488 | 489 | if gene_i != '': 490 | gene_weights_list[cell_i][gene] += SNV_mat_sub[cell_i, snv_i] 491 | 492 | if gene_i != '' and GE_mat_sub[cell_i, gene_i]: 493 | exp_snv_weights_list[cell_i][snv] = SNV_mat_sub[cell_i, snv_i] 494 | exp_gene_weights_list[cell_i][gene] += SNV_mat_sub[cell_i, snv_i] 495 | 496 | if gene_i: 497 | index_cells_comp = np.nonzero(GE_mat_comp.T[gene_i])[0] 498 | subgroup.exp_snv_distrib_comp[snv] = np.array( 499 | SNV_mat_comp.T[snv_i, index_cells_comp]) 500 | 501 | for cell_i in snv_weights_list: 502 | 503 | for gene in gene_weights_list[cell_i]: 504 | subgroup.gene_weights_distrib[gene].append( 505 | gene_weights_list[cell_i][gene]) 506 | 507 | for snv in snv_weights_list[cell_i]: 508 | subgroup.snv_weights_distrib[snv].append( 509 | snv_weights_list[cell_i][snv]) 510 | 511 | for cell_i in exp_snv_weights_list: 512 | for gene in exp_gene_weights_list[cell_i]: 513 | subgroup.exp_gene_weights_distrib[gene].append( 514 | exp_gene_weights_list[cell_i][gene]) 515 | 516 | for snv in exp_snv_weights_list[cell_i]: 517 | subgroup.exp_snv_weights_distrib[snv].append( 518 | exp_snv_weights_list[cell_i][snv]) 519 | 520 | subgroup._get_significant_subgroup_features() 521 | 522 | return subgroup 523 | 524 | class SubGroupData(): 525 | """ 526 | class containing data for a given
subgroup of cells 527 | 528 | attributes: 529 | :significant_eeSNVs: list of ranked significant eeSNVs with their score 530 | :significant_genes: list of ranked significant genes with their score 531 | 532 | :gene_expr_distrib: distribution of gene expression for each gene 533 | :gene_weights_distrib: distribution of gene weights for each gene 534 | (according to the eeSNVs) for the subgroup 535 | :snv_weights_distrib: distribution of the eeSNV weights for each eeSNV 536 | :exp_gene_weights_distrib: distribution of gene weights for each gene using only, 537 | for a given gene, the subset of cells expressing the gene 538 | :exp_snv_weights_distrib: distribution of eeSNV weights for each eeSNV using only, 539 | for a given eeSNV, the subset of cells expressing the gene 540 | related to the eeSNV 541 | """ 542 | def __init__(self): 543 | """ """ 544 | self.significant_eeSNVs = [] 545 | self.significant_genes = [] 546 | self.ranked_eeSNVs = [] 547 | self.ranked_genes = [] 548 | 549 | self.gene_expr_distrib = defaultdict(list) 550 | self.gene_weights_distrib = defaultdict(list) 551 | self.snv_weights_distrib = defaultdict(list) 552 | self.exp_gene_weights_distrib = defaultdict(list) 553 | self.exp_snv_weights_distrib = defaultdict(list) 554 | self.exp_snv_distrib_comp = defaultdict(list) 555 | 556 | 557 | def _get_significant_subgroup_features(self, thres=0.05): 558 | """select eeSNVs enriched in the subgroup (Fisher exact test vs the complementary cells) and rank them""" 559 | snv_ranked = [] 560 | gene_ranked = defaultdict(float) 561 | 562 | for snv in self.snv_weights_distrib: 563 | distrib_test = np.asarray(self.exp_snv_weights_distrib[snv]).astype('bool') 564 | distrib_ref = np.asarray(self.exp_snv_distrib_comp[snv]).astype('bool') 565 | 566 | key_mean = np.mean(self.exp_snv_weights_distrib[snv]) 567 | contingency = np.array([[distrib_test.sum(), (~distrib_test).sum()], 568 | [distrib_ref.sum(), (~distrib_ref).sum()],]) 569 | 570 | oddsratio, pvalue = fisher_exact(contingency) 571 | 572 | if pvalue < thres and distrib_test.mean() > distrib_ref.mean(): 573 | snv_ranked.append((snv, key_mean, pvalue)) 574 | 575 | snv_ranked.sort(key=lambda x: x[1], reverse=True) 576 | 577 | self.significant_eeSNVs = snv_ranked 578 | 579 | for (gene, pos), score, pvalue in snv_ranked: 580 | gene_ranked[gene] += score 581 | 582 | self.significant_genes = sorted(gene_ranked.items(), 583 | key=lambda x: x[1], 584 | reverse=True) 585 | 586 | self.ranked_genes = sorted([(gene, np.mean(self.gene_weights_distrib[gene])) 587 | for gene in self.gene_weights_distrib], 588 | key=lambda x: x[1], 589 | reverse=True) 590 | self.ranked_eeSNVs = sorted([(snv, np.mean(self.snv_weights_distrib[snv])) 591 | for snv in self.snv_weights_distrib], 592 | key=lambda x: x[1], 593 | reverse=True) 594 | 595 | 596 | if __name__ == "__main__": 597 | debug() 598 | --------------------------------------------------------------------------------
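For quick reference, here is a minimal usage sketch of the SSrGE class above, adapted from its debug() function; it assumes create_example_matrix_v4 (from garmire_SSrGE/examples.py) returns a small synthetic SNV matrix X, an expression matrix Y, a CNV matrix C and the matching id lists, and the subgroup at the end ([0, 1, 2]) is purely illustrative.

from garmire_SSrGE.examples import create_example_matrix_v4
from garmire_SSrGE.ssrge import SSrGE

X, Y, C, ge_list, s_list = create_example_matrix_v4()

ssrge = SSrGE(snv_id_list=s_list,      # (gene id, position) tuples
              gene_id_list=ge_list,
              nb_ranked_features=3,    # keep the 3 top-ranked eeSNVs / genes
              alpha=0.01)              # LASSO regularization strength

eeSNV_mat = ssrge.fit_transform(X, Y, C)  # (n_samples x n_eeSNVs) matrix

err_model, err_null = ssrge.score(X, Y)   # model error vs intercept-only error

print(ssrge.retained_snvs)
print(ssrge.retained_genes)

# rank features within an (illustrative) subgroup made of samples 0, 1 and 2
subgroup = ssrge.rank_features_for_a_subgroup([0, 1, 2])
print(subgroup.significant_eeSNVs)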
/garmire_SNV_calling/process_snv_GATK.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ process one SRR with the GATK SNV calling pipeline """ 4 | 5 | from os import popen 6 | from os.path import isdir 7 | from os.path import isfile 8 | from os.path import getsize 9 | from subprocess import Popen 10 | from subprocess import PIPE 11 | 12 | from distutils.dir_util import mkpath 13 | 14 | from shutil import copyfile 15 | from shutil import move 16 | 17 | from sys import stdout as STDOUT 18 | from sys import argv 19 | 20 | from random import random 21 | from time import sleep 22 | from time import time 23 | 24 | from glob import glob 25 | 26 | from garmire_SNV_calling.config import JAVA 27 | from garmire_SNV_calling.config import JAVA_MEM 28 | from garmire_SNV_calling.config import PICARD_DIR 29 | from garmire_SNV_calling.config import GATK_DIR 30 | from garmire_SNV_calling.config import GATK_JAR 31 | 32 | 33 | ############ VARIABLES ############################################ 34 | SRR_TO_PROCESS = "" # for debugging purposes 35 | PROCESS_ID = 0 36 | 37 | 38 | from garmire_SNV_calling.config import OUTPUT_PATH_GATK 39 | from garmire_SNV_calling.config import PATH_OUTPUT 40 | from garmire_SNV_calling.config import PLATEFORM 41 | from garmire_SNV_calling.config import ORGANISM 42 | 43 | from garmire_SNV_calling.config import REF_GENOME 44 | from garmire_SNV_calling.config import DBSNP 45 | from garmire_SNV_calling.config import VCF_RESOURCES 46 | 47 | 48 | if "--ignore_already_exists" in argv: 49 | IGNORE_ALREADY_EXISTS = True 50 | else: 51 | IGNORE_ALREADY_EXISTS = False 52 | 53 | if "--clean_tmp" in argv: 54 | CLEAN_TMP = True 55 | else: 56 | CLEAN_TMP = False 57 | 58 | ################################################################### 59 | 60 | 61 | def main(): 62 | process_GATK_snv = ProcessGATKSNV(id=PROCESS_ID) 63 | process_GATK_snv.process() 64 | 65 | 66 | class ProcessGATKSNV(): 67 | """run the GATK-based SNV calling pipeline on one aligned bam file""" 68 | def __init__(self, 69 | bam_file_name='', 70 | srr_to_process=SRR_TO_PROCESS, 71 | output_path=OUTPUT_PATH_GATK, 72 | path_to_data=PATH_OUTPUT, 73 | java=JAVA, 74 | java_mem=JAVA_MEM, 75 | picard_dir=PICARD_DIR, 76 | gatk_dir=GATK_DIR, 77 | plateform=PLATEFORM, 78 | organism=ORGANISM, 79 | ref_genome=REF_GENOME, 80 | dbsnp=DBSNP, 81 | vcf_resources=VCF_RESOURCES, 82 | gatk_jar=GATK_JAR, 83 | id="1", 84 | ignore_already_exists=IGNORE_ALREADY_EXISTS, 85 | clean_tmp=CLEAN_TMP, 86 | respath=None, 87 | ): 88 | 89 | self.respath = respath 90 | 91 | self.output_path = output_path 92 | self.path_to_data = path_to_data 93 | self.srr_to_process = srr_to_process 94 | 95 | self.bam_file_name = bam_file_name 96 | 97 | self.java = java 98 | self.java_mem = java_mem 99 | self.picard_dir = picard_dir 100 | self.gatk_dir = gatk_dir 101 | self.plateform = plateform 102 | self.organism = organism[:] 103 | self.ignore_already_exists = ignore_already_exists 104 | self.gatk_jar = gatk_jar 105 | 106 | if self.organism == 'HUMAN': 107 | self.organism = 'hg19' 108 | 109 | elif self.organism == 'MOUSE': 110 | self.organism = 'mm10' 111 | 112 | self.ref_genome = ref_genome 113 | self.dbsnp = dbsnp 114 | self.vcf_resources = vcf_resources 115 | 116 | self.id = str(id) 117 | self.stdout = None 118 | self.tmppath = None 119 | self.time_start = None 120 | self.bam_file_path = None 121 | self.clean_tmp = clean_tmp 122 | 123 | def process(self, srr_to_process=None): 124 | """ 125 | process one star bam file with the snv calling pipeline 126 | """ 127 | if srr_to_process: 128 | self.srr_to_process = srr_to_process 129 | 130 | msg = self._init_process() 131 | 132 | if msg: 133 | print(msg)
134 | self.stdout.write(msg) 135 | return 136 | 137 | self._launch_picard_readgroups() 138 | self._launch_picard_markduplicates() 139 | self._launch_gatk_cigar() 140 | self._launch_gatk_realigner_target_creator() 141 | self._launch_gatk_realigner_indel() 142 | self._launch_gatk_base_recalibrator() 143 | self._launch_gatk_print_reads() 144 | self._launch_gatk_variant_calling() 145 | self._launch_gatk_variant_filtering() 146 | self._finish_process(ext="", out="_GATK") 147 | self._rm_tmp_file() 148 | 149 | def process_exome(self, srr_to_process=None): 150 | """ 151 | process one exome bam file with the snv calling pipeline (see note below) 152 | """ 153 | if srr_to_process: 154 | self.srr_to_process = srr_to_process 155 | 156 | 157 | msg = self._init_process() 158 | 159 | if msg: 160 | print(msg) 161 | self.stdout.write(msg) 162 | return 163 | 164 | self._launch_picard_readgroups() 165 | self._launch_picard_buildbamindex(name='rg_added_sorted') 166 | self._launch_picard_markduplicates() 167 | self._launch_gatk_base_recalibrator(input_name='dedupped') 168 | self._launch_gatk_print_reads(input_name='dedupped') 169 | self._launch_gatk_variant_calling() 170 | self._launch_gatk_variant_filtering() 171 | 172 | self._finish_process(ext="_GATK", out="_GATK") 173 | self._rm_tmp_file() 174 |
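# note: compared with process(), process_exome() skips the RNA-seq-specific
# SplitNCigarReads and indel-realignment steps and runs the base recalibration
# directly on the deduplicated bam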
""" 225 | 226 | if not isdir(self.respath): 227 | mkpath(self.respath) 228 | 229 | self.stdout.write('''\n #### FINISHED #### \n 230 | ALL PROCESS DONE FOR: {0} in {1} s 231 | '''.format(self.srr_to_process, time() - self.time_start)) 232 | 233 | self._run_cmd('echo "#### FINISHED ####'\ 234 | ' ALL PROCESS DONE FOR: {0} in {1} s"'\ 235 | .format(self.srr_to_process, time() - self.time_start)) 236 | 237 | if self.tmppath == self.respath and ext == out: 238 | return 239 | 240 | if isfile(self.tmppath + '/snv_filtered{0}.vcf'.format(ext)): 241 | move(self.tmppath + '/snv_filtered{0}.vcf'.format(ext), 242 | self.respath + '/snv_filtered{0}.vcf'.format(out)) 243 | 244 | if isfile(self.tmppath + '/snv_filtered{0}.vcf.idx'.format(ext)): 245 | move(self.tmppath + '/snv_filtered{0}.vcf.idx'.format(ext), 246 | self.respath + '/snv_filtered{0}.vcf.idx'.format(out)) 247 | 248 | move(self.tmppath + '/stdout.log', 249 | self.respath + '/stdout.log') 250 | 251 | def _launch_picard_readgroups(self): 252 | """ 253 | launch picard AddOrReplaceReadGroups 254 | """ 255 | if self.check_if_output_exists( 256 | "{0}/rg_added_sorted.bam".format(self.tmppath)): 257 | return 258 | 259 | self._run_cmd( 260 | 'echo "\n\n######## LAUNCHING PICARD READGROUPS ########\n"') 261 | 262 | cmd = "{0} {1} -jar {2}/picard.jar AddOrReplaceReadGroups" \ 263 | " I={3}/Aligned.sortedByCoord.out.bam"\ 264 | " O={3}/rg_added_sorted.bam" \ 265 | " SO=coordinate" \ 266 | " RGID={4}" \ 267 | " RGPU={4}" \ 268 | " RGSM={4}" \ 269 | " RGPL={5}" \ 270 | " RGLB={6}" \ 271 | .format(self.java, 272 | self.java_mem, 273 | self.picard_dir, 274 | self.tmppath, 275 | self.id, 276 | self.plateform, 277 | self.organism 278 | ) 279 | self._run_cmd(cmd) 280 | 281 | def _launch_picard_markduplicates(self): 282 | """ 283 | launch picard MarkDuplicates 284 | """ 285 | if self.check_if_output_exists( 286 | "{0}/dedupped.bam".format(self.tmppath)): 287 | return 288 | 289 | self._run_cmd( 290 | 'echo "\n\n######## LAUNCHING PICARD MARKDUPLICATES ########\n"') 291 | 292 | cmd = "{0} {1} -jar {2}/picard.jar MarkDuplicates" \ 293 | " I={3}/rg_added_sorted.bam"\ 294 | " O={3}/dedupped.bam" \ 295 | " M={3}/output.metrics" \ 296 | " CREATE_INDEX=true" \ 297 | " VALIDATION_STRINGENCY=SILENT" \ 298 | .format(self.java, 299 | self.java_mem, 300 | self.picard_dir, 301 | self.tmppath, 302 | ) 303 | self._run_cmd(cmd) 304 | 305 | def _launch_picard_buildbamindex(self, name='dedupped'): 306 | """ 307 | launch picard buildbamindex 308 | """ 309 | if self.check_if_output_exists( 310 | "{0}/{1}.bai".format(self.tmppath, name)): 311 | return 312 | 313 | self._run_cmd( 314 | 'echo "\n\n######## LAUNCHING PICARD BuildBamIndex ########\n"') 315 | 316 | cmd = "{0} {1} -jar {2}/picard.jar BuildBamIndex" \ 317 | " I={3}/{4}.bam" \ 318 | " TMP_DIR={3}" \ 319 | .format(self.java, 320 | self.java_mem, 321 | self.picard_dir, 322 | self.tmppath, 323 | name, 324 | ) 325 | self._run_cmd(cmd) 326 | 327 | def _launch_picard_sortsam(self): 328 | """ 329 | launch picard SORTSAM 330 | """ 331 | if self.check_if_output_exists( 332 | "{0}/sorted.bam".format(self.tmppath)): 333 | return 334 | 335 | self._run_cmd( 336 | 'echo "\n\n######## LAUNCHING PICARD REORDERSAM ########\n"') 337 | 338 | cmd = "{0} {1} -jar {2}/picard.jar SortSam" \ 339 | " I={3}/dedupped.bam" \ 340 | " O={3}/sorted.bam" \ 341 | " SORT_ORDER=coordinate" \ 342 | " TMP_DIR={3}" \ 343 | " CREATE_INDEX=TRUE" \ 344 | .format(self.java, 345 | self.java_mem, 346 | self.picard_dir, 347 | self.tmppath, 348 | ) 349 | 
self._run_cmd(cmd) 350 | 351 | def _launch_picard_reordersam(self): 352 | """ 353 | launch picard REORDERSAM 354 | """ 355 | if self.check_if_output_exists( 356 | "{0}/reordered.bam".format(self.tmppath)): 357 | return 358 | 359 | self._run_cmd( 360 | 'echo "\n\n######## LAUNCHING PICARD REORDERSAM ########\n"') 361 | 362 | cmd = "{0} {1} -jar {2}/picard.jar ReorderSam" \ 363 | " I={3}/dedupped.bam" \ 364 | " O={3}/reordered.bam" \ 365 | " R={4}"\ 366 | " CREATE_INDEX=TRUE" \ 367 | .format(self.java, 368 | self.java_mem, 369 | self.picard_dir, 370 | self.tmppath, 371 | self.ref_genome 372 | ) 373 | self._run_cmd(cmd) 374 | 375 | def _launch_gatk_cigar(self): 376 | """ 377 | Running cigar string split and mapq 255 fix GATK 378 | """ 379 | if self.check_if_output_exists( 380 | "{0}/split.bam".format(self.tmppath)): 381 | return 382 | 383 | self._run_cmd('echo "\n\n######## LAUNCHING CIGAR ########\n"') 384 | 385 | cmd = "{0} {1} -jar {2}/{5} -T SplitNCigarReads" \ 386 | " -I {3}/dedupped.bam" \ 387 | " -o {3}/split.bam" \ 388 | " -R {4}" \ 389 | " -rf ReassignOneMappingQuality" \ 390 | " -RMQF 255" \ 391 | " -RMQT 60" \ 392 | " -U ALLOW_N_CIGAR_READS" \ 393 | .format(self.java, 394 | self.java_mem, 395 | self.gatk_dir, 396 | self.tmppath, 397 | self.ref_genome, 398 | self.gatk_jar 399 | ) 400 | 401 | self._run_cmd_fix_quality(cmd, to_rm='split.ba*') 402 | 403 | def _launch_gatk_realigner_target_creator(self, input_name='split.bam', resolve='hard'): 404 | """ 405 | Running Realignment Target creator 406 | """ 407 | if self.check_if_output_exists( 408 | "{0}/forRealigner.intervals".format(self.tmppath)): 409 | return 410 | 411 | self._run_cmd( 412 | 'echo "\n\n######## LAUNCHING REALIGNER TARGET CREATOR ########\n"') 413 | 414 | cmd = "{0} {1} -jar {2}/{6} -T RealignerTargetCreator" \ 415 | " -I {3}/{5}" \ 416 | " -o {3}/forRealigner.intervals"\ 417 | " -R {4}" \ 418 | " -nt 20 " \ 419 | .format(self.java, 420 | self.java_mem, 421 | self.gatk_dir, 422 | self.tmppath, 423 | self.ref_genome, 424 | input_name, 425 | self.gatk_jar 426 | ) 427 | 428 | for vcf in self.vcf_resources: 429 | cmd += " -known {0}".format(vcf) 430 | 431 | self._run_cmd_fix_quality(cmd, to_rm='forRealigner.intervals', resolve=resolve) 432 | 433 | def _launch_gatk_realigner_indel(self): 434 | """ 435 | Running Realignment 436 | """ 437 | if self.check_if_output_exists( 438 | "{0}/realigned.bam".format(self.tmppath)): 439 | return 440 | 441 | self._run_cmd( 442 | 'echo "\n\n######## LAUNCHING REALIGNER INDEL ########\n"') 443 | 444 | cmd = "{0} {1} -jar {2}/{5} -T IndelRealigner" \ 445 | " -I {3}/split.bam" \ 446 | " -targetIntervals {3}/forRealigner.intervals"\ 447 | " --out {3}/realigned.bam" \ 448 | " -R {4}" \ 449 | .format(self.java, 450 | self.java_mem, 451 | self.gatk_dir, 452 | self.tmppath, 453 | self.ref_genome, 454 | self.gatk_jar 455 | ) 456 | 457 | for vcf in self.vcf_resources: 458 | cmd += " -known {0}".format(vcf) 459 | 460 | self._run_cmd(cmd) 461 |
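# BQSR is a two-pass process: BaseRecalibrator (below) builds the recalibration
# table (recal_data.csv) from the known variant sites, and PrintReads then
# applies it via -BQSR to produce recal.bam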
462 | def _launch_gatk_base_recalibrator(self, input_name='realigned'): 463 | """ 464 | Running base recalibration 465 | """ 466 | if self.check_if_output_exists( 467 | "{0}/recal_data.csv".format(self.tmppath)): 468 | return 469 | 470 | self._run_cmd( 471 | 'echo "\n\n######## LAUNCHING RECALIBRATION STEP 1 ########\n"') 472 | 473 | cmd = "{0} {1} -jar {2}/{7} -T BaseRecalibrator" \ 474 | " -I {3}/{6}.bam" \ 475 | " -o {3}/recal_data.csv" \ 476 | " -R {4}" \ 477 | " -nct 20" \ 478 | " --knownSites {5}" \ 479 | .format(self.java, 480 | self.java_mem, 481 | self.gatk_dir, 482 | self.tmppath, 483 | self.ref_genome, 484 | self.dbsnp, 485 | input_name, 486 | self.gatk_jar 487 | ) 488 | 489 | for vcf in self.vcf_resources: 490 | cmd += " --knownSites {0}".format(vcf) 491 | 492 | self._run_cmd_fix_quality(cmd, to_rm='recal_data.csv', resolve='hard') 493 | 494 | def _launch_gatk_print_reads(self, input_name='realigned'): 495 | """ 496 | Running base recalibration STEP 2 497 | """ 498 | if self.check_if_output_exists( 499 | "{0}/recal.bam".format(self.tmppath)): 500 | return 501 | 502 | self._run_cmd( 503 | 'echo "\n\n######## LAUNCHING RECALIBRATION STEP 2 ########\n"') 504 | 505 | cmd = "{0} {1} -jar {2}/{6} -T PrintReads" \ 506 | " -I {3}/{5}.bam" \ 507 | " --out {3}/recal.bam" \ 508 | " -R {4}" \ 509 | " -BQSR {3}/recal_data.csv" \ 510 | " -nct 20" \ 511 | .format(self.java, 512 | self.java_mem, 513 | self.gatk_dir, 514 | self.tmppath, 515 | self.ref_genome, 516 | input_name, 517 | self.gatk_jar 518 | ) 519 | 520 | self._run_cmd_fix_quality(cmd, to_rm='recal.bam', resolve='hard') 521 | 522 | def _launch_gatk_variant_calling(self, output_name='snv_raw_GATK.vcf'): 523 | """ 524 | variant calling 525 | """ 526 | if self.check_if_output_exists( 527 | "{0}/{1}".format(self.tmppath, output_name)): 528 | return 529 | 530 | self._run_cmd( 531 | 'echo "\n\n######## LAUNCHING VARIANT CALLING ########\n"') 532 | 533 | start_time = time() 534 | 535 | cmd = "{0} {1} -jar {2}/{7} -T HaplotypeCaller" \ 536 | " -I {3}/recal.bam" \ 537 | " -o {3}/{6}" \ 538 | " -R {4}" \ 539 | " --dbsnp {5}" \ 540 | " -dontUseSoftClippedBases" \ 541 | " -stand_call_conf 20.0" \ 542 | " -stand_emit_conf 20.0" \ 543 | .format(self.java, 544 | self.java_mem, 545 | self.gatk_dir, 546 | self.tmppath, 547 | self.ref_genome, 548 | self.dbsnp, 549 | output_name, 550 | self.gatk_jar 551 | 552 | ) 553 | 554 | self._run_cmd(cmd) 555 | 556 | self._run_cmd( 557 | 'echo "\n## GATK variant calling done in {0} s##\n"'.format( 558 | time() - start_time)) 559 | 560 | def _launch_gatk_variant_filtering( 561 | self, 562 | input_name='snv_raw_GATK.vcf', 563 | output_name='snv_filtered_GATK.vcf'): 564 | """ 565 | variant filtering 566 | """ 567 | if self.check_if_output_exists( 568 | "{0}/{1}".format(self.tmppath, output_name)): 569 | return 570 | 571 | self._run_cmd( 572 | 'echo "\n######## LAUNCHING VARIANT FILTERING ########\n"') 573 | 574 | start_time = time() 575 | 576 | cmd = "{0} {1} -jar {2}/{7} -T VariantFiltration" \ 577 | " -V {3}/{5}" \ 578 | " -o {3}/{6}" \ 579 | " -R {4}" \ 580 | " -cluster 3" \ 581 | " -filterName FS" \ 582 | ' -filter "FS > 30.0"' \ 583 | " -filterName QD" \ 584 | ' -filter "QD < 2.0"' \ 585 | .format(self.java, 586 | self.java_mem, 587 | self.gatk_dir, 588 | self.tmppath, 589 | self.ref_genome, 590 | input_name, 591 | output_name, 592 | self.gatk_jar 593 | ) 594 | 595 | self._run_cmd(cmd) 596 | 597 | self._run_cmd( 598 | 'echo "\n## GATK variant filtering done in {0} s##\n"'.format( 599 | time() - start_time)) 600 | 601 | def check_if_output_exists(self, outfile): 602 | """ return True if outfile exists and is non-empty (and clean_tmp is off); 603 | otherwise remove any partial output """ 604 | if isfile(outfile) and getsize(outfile) and not self.clean_tmp: 605 | return True 606 | else: 607 | popen('rm {0}'.format(outfile)).read() 608 | 609 | def _rm_tmp_file(self): 610 | """ remove the temporary files, keeping only the filtered 611 | vcf and the log """ 612 | if isdir(self.tmppath) and self.clean_tmp: 613 | for fil in glob('{0}/*'.format(self.tmppath)): 614 | if fil.count("snv_filtered") or fil.count("stdout.log"): 615 | continue 616 | cmd = "rm {0}".format(fil) 617 | 618 | try: 619 | self._run_cmd(cmd) 620 | except Exception as e: 621 |
print('#### error while trying to remove the tmp file: {0}'\ 622 | .format(e)) 623 | 624 | if self.tmppath != self.respath: 625 | cmd = "rm -rf {0}".format(self.tmppath) 626 | 627 | try: 628 | self._run_cmd(cmd) 629 | except Exception as e: 630 | print('#### error while trying to remove the tmp folder: {0}'\ 631 | .format(e)) 632 | 633 | def _run_cmd(self, cmd): 634 | """run cmd""" 635 | stdout_read = open(self.tmppath + '/stdout.log', 'r') 636 | stdout_read.seek(0, 2) 637 | 638 | process = Popen(cmd, 639 | stdout=PIPE, 640 | stderr=PIPE, 641 | shell=True) 642 | 643 | c = process.stdout.read(1) 644 | e = process.stderr.read(1) 645 | 646 | while process.poll() is None or c or e: 647 | 648 | STDOUT.write(c) 649 | self.stdout.write(c) 650 | STDOUT.write(e) 651 | self.stdout.write(e) 652 | STDOUT.flush() 653 | self.stdout.flush() 654 | 655 | c = process.stdout.read(1) 656 | e = process.stderr.read(1) 657 | 658 | process.communicate() 659 | 660 | if process.returncode: 661 | raise Exception('{0} raised a non-zero return code!\n'\ 662 | .format(cmd)) 663 | 664 | def _run_cmd_fix_quality(self, cmd, to_rm, resolve='soft'): 665 | """run cmd; if it fails, retry with GATK's misencoded-quality-score options""" 666 | try: 667 | self._run_cmd(cmd) 668 | except Exception: 669 | self._run_cmd('echo "\n\nERROR DETECTED. ' \ 670 | 'Trying to correct misencoded quality scores"') 671 | 672 | if resolve == 'hard': 673 | cmd += ' --allow_potentially_misencoded_quality_scores' 674 | else: 675 | cmd += ' --fix_misencoded_quality_scores' 676 | 677 | popen("rm {0}/{1}".format(self.tmppath, to_rm)).read() 678 | self._run_cmd(cmd) 679 | 680 | if __name__ == "__main__": 681 | main() 682 | --------------------------------------------------------------------------------
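Finally, a minimal sketch of driving the GATK pipeline above programmatically rather than through main(); it assumes config.py points at working java/picard/GATK installs and that a STAR bam already exists under PATH_OUTPUT/star/<SRR id>/Aligned.sortedByCoord.out.bam. The SRR id below is hypothetical.

from garmire_SNV_calling.process_snv_GATK import ProcessGATKSNV

# equivalent to running: python garmire_SNV_calling/process_snv_GATK.py
process = ProcessGATKSNV(id="1")
process.process(srr_to_process="SRR0000000")  # hypothetical SRR id

# exome data can instead use the variant without the RNA-seq-specific steps:
# process.process_exome(srr_to_process="SRR0000000")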