├── methodFlow.png ├── requirements.txt ├── examples ├── accessibility_mask.bed ├── genome.bed ├── example_pipeline.sh └── example_pipeline_pool.sh ├── ReLERNN ├── __init__.py ├── imports.py ├── networks.py ├── ReLERNN_TRAIN ├── ReLERNN_TRAIN_POOL ├── ReLERNN_PREDICT_POOL ├── ReLERNN_PREDICT ├── ReLERNN_BSCORRECT ├── ReLERNN_SIMULATE_POOL ├── ReLERNN_SIMULATE ├── simulator.py ├── manager.py ├── sequenceBatchGenerator.py └── helpers.py ├── manuscript └── README.md ├── LICENSE ├── setup.py ├── .gitignore └── README.md /methodFlow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kr-colab/ReLERNN/HEAD/methodFlow.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | msprime>=0.7.4 2 | scikit-learn>=0.22.1 3 | matplotlib>=3.1.3 4 | scikit-allel>=1.2.1 5 | -------------------------------------------------------------------------------- /examples/accessibility_mask.bed: -------------------------------------------------------------------------------- 1 | 2L 0 7000 2 | 2R 0 9000 3 | 3L 0 35000 4 | 3R 0 4000 5 | X 0 7300 6 | -------------------------------------------------------------------------------- /examples/genome.bed: -------------------------------------------------------------------------------- 1 | 2L 0 840000 2 | 2R 0 1669000 3 | 3L 0 742000 4 | 3R 0 1963000 5 | X 0 1250000 6 | -------------------------------------------------------------------------------- /ReLERNN/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from ReLERNN.imports import * 4 | from ReLERNN.helpers import * 5 | from ReLERNN.networks import * 6 | from ReLERNN.sequenceBatchGenerator import * 7 | from ReLERNN.simulator import * 8 | -------------------------------------------------------------------------------- /manuscript/README.md: -------------------------------------------------------------------------------- 1 | # Predicting the landscape of recombination using deep learning 2 | ## Jeffrey R. Adrion, Jared G. Galloway, and Andrew D. Kern 3 | ==================================================================== 4 | 5 | This repository will host code for analyses found in [Adrion, Galloway, and Kern (2020)](https://academic.oup.com/mbe/advance-article/doi/10.1093/molbev/msaa038/5741419). 
6 | -------------------------------------------------------------------------------- /ReLERNN/imports.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pickle 3 | import sys 4 | import msprime as msp 5 | import numpy as np 6 | import os 7 | import multiprocessing as mp 8 | import shutil 9 | import random 10 | import copy 11 | import argparse 12 | import h5py 13 | import allel 14 | import time 15 | 16 | from sklearn.neighbors import NearestNeighbors 17 | from sklearn.utils import resample 18 | 19 | import matplotlib as mpl 20 | mpl.use('pdf') 21 | import matplotlib.pyplot as plt 22 | 23 | import tensorflow as tf 24 | from tensorflow.keras import layers 25 | from tensorflow.keras.models import Model, model_from_json 26 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TerminateOnNaN 27 | -------------------------------------------------------------------------------- /examples/example_pipeline.sh: -------------------------------------------------------------------------------- 1 | SIMULATE="ReLERNN_SIMULATE" 2 | TRAIN="ReLERNN_TRAIN" 3 | PREDICT="ReLERNN_PREDICT" 4 | BSCORRECT="ReLERNN_BSCORRECT" 5 | SEED="42" 6 | MU="1e-8" 7 | URTR="1" 8 | DIR="./example_output/" 9 | VCF="./example.vcf" 10 | GENOME="./genome.bed" 11 | MASK="./accessibility_mask.bed" 12 | 13 | # Simulate data 14 | ${SIMULATE} \ 15 | --vcf ${VCF} \ 16 | --genome ${GENOME} \ 17 | --mask ${MASK} \ 18 | --projectDir ${DIR} \ 19 | --assumedMu ${MU} \ 20 | --upperRhoThetaRatio ${URTR} \ 21 | --nTrain 13000 \ 22 | --nVali 2000 \ 23 | --nTest 100 \ 24 | --seed ${SEED} 25 | 26 | # Train network 27 | ${TRAIN} \ 28 | --projectDir ${DIR} \ 29 | --seed ${SEED} 30 | 31 | # Predict 32 | ${PREDICT} \ 33 | --vcf ${VCF} \ 34 | --projectDir ${DIR} \ 35 | --seed ${SEED} 36 | 37 | # Parametric Bootstrapping 38 | ${BSCORRECT} \ 39 | --projectDir ${DIR} \ 40 | --nSlice 2 \ 41 | --nReps 2 \ 42 | --seed ${SEED} 43 | -------------------------------------------------------------------------------- /examples/example_pipeline_pool.sh: -------------------------------------------------------------------------------- 1 | SIMULATE="ReLERNN_SIMULATE_POOL" 2 | TRAIN="ReLERNN_TRAIN_POOL" 3 | PREDICT="ReLERNN_PREDICT_POOL" 4 | BSCORRECT="ReLERNN_BSCORRECT" 5 | SEED="42" 6 | MU="1e-8" 7 | URTR="1" 8 | DIR="./example_output_pool/" 9 | POOL="./example.pool" 10 | GENOME="./genome.bed" 11 | MASK="./accessibility_mask.bed" 12 | 13 | # Simulate data 14 | ${SIMULATE} \ 15 | --pool ${POOL} \ 16 | --sampleDepth 20 \ 17 | --genome ${GENOME} \ 18 | --mask ${MASK} \ 19 | --projectDir ${DIR} \ 20 | --assumedMu ${MU} \ 21 | --upperRhoThetaRatio ${URTR} \ 22 | --nTrain 13000 \ 23 | --nVali 2000 \ 24 | --nTest 100 \ 25 | --seed ${SEED} 26 | 27 | # Train network 28 | ${TRAIN} \ 29 | --projectDir ${DIR} \ 30 | --readDepth 20 \ 31 | --maf 0.05 \ 32 | --seed ${SEED} 33 | 34 | # Predict 35 | ${PREDICT} \ 36 | --pool ${POOL} \ 37 | --projectDir ${DIR} \ 38 | --seed ${SEED} 39 | 40 | # Parametric Bootstrapping 41 | ${BSCORRECT} \ 42 | --projectDir ${DIR} \ 43 | --seed ${SEED} 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kern Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without 
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup, find_packages 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | setup(name='ReLERNN', 8 | version='0.2', 9 | requires_python="<3.11", 10 | description='ReLERNN: Recombination Landscape Estimation using Recurrent Neural Networks', 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | url='https://github.com/kern-lab/ReLERNN/', 14 | author='Jeffrey Adrion, Jared Galloway, Andrew Kern', 15 | author_email='jeffreyadrion@gmail.com, jaredgalloway07@gmail.com, adk@uoregon.edu', 16 | license='MIT', 17 | packages=find_packages(exclude=[]), 18 | install_requires=[ 19 | "msprime>=0.7.4", 20 | "scikit-learn>=0.22.1", 21 | "matplotlib>=3.1.3", 22 | "scikit-allel>=1.2.1", 23 | "tensorflow==2.15.0"], 24 | scripts=[ 25 | "ReLERNN/ReLERNN_SIMULATE", 26 | "ReLERNN/ReLERNN_SIMULATE_POOL", 27 | "ReLERNN/ReLERNN_TRAIN", 28 | "ReLERNN/ReLERNN_TRAIN_POOL", 29 | "ReLERNN/ReLERNN_PREDICT", 30 | "ReLERNN/ReLERNN_PREDICT_POOL", 31 | "ReLERNN/ReLERNN_BSCORRECT"], 32 | zip_safe=False, 33 | setup_requires=[], 34 | ) 35 | 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vim 107 | *.swp 108 | *.swo 109 | -------------------------------------------------------------------------------- /ReLERNN/networks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Authors: Jeff Adrion, Andrew Kern, Jared Galloway 3 | ''' 4 | 5 | from ReLERNN.imports import * 6 | 7 | def GRU_TUNED84(x,y): 8 | ''' 9 | Same as GRU_VANILLA but with dropout AFTER each dense layer. 10 | ''' 11 | 12 | haps,pos = x 13 | 14 | numSNPs = haps[0].shape[0] 15 | numSamps = haps[0].shape[1] 16 | numPos = pos[0].shape[0] 17 | 18 | genotype_inputs = layers.Input(shape=(numSNPs,numSamps)) 19 | model = layers.Bidirectional(layers.GRU(84,return_sequences=False))(genotype_inputs) 20 | model = layers.Dense(256)(model) 21 | model = layers.Dropout(0.35)(model) 22 | 23 | #---------------------------------------------------- 24 | 25 | position_inputs = layers.Input(shape=(numPos,)) 26 | m2 = layers.Dense(256)(position_inputs) 27 | 28 | #---------------------------------------------------- 29 | 30 | 31 | model = layers.concatenate([model,m2]) 32 | model = layers.Dense(64)(model) 33 | model = layers.Dropout(0.35)(model) 34 | output = layers.Dense(1)(model) 35 | 36 | #---------------------------------------------------- 37 | 38 | model = Model(inputs=[genotype_inputs,position_inputs], outputs=[output]) 39 | model.compile(optimizer='Adam', loss='mse') 40 | model.summary() 41 | 42 | return model 43 | 44 | 45 | def GRU_POOLED(x,y): 46 | 47 | sites=x.shape[1] 48 | features=x.shape[2] 49 | 50 | genotype_inputs = layers.Input(shape=(sites,features)) 51 | model = layers.Bidirectional(layers.GRU(84,return_sequences=False))(genotype_inputs) 52 | model = layers.Dense(256)(model) 53 | model = layers.Dropout(0.35)(model) 54 | output = layers.Dense(1)(model) 55 | 56 | model = Model(inputs=[genotype_inputs], outputs=[output]) 57 | model.compile(optimizer='Adam', loss='mse') 58 | model.summary() 59 | 60 | return model 61 | 62 | 63 | def HOTSPOT_CLASSIFY(x,y): 64 | 65 | haps,pos = x 66 | 67 | numSNPs = haps[0].shape[0] 68 | numSamps = haps[0].shape[1] 69 | numPos = pos[0].shape[0] 70 | 71 | genotype_inputs = layers.Input(shape=(numSNPs,numSamps)) 72 | model = layers.Bidirectional(layers.GRU(84,return_sequences=False))(genotype_inputs) 73 | model = layers.Dense(256)(model) 74 | model = layers.Dropout(0.35)(model) 75 | 76 | 
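# As in GRU_TUNED84 above, the genotype branch (bidirectional GRU -> dense -> dropout)
# is joined by a second input branch below that embeds the vector of SNP positions.
# The two branches are then concatenated and passed through a final dense stack;
# the sigmoid output and binary cross-entropy loss make this a hotspot classifier
# rather than a recombination-rate regressor.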
#---------------------------------------------------- 77 | 78 | position_inputs = layers.Input(shape=(numPos,)) 79 | m2 = layers.Dense(256)(position_inputs) 80 | 81 | #---------------------------------------------------- 82 | 83 | 84 | model = layers.concatenate([model,m2]) 85 | model = layers.Dense(64)(model) 86 | model = layers.Dropout(0.35)(model) 87 | output = layers.Dense(1,activation='sigmoid')(model) 88 | 89 | #---------------------------------------------------- 90 | 91 | model = Model(inputs=[genotype_inputs,position_inputs], outputs=[output]) 92 | model.compile(optimizer='adam', loss='binary_crossentropy') 93 | model.summary() 94 | 95 | return model 96 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_TRAIN: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Trains a network on data simulated by ReLERNN_SIMULATE.py""" 3 | 4 | from ReLERNN.imports import * 5 | from ReLERNN.helpers import * 6 | from ReLERNN.sequenceBatchGenerator import * 7 | from ReLERNN.networks import * 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 13 | parser.add_argument('--nEpochs',dest='nEpochs',help='Maximum number of epochs to train (EarlyStopping is implemented for validation accuracy)', type=int, default=1000) 14 | parser.add_argument('--nValSteps',dest='nValSteps',help='Number of validation steps', type=int, default=20) 15 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=1) 16 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 17 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default=0) 18 | args = parser.parse_args() 19 | 20 | 21 | ## Set seed 22 | if args.seed: 23 | os.environ['PYTHONHASHSEED']=str(args.seed) 24 | random.seed(args.seed) 25 | np.random.seed(args.seed) 26 | 27 | 28 | ## Set number of cores 29 | nProc = args.nCPU 30 | 31 | 32 | ## Set up the directory structure to store the simulations data. 
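# The projectDir is expected to already contain the output of ReLERNN_SIMULATE:
# train/, vali/, and test/ (each with an info.p recording per-replicate
# segregating-site counts) and networks/ (containing windowSizes.txt).
# The trained model (model.json, weights.h5) and test results are written to networks/.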
33 | if not args.outDir: 34 | print("Warning: No project directory found, using current working directory.") 35 | projectDir = os.getcwd() 36 | else: 37 | projectDir = args.outDir 38 | trainDir = os.path.join(projectDir,"train") 39 | valiDir = os.path.join(projectDir,"vali") 40 | testDir = os.path.join(projectDir,"test") 41 | networkDir = os.path.join(projectDir,"networks") 42 | 43 | 44 | ## Define output files 45 | test_resultFile = os.path.join(networkDir,"testResults.p") 46 | test_resultFig = os.path.join(networkDir,"testResults.pdf") 47 | modelSave = os.path.join(networkDir,"model.json") 48 | weightsSave = os.path.join(networkDir,"weights.h5") 49 | 50 | 51 | ## Identify padding required 52 | maxSimS = 0 53 | winFILE=os.path.join(networkDir,"windowSizes.txt") 54 | with open(winFILE, "r") as fIN: 55 | for line in fIN: 56 | maxSimS=max([maxSimS, int(line.split()[5])]) 57 | maxSegSites = 0 58 | for ds in [trainDir,valiDir,testDir]: 59 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 60 | segSitesInDs = max(DsInfoDir["segSites"]) 61 | maxSegSites = max(maxSegSites,segSitesInDs) 62 | maxSegSites = max(maxSegSites, maxSimS) 63 | 64 | 65 | ## Set network parameters 66 | bds_train_params = { 67 | 'treesDirectory':trainDir, 68 | 'targetNormalization':"zscore", 69 | 'batchSize': 64, 70 | 'maxLen': maxSegSites, 71 | 'frameWidth': 5, 72 | 'shuffleInds':True, 73 | 'sortInds':False, 74 | 'center':False, 75 | 'ancVal':-1, 76 | 'padVal':0, 77 | 'derVal':1, 78 | 'realLinePos':True, 79 | 'posPadVal':0, 80 | 'seqD':None, 81 | 'seed':args.seed 82 | } 83 | 84 | 85 | ## Dump batch pars for bootstrap 86 | batchParsFILE=os.path.join(networkDir,"batchPars.p") 87 | with open(batchParsFILE, "wb") as fOUT: 88 | pickle.dump(bds_train_params,fOUT) 89 | 90 | 91 | bds_vali_params = copy.deepcopy(bds_train_params) 92 | bds_vali_params['treesDirectory'] = valiDir 93 | bds_vali_params['batchSize'] = 64 94 | 95 | bds_test_params = copy.deepcopy(bds_train_params) 96 | bds_test_params['treesDirectory'] = testDir 97 | DsInfoDir = pickle.load(open(os.path.join(testDir,"info.p"),"rb")) 98 | bds_test_params['batchSize'] = DsInfoDir["numReps"] 99 | bds_test_params['shuffleExamples'] = False 100 | 101 | 102 | ## Define sequence batch generator 103 | train_sequence = SequenceBatchGenerator(**bds_train_params) 104 | vali_sequence = SequenceBatchGenerator(**bds_vali_params) 105 | test_sequence = SequenceBatchGenerator(**bds_test_params) 106 | 107 | 108 | ## Train network 109 | runModels(ModelFuncPointer=GRU_TUNED84, 110 | ModelName="GRU_TUNED84", 111 | TrainDir=trainDir, 112 | TrainGenerator=train_sequence, 113 | ValidationGenerator=vali_sequence, 114 | TestGenerator=test_sequence, 115 | resultsFile=test_resultFile, 116 | network=[modelSave,weightsSave], 117 | numEpochs=args.nEpochs, 118 | validationSteps=args.nValSteps, 119 | nCPU=nProc, 120 | gpuID=args.gpuID) 121 | 122 | 123 | ## Plot results of predictions on test set 124 | plotResults(resultsFile=test_resultFile,saveas=test_resultFig) 125 | 126 | 127 | print("\n***ReLERNN_TRAIN.py FINISHED!***\n") 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_TRAIN_POOL: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Trains a network on data simulated by ReLERNN_SIMULATE_POOL.py""" 3 | 4 | from ReLERNN.imports import * 5 | from ReLERNN.helpers import * 6 | from ReLERNN.sequenceBatchGenerator import * 7 | 
from ReLERNN.networks import * 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 13 | parser.add_argument('--readDepth',dest='seqD',help='Mean read depth of the pool', type=int, default=0) 14 | parser.add_argument('--maf',dest='maf',help='discard simulated sites with allele frequencies < maf', type=float, default=0.05) 15 | parser.add_argument('--nEpochs',dest='nEpochs',help='Maximum number of epochs to train (EarlyStopping is implemented for validation accuracy)', type=int, default=1000) 16 | parser.add_argument('--nValSteps',dest='nValSteps',help='Number of validation steps', type=int, default=20) 17 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=None) 18 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 19 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default=0) 20 | args = parser.parse_args() 21 | 22 | 23 | ## Set seed 24 | if args.seed: 25 | os.environ['PYTHONHASHSEED']=str(args.seed) 26 | random.seed(args.seed) 27 | np.random.seed(args.seed) 28 | 29 | 30 | ## Set number of cores 31 | if args.nCPU: 32 | nProc = args.nCPU 33 | else: 34 | nProc = mp.cpu_count() 35 | 36 | 37 | print("Warning: training data to be treated as if generated by pool-seq") 38 | if args.seqD == 0: 39 | print("Error: assumed sequencing depth must be provided.") 40 | sys.exit(1) 41 | 42 | 43 | ## Set up the directory structure to store the simulations data. 44 | if not args.outDir: 45 | print("Warning: No project directory found, using current working directory.") 46 | projectDir = os.getcwd() 47 | else: 48 | projectDir = args.outDir 49 | trainDir = os.path.join(projectDir,"train") 50 | valiDir = os.path.join(projectDir,"vali") 51 | testDir = os.path.join(projectDir,"test") 52 | networkDir = os.path.join(projectDir,"networks") 53 | 54 | 55 | ## Define output files 56 | test_resultFile = os.path.join(networkDir,"testResults.p") 57 | test_resultFig = os.path.join(networkDir,"testResults.pdf") 58 | modelSave = os.path.join(networkDir,"model.json") 59 | weightsSave = os.path.join(networkDir,"weights.h5") 60 | 61 | 62 | ## Identify padding required 63 | maxSimS = 0 64 | winFILE=os.path.join(networkDir,"windowSizes.txt") 65 | with open(winFILE, "r") as fIN: 66 | for line in fIN: 67 | maxSimS=max([maxSimS, int(line.split()[5])]) 68 | maxSegSites = 0 69 | for ds in [trainDir,valiDir,testDir]: 70 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 71 | segSitesInDs = max(DsInfoDir["segSites"]) 72 | maxSegSites = max(maxSegSites,segSitesInDs) 73 | maxSegSites = max(maxSegSites, maxSimS) 74 | 75 | 76 | ## Set network parameters 77 | bds_train_params = { 78 | 'treesDirectory':trainDir, 79 | 'targetNormalization':"zscore", 80 | 'batchSize': 64, 81 | 'maxLen': maxSegSites, 82 | 'frameWidth': 5, 83 | 'shuffleInds':True, 84 | 'sortInds':False, 85 | 'center':False, 86 | 'ancVal':-1, 87 | 'padVal':0, 88 | 'derVal':1, 89 | 'realLinePos':True, 90 | 'posPadVal':0, 91 | 'seqD':args.seqD, 92 | 'maf':args.maf, 93 | 'seed':args.seed 94 | } 95 | 96 | 97 | ## Dump batch pars for bootstrap 98 | batchParsFILE=os.path.join(networkDir,"batchPars.p") 99 | with open(batchParsFILE, "wb") as fOUT: 100 | pickle.dump(bds_train_params,fOUT) 101 | 102 | 103 | bds_vali_params = 
copy.deepcopy(bds_train_params) 104 | bds_vali_params['treesDirectory'] = valiDir 105 | bds_vali_params['batchSize'] = 64 106 | 107 | bds_test_params = copy.deepcopy(bds_train_params) 108 | bds_test_params['treesDirectory'] = testDir 109 | DsInfoDir = pickle.load(open(os.path.join(testDir,"info.p"),"rb")) 110 | bds_test_params['batchSize'] = DsInfoDir["numReps"] 111 | bds_test_params['shuffleExamples'] = False 112 | 113 | 114 | ## Define sequence batch generator 115 | train_sequence = SequenceBatchGenerator(**bds_train_params) 116 | vali_sequence = SequenceBatchGenerator(**bds_vali_params) 117 | test_sequence = SequenceBatchGenerator(**bds_test_params) 118 | 119 | 120 | ## Train network 121 | runModels(ModelFuncPointer=GRU_POOLED, 122 | ModelName="GRU_POOLED", 123 | TrainDir=trainDir, 124 | TrainGenerator=train_sequence, 125 | ValidationGenerator=vali_sequence, 126 | TestGenerator=test_sequence, 127 | resultsFile=test_resultFile, 128 | network=[modelSave,weightsSave], 129 | numEpochs=args.nEpochs, 130 | validationSteps=args.nValSteps, 131 | nCPU=nProc, 132 | gpuID=args.gpuID) 133 | 134 | 135 | ## Plot results of predictions on test set 136 | plotResults(resultsFile=test_resultFile,saveas=test_resultFig) 137 | 138 | 139 | print("\n***ReLERNN_TRAIN_POOL.py FINISHED!***\n") 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_PREDICT_POOL: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Predicts the recombination rate for each genomic window in the POOL file 4 | using the network trained by ReLERNN_TRAIN_POOL.py 5 | """ 6 | 7 | from ReLERNN.imports import * 8 | from ReLERNN.helpers import * 9 | from ReLERNN.sequenceBatchGenerator import * 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-p','--pool',dest='pool',help='Filtered and QC-checked pool file') 15 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 16 | parser.add_argument('--minSites',dest='minS',help='Minimum number of SNPs in a genomic window required to return a prediction', type=int, default = 50) 17 | parser.add_argument('--batchSizeOverride',dest='bso',help='Batch size to use when number of windows along a chromosome for low memory applications', type=int, default = None) 18 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default = 0) 19 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 20 | args = parser.parse_args() 21 | 22 | 23 | ## Set seed 24 | if args.seed: 25 | os.environ['PYTHONHASHSEED']=str(args.seed) 26 | random.seed(args.seed) 27 | np.random.seed(args.seed) 28 | 29 | 30 | ## Set up the directory structure to store the simulations data. 
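# Predictions are made per chromosome from the split .pool files that
# ReLERNN_SIMULATE_POOL wrote to splitPOOLs/ (one whitespace-delimited line per
# site: chromosome, position, allele frequency), using the model.json and
# weights.h5 saved to networks/ by ReLERNN_TRAIN_POOL.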
31 | if not args.outDir: 32 | print("Warning: No project directory found, using current working directory.") 33 | projectDir = os.getcwd() 34 | else: 35 | projectDir = args.outDir 36 | trainDir = os.path.join(projectDir,"train") 37 | valiDir = os.path.join(projectDir,"vali") 38 | testDir = os.path.join(projectDir,"test") 39 | networkDir = os.path.join(projectDir,"networks") 40 | poolDir = os.path.join(projectDir,"splitPOOLs") 41 | modelSave = os.path.join(networkDir,"model.json") 42 | weightsSave = os.path.join(networkDir,"weights.h5") 43 | 44 | 45 | ## Read in the window sizes 46 | maxSimS = 0 47 | wins=[] 48 | winFILE=os.path.join(networkDir,"windowSizes.txt") 49 | with open(winFILE, "r") as fIN: 50 | for line in fIN: 51 | ar=line.split() 52 | wins.append([ar[0],int(ar[1]),int(ar[2]),int(ar[3]),int(ar[4]),int(ar[5]),int(ar[6])]) 53 | maxSimS=max([maxSimS, int(ar[5])]) 54 | 55 | 56 | ## Loop through chromosomes and predict 57 | for i in range(len(wins)): 58 | bn=os.path.basename(args.pool) 59 | poolFILE=os.path.join(poolDir,bn.replace(".pool","_%s.pool" %(wins[i][0]))) 60 | print("""Importing POOL: "%s"...""" %(poolFILE)) 61 | pos,fqs = [], [] 62 | with open(poolFILE, "r") as fIN: 63 | for line in fIN: 64 | ar = line.split() 65 | pos.append(int(ar[1])) 66 | fqs.append(float(ar[2])) 67 | chrom = ar[0] 68 | pos = np.array(pos) 69 | fqs = np.array(fqs) 70 | 71 | 72 | ## Identify padding required 73 | maxSegSites = 0 74 | for ds in [trainDir,valiDir,testDir]: 75 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 76 | segSitesInDs = max(DsInfoDir["segSites"]) 77 | maxSegSites = max(maxSegSites,segSitesInDs) 78 | maxSegSites = max(maxSegSites, maxSimS) 79 | 80 | 81 | ## Identify parameters used to train 82 | DsInfoDir = pickle.load(open(os.path.join(trainDir,"info.p"),"rb")) 83 | winLen=wins[i][2] 84 | numWins=wins[i][6] 85 | if args.bso: 86 | batchSize = args.bso 87 | else: 88 | batchSize = wins[i][6] 89 | 90 | 91 | batchPars = pickle.load(open(os.path.join(networkDir,"batchPars.p"),"rb")) 92 | normType = batchPars["targetNormalization"] 93 | 94 | 95 | ## Set network parameters 96 | bds_pred_params = { 97 | 'INFO':DsInfoDir, 98 | 'CHROM':chrom, 99 | 'winLen':winLen, 100 | 'numWins':numWins, 101 | 'IDs':get_index(pos,winLen), 102 | 'GT':fqs, 103 | 'POS':pos, 104 | 'batchSize': batchSize, 105 | 'maxLen': maxSegSites, 106 | 'frameWidth': 5, 107 | 'sortInds':False, 108 | 'center':False, 109 | 'ancVal':-1, 110 | 'padVal':0, 111 | 'derVal':1, 112 | 'realLinePos':True, 113 | 'posPadVal':0, 114 | 'normType':normType, 115 | 'seed':args.seed 116 | } 117 | 118 | 119 | ### Define sequence batch generator 120 | pool_gen = POOLBatchGenerator(**bds_pred_params) 121 | 122 | 123 | ## Load trained model and make predictions on pool data 124 | pred_resultFile = os.path.join(projectDir,wins[i][0]+".CHPREDICT.txt") 125 | load_and_predictVCF(VCFGenerator=pool_gen, 126 | resultsFile=pred_resultFile, 127 | network=[modelSave,weightsSave], 128 | chromStr=wins[i][0], 129 | minS=args.minS, 130 | numWins=numWins, 131 | batchSize=batchSize, 132 | gpuID=args.gpuID) 133 | 134 | 135 | ## Combine chromosome predictions in whole genome prediction file and rm chromosome files 136 | genPredFILE=os.path.join(projectDir,bn.replace(".pool",".PREDICT.txt")) 137 | files=[] 138 | for f in glob.glob(os.path.join(projectDir,"*.CHPREDICT.txt")): 139 | files.append(f) 140 | ct=0 141 | with open(genPredFILE, "w") as fOUT: 142 | for f in sorted(files): 143 | if ct==0: 144 | with open(f, "r") as fIN: 145 | for line in fIN: 146 | 
fOUT.write(line) 147 | else: 148 | with open(f, "r") as fIN: 149 | fIN.readline() 150 | for line in fIN: 151 | fOUT.write(line) 152 | ct+=1 153 | cmd="rm %s" %(f) 154 | os.system(cmd) 155 | 156 | 157 | print("\n***ReLERNN_PREDICT_POOL.py FINISHED!***\n") 158 | 159 | 160 | if __name__ == "__main__": 161 | main() 162 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_PREDICT: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Predicts the recombination rate for each genomic window in the VCF file 4 | using a GRU network trained in ReLERNN_TRAIN.py 5 | """ 6 | 7 | from ReLERNN.imports import * 8 | from ReLERNN.helpers import * 9 | from ReLERNN.sequenceBatchGenerator import * 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-v','--vcf',dest='vcf',help='Filtered and QC-checked VCF file. Important: Every row must correspond to a biallelic SNP with no missing data!') 15 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 16 | parser.add_argument('--phased',help='VCF file is phased',default=False, action='store_true') 17 | parser.add_argument('--unphased',dest='phased',help='VCF file is unphased',action='store_false') 18 | parser.add_argument('--minSites',dest='minS',help='Minimum number of SNPs in a genomic window required to return a prediction', type=int, default = 50) 19 | parser.add_argument('--batchSizeOverride',dest='bso',help='Batch size to use when number of windows along a chromosome for low memory applications', type=int, default = None) 20 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default = 0) 21 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 22 | args = parser.parse_args() 23 | 24 | 25 | ## Set seed 26 | if args.seed: 27 | os.environ['PYTHONHASHSEED']=str(args.seed) 28 | random.seed(args.seed) 29 | np.random.seed(args.seed) 30 | 31 | 32 | ## Set up the directory structure to store the simulations data. 
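# Predictions are made per chromosome from the HDF5 files in splitVCFs/ written
# during ReLERNN_SIMULATE (genotypes are read back with scikit-allel), and the
# per-window padding is matched to the training simulations so that input
# shapes agree with the network trained by ReLERNN_TRAIN.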
33 | if not args.outDir: 34 | print("Warning: No project directory found, using current working directory.") 35 | projectDir = os.getcwd() 36 | else: 37 | projectDir = args.outDir 38 | trainDir = os.path.join(projectDir,"train") 39 | valiDir = os.path.join(projectDir,"vali") 40 | testDir = os.path.join(projectDir,"test") 41 | networkDir = os.path.join(projectDir,"networks") 42 | vcfDir = os.path.join(projectDir,"splitVCFs") 43 | modelSave = os.path.join(networkDir,"model.json") 44 | weightsSave = os.path.join(networkDir,"weights.h5") 45 | 46 | 47 | ## Read in the window sizes 48 | maxSimS = 0 49 | wins=[] 50 | winFILE=os.path.join(networkDir,"windowSizes.txt") 51 | with open(winFILE, "r") as fIN: 52 | for line in fIN: 53 | ar=line.split() 54 | wins.append([ar[0],int(ar[1]),int(ar[2]),int(ar[3]),int(ar[4]),int(ar[5]),int(ar[6])]) 55 | maxSimS=max([maxSimS, int(ar[5])]) 56 | 57 | 58 | ## Loop through chromosomes and predict 59 | pred_resultFiles = [] 60 | for i in range(len(wins)): 61 | ## Read in the hdf5 62 | bn=os.path.basename(args.vcf) 63 | h5FILE=os.path.join(vcfDir,bn.replace(".vcf","_%s.hdf5" %(wins[i][0]))) 64 | print("""Importing HDF5: "%s"...""" %(h5FILE)) 65 | callset=h5py.File(h5FILE, mode="r") 66 | var=allel.VariantChunkedTable(callset["variants"],names=["CHROM","POS"], index="POS") 67 | chroms=var["CHROM"] 68 | pos=var["POS"] 69 | genos=allel.GenotypeChunkedArray(callset["calldata"]["GT"]) 70 | 71 | 72 | ## Identify padding required 73 | maxSegSites = 0 74 | for ds in [trainDir,valiDir,testDir]: 75 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 76 | segSitesInDs = max(DsInfoDir["segSites"]) 77 | maxSegSites = max(maxSegSites,segSitesInDs) 78 | maxSegSites = max(maxSegSites, maxSimS) 79 | 80 | 81 | ## Identify parameters used to train 82 | DsInfoDir = pickle.load(open(os.path.join(trainDir,"info.p"),"rb")) 83 | winLen=wins[i][2] 84 | numWins=wins[i][6] 85 | if args.bso: 86 | batchSize = args.bso 87 | else: 88 | batchSize = wins[i][6] 89 | 90 | 91 | ## Set network parameters 92 | bds_pred_params = { 93 | 'INFO':DsInfoDir, 94 | 'CHROM':chroms[0], 95 | 'winLen':winLen, 96 | 'numWins':numWins, 97 | 'IDs':get_index(pos,winLen), 98 | 'GT':genos, 99 | 'POS':pos, 100 | 'batchSize': batchSize, 101 | 'maxLen': maxSegSites, 102 | 'frameWidth': 5, 103 | 'sortInds':False, 104 | 'center':False, 105 | 'ancVal':-1, 106 | 'padVal':0, 107 | 'derVal':1, 108 | 'realLinePos':True, 109 | 'posPadVal':0, 110 | 'phase':args.phased, 111 | 'seed':args.seed 112 | } 113 | 114 | 115 | ### Define sequence batch generator 116 | vcf_gen = VCFBatchGenerator(**bds_pred_params) 117 | 118 | 119 | ## Load trained model and make predictions on VCF data 120 | pred_resultFile = os.path.join(projectDir,wins[i][0]+".CHPREDICT.txt") 121 | pred_resultFiles.append(pred_resultFile) 122 | load_and_predictVCF(VCFGenerator=vcf_gen, 123 | resultsFile=pred_resultFile, 124 | network=[modelSave,weightsSave], 125 | chromStr=wins[i][0], 126 | minS=args.minS, 127 | numWins=numWins, 128 | batchSize=batchSize, 129 | gpuID=args.gpuID) 130 | 131 | 132 | ## Combine chromosome predictions in whole genome prediction file and rm chromosome files 133 | genPredFILE=os.path.join(projectDir,bn.replace(".vcf",".PREDICT.txt")) 134 | ct=0 135 | with open(genPredFILE, "w") as fOUT: 136 | for f in pred_resultFiles: 137 | if ct==0: 138 | with open(f, "r") as fIN: 139 | for line in fIN: 140 | fOUT.write(line) 141 | else: 142 | with open(f, "r") as fIN: 143 | fIN.readline() 144 | for line in fIN: 145 | fOUT.write(line) 146 | ct+=1 147 | 
cmd="rm %s" %(f) 148 | os.system(cmd) 149 | 150 | 151 | print("\n***ReLERNN_PREDICT.py FINISHED!***\n") 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_BSCORRECT: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Performs a parametric bootstrap to assess any potential bias in recombination rate predictions. 4 | Corrects for this bias and adds 95% confidence intevals to the predictions 5 | """ 6 | 7 | 8 | from ReLERNN.imports import * 9 | from ReLERNN.helpers import * 10 | from ReLERNN.simulator import * 11 | from ReLERNN.sequenceBatchGenerator import * 12 | 13 | 14 | def ParametricBootStrap(simParameters, 15 | batchParameters, 16 | trainDir, 17 | network=None, 18 | slices=1000, 19 | repsPerSlice=1000, 20 | gpuID=0, 21 | tempDir="./Temp", 22 | out="./ParametricBootstrap.p", 23 | nCPU=1, 24 | seed=None): 25 | 26 | 27 | ''' 28 | This Function is for understanding network confidense 29 | over a range of rho, using a parametric bootstrap. 30 | 31 | SIDE NOTE: This will create a "temp" directory for filling 32 | writing and re-writing the test sets. 33 | after, it will destroy the tempDir. 34 | 35 | The basic idea being that we take a trained network, 36 | and iteritevly create test sets of simulation at steps which increase 37 | between fixed ranges of Rho. 38 | 39 | This function will output a pickle file containing 40 | a dictionary where the first 41 | 42 | This function will output a pickle file containing 43 | a dictionary where the ["rho"] key contains the slices 44 | between the values of rho where we simulate a test set, 45 | and test the trained model. 46 | 47 | The rest of the ket:value pairs in the dictionary contain 48 | the quartile information at each slice position for the 49 | distribution of test results 50 | ''' 51 | 52 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpuID) 53 | 54 | # load json and create model 55 | if(network != None): 56 | jsonFILE = open(network[0],"r") 57 | loadedModel = jsonFILE.read() 58 | jsonFILE.close() 59 | model=model_from_json(loadedModel) 60 | model.load_weights(network[1]) 61 | else: 62 | print("Error: no pretrained network found!") 63 | 64 | if not os.path.exists(tempDir): 65 | os.makedirs(tempDir) 66 | 67 | priorLowsRho = simParameters['priorLowsRho'] 68 | priorHighsRho = simParameters['priorHighsRho'] 69 | 70 | rhoDiff = (priorHighsRho - priorLowsRho)/slices 71 | IQR = {"rho":[],"Min":[],"CI95LO":[],"Q1":[],"Q2":[],"Q3":[],"CI95HI":[],"Max":[]} 72 | rho = [(priorLowsRho+(rhoDiff*i)) for i in range(slices)] 73 | IQR["rho"] = rho 74 | 75 | mean,sd,pad = getMeanSDMax(trainDir) 76 | 77 | for idx,r in enumerate(rho): 78 | print("Simulating slice ",idx," out of ",slices) 79 | 80 | params = copy.deepcopy(simParameters) 81 | params["priorLowsRho"] = r 82 | params["priorHighsRho"] = r 83 | params["seed"] = seed 84 | params.pop("bn", None) 85 | simulator = Simulator(**params) 86 | 87 | simulator.simulateAndProduceTrees(numReps=repsPerSlice, 88 | direc=tempDir, 89 | simulator="msprime", 90 | nProc=nCPU) 91 | 92 | batch_params = copy.deepcopy(batchParameters) 93 | batch_params['treesDirectory'] = tempDir 94 | batch_params['batchSize'] = repsPerSlice 95 | batch_params['shuffleExamples'] = False 96 | batch_params['seed'] = seed 97 | batchGenerator= SequenceBatchGenerator(**batch_params) 98 | 99 | x,y = batchGenerator.__getitem__(0) 100 | predictions = 
unNormalize(mean,sd,model.predict(x)) 101 | predictions = [p[0] for p in predictions] 102 | 103 | minP,maxP = min(predictions),max(predictions) 104 | quartiles = np.percentile(predictions,[2.5,25,50,75,97.5]) 105 | 106 | IQR["Min"].append(relu(minP)) 107 | IQR["Max"].append(relu(maxP)) 108 | IQR["CI95LO"].append(relu(quartiles[0])) 109 | IQR["Q1"].append(relu(quartiles[1])) 110 | IQR["Q2"].append(relu(quartiles[2])) 111 | IQR["Q3"].append(relu(quartiles[3])) 112 | IQR["CI95HI"].append(relu(quartiles[4])) 113 | 114 | del simulator 115 | del batchGenerator 116 | 117 | pickle.dump(IQR,open(out,"wb")) 118 | 119 | return rho,IQR 120 | 121 | 122 | def main(): 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 125 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=None) 126 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 127 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default=0) 128 | parser.add_argument('--nSlice',dest='nSlice',help='Number of recombination rate bins to simulate over', type=int, default=100) 129 | parser.add_argument('--nReps',dest='nReps',help='Number of simulations per step', type=int, default=1000) 130 | args = parser.parse_args() 131 | 132 | 133 | ## Set seed 134 | if args.seed: 135 | os.environ['PYTHONHASHSEED']=str(args.seed) 136 | random.seed(args.seed) 137 | np.random.seed(args.seed) 138 | 139 | 140 | ## Set number of cores 141 | if args.nCPU: 142 | nProc = args.nCPU 143 | else: 144 | nProc = mp.cpu_count() 145 | 146 | 147 | ## Set up the directory structure and output files 148 | if not args.outDir: 149 | print("Warning: No project directory found, using current working directory.") 150 | projectDir = os.getcwd() 151 | else: 152 | projectDir = args.outDir 153 | trainDir = os.path.join(projectDir,"train") 154 | valiDir = os.path.join(projectDir,"vali") 155 | testDir = os.path.join(projectDir,"test") 156 | networkDir = os.path.join(projectDir,"networks") 157 | bs_resultFile = os.path.join(networkDir,"bootstrapResults.p") 158 | bs_plotFile = os.path.join(networkDir,"bootstrapPlot.pdf") 159 | modelWeights = [os.path.join(networkDir,"model.json"),os.path.join(networkDir,"weights.h5")] 160 | bs_resultFile = os.path.join(networkDir,"bootstrapResults.p") 161 | bsDir = os.path.join(projectDir,"PBS") 162 | 163 | 164 | ## Load simulation and batch pars 165 | simParsFILE=os.path.join(networkDir,"simPars.p") 166 | batchParsFILE=os.path.join(networkDir,"batchPars.p") 167 | with open(simParsFILE, "rb") as fIN: 168 | simPars=pickle.load(fIN) 169 | with open(batchParsFILE, "rb") as fIN: 170 | batchPars=pickle.load(fIN) 171 | pred_resultFiles = [] 172 | for f in glob.glob(os.path.join(projectDir,"*.PREDICT.txt")): 173 | pred_resultFiles.append(f) 174 | if len(pred_resultFiles) < 1: 175 | print("Error: no .PREDICT.txt file found. 
You must run ReLERNN_PREDICT.py prior to running ReLERNN_BSCORRECT.py") 176 | sys.exit(1) 177 | elif len(pred_resultFiles) > 1: 178 | print("Error: multiple prediction files found.") 179 | sys.exit(1) 180 | pred_resultFile = pred_resultFiles[0] 181 | 182 | 183 | ## Run parametric bootstrap 184 | ParametricBootStrap( 185 | simPars, 186 | batchPars, 187 | trainDir, 188 | network=modelWeights, 189 | slices=args.nSlice, 190 | repsPerSlice=args.nReps, 191 | gpuID=args.gpuID, 192 | out=bs_resultFile, 193 | tempDir=bsDir, 194 | nCPU=nProc, 195 | seed=args.seed) 196 | 197 | 198 | ## Plot results from bootstrap 199 | plotParametricBootstrap(bs_resultFile,bs_plotFile) 200 | 201 | 202 | ## Load bootstrap values 203 | with open(bs_resultFile, "rb") as fIN: 204 | bs=pickle.load(fIN) 205 | 206 | 207 | ## Loop, correct, and write output 208 | correctedfile=pred_resultFile.replace(".txt", ".BSCORRECTED.txt") 209 | with open(correctedfile, "w") as fout, open(pred_resultFile, "r") as fin: 210 | for line in fin: 211 | if not line.startswith("chrom\t"): 212 | ar=line.split() 213 | rate=float(ar[4]) 214 | C=get_corrected(rate,bs) 215 | ar[4]=C[0] 216 | ar.extend([C[1],C[2]]) 217 | fout.write("\t".join([str(x) for x in ar])+"\n") 218 | else: 219 | #fout.write(line) 220 | fout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %("chrom","start","end","nSites","recombRate","CI95LO","CI95HI")) 221 | 222 | 223 | ## Remove the bootstrap tree files 224 | shutil.rmtree(bsDir) 225 | print("\n***ReLERNN_BSCORRECT.py FINISHED!***\n") 226 | 227 | 228 | if __name__ == "__main__": 229 | main() 230 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_SIMULATE_POOL: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Reads a POOL file, estimates some simulation parameters, and simulates via msprime. 4 | NOTE: This assumes that the user has previously QC'd and filtered the POOL. 5 | """ 6 | 7 | from ReLERNN.imports import * 8 | from ReLERNN.helpers import * 9 | from ReLERNN.manager import * 10 | from ReLERNN.simulator import * 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-p','--pool',dest='pool',help='Filtered and QC-checked pool file') 16 | parser.add_argument('--sampleDepth',dest='samD',help='Number of chromosomes in pool', type=int) 17 | parser.add_argument('-g','--genome',dest='genome',help='BED-formatted (i.e. zero-based) file corresponding to chromosomes and positions to evaluate') 18 | parser.add_argument('-m','--mask',dest='mask',help='BED-formatted file corresponding to inaccessible bases', default=None) 19 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. 
NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 20 | parser.add_argument('-n','--demographicHistory',dest='dem',help='Output file from either stairwayplot, SMC++, or MSMC',default=None) 21 | parser.add_argument('-u','--assumedMu',dest='mu',help='Assumed per-base mutation rate',type=float,default=1e-8) 22 | parser.add_argument('-l','--assumedGenTime',dest='genTime',help='Assumed generation time (in years)',type=float) 23 | parser.add_argument('-r','--upperRhoThetaRatio',dest='upRTR',help='Assumed upper bound for the ratio of rho to theta',type=float,default=1.0) 24 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=None) 25 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 26 | parser.add_argument('--maxSites',dest='winSizeMx',help='Max number of sites per window to train on. Important: too many sites causes problems in training (see README)!',type=int,default=1750) 27 | parser.add_argument('--forceWinSize',dest='forceWinSize',help='USED ONLY FOR TESTING, LEAVE AS DEFAULT',type=int,default=0) 28 | parser.add_argument('--maskThresh',dest='maskThresh',help='Discard windows where >= maskThresh percent of sites are inaccessible',type=float,default=1.0) 29 | parser.add_argument('--nTrain',dest='nTrain',help='Number of training examples to simulate',type=int,default=100000) 30 | parser.add_argument('--nVali',dest='nVali',help='Number of validation examples to simulate',type=int,default=1000) 31 | parser.add_argument('--nTest',dest='nTest',help='Number of test examples to simulate',type=int,default=1000) 32 | args = parser.parse_args() 33 | 34 | 35 | ## Set seed 36 | if args.seed: 37 | os.environ['PYTHONHASHSEED']=str(args.seed) 38 | random.seed(args.seed) 39 | np.random.seed(args.seed) 40 | 41 | 42 | ## Set number of cores 43 | if args.nCPU: 44 | nProc = args.nCPU 45 | else: 46 | nProc = mp.cpu_count() 47 | 48 | 49 | ## Ensure all required arguments are provided 50 | if not args.samD: 51 | print("Error: assumed sample depth must be provided") 52 | sys.exit(1) 53 | if not args.pool.endswith(".pool"): 54 | print('Error: POOL file must end in extension ".pool"') 55 | sys.exit(1) 56 | if not args.outDir: 57 | print("Warning: No project directory found, using current working directory.") 58 | projectDir = os.getcwd() 59 | else: 60 | projectDir = args.outDir 61 | if not args.mask: 62 | print("Warning: no accessibility mask found. All sites in the genome are assumed to be accessible.") 63 | if args.dem: 64 | demHist = check_demHist(args.dem) 65 | if demHist == -9: 66 | print("Error: demographicHistory file must be raw output from either stairwayplot, SMC++, or MSMC") 67 | sys.exit(1) 68 | if not args.genTime: 69 | print("Error: assumed generation time must be supplied when simulating under stairwayplot, SMC++, or MSMC") 70 | sys.exit(1) 71 | else: 72 | print("Warning: no demographic history file found. All training data will be simulated under demographic equilibrium.") 73 | demHist = 0 74 | 75 | 76 | ## Set up the directory structure to store the simulations data. 
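# The short pause below appears to give the user a moment to read the warnings
# printed above before setup continues. The directories created here mirror the
# VCF workflow: train/, vali/, test/, networks/, plus splitPOOLs/ for the
# per-chromosome pool files.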
77 | time.sleep(5) 78 | trainDir = os.path.join(projectDir,"train") 79 | valiDir = os.path.join(projectDir,"vali") 80 | testDir = os.path.join(projectDir,"test") 81 | networkDir = os.path.join(projectDir,"networks") 82 | poolDir = os.path.join(projectDir,"splitPOOLs") 83 | 84 | 85 | ## Make directories if they do not exist 86 | for p in [projectDir,trainDir,valiDir,testDir,networkDir,poolDir]: 87 | if not os.path.exists(p): 88 | os.makedirs(p) 89 | 90 | 91 | ## Read the genome file 92 | chromosomes = [] 93 | with open(args.genome, "r") as fIN: 94 | for line in fIN: 95 | ar = line.split() 96 | if len(ar)!=3: 97 | print("Error: genome file must be formatted as a bed file (i.e.'chromosome start end')") 98 | sys.exit(1) 99 | chromosomes.append("{}:{}-{}".format(ar[0],ar[1],ar[2])) 100 | 101 | 102 | ## Pass params to the manager 103 | manager_params = { 104 | 'pool':args.pool, 105 | 'mask':args.mask, 106 | 'winSizeMx':args.winSizeMx, 107 | 'forceWinSize':args.forceWinSize, 108 | 'chromosomes':chromosomes, 109 | 'poolDir':poolDir, 110 | 'projectDir':projectDir, 111 | 'networkDir':networkDir, 112 | 'seed':args.seed 113 | } 114 | pool_manager = Manager(**manager_params) 115 | 116 | 117 | ## Split the pool file 118 | pool_manager.splitPOOL(nProc=nProc) 119 | 120 | 121 | ## Calculate nSites per window 122 | wins, nSamps, maxS, maxLen = pool_manager.countSitesPOOL(samD=args.samD, nProc=nProc) 123 | 124 | 125 | ## Prepare the accessibility mask 126 | if args.mask: 127 | mask_fraction, win_masks = pool_manager.maskWins(wins=wins, maxLen=maxLen, nProc=nProc) 128 | else: 129 | mask_fraction, win_masks = 0.0, None 130 | 131 | 132 | ## Define parameters for msprime simulation 133 | print("Simulating with window size = {} bp.".format(maxLen)) 134 | a=0 135 | for i in range(nSamps-1): 136 | a+=1/(i+1) 137 | thetaW=maxS/a 138 | assumedMu = args.mu 139 | Ne=thetaW/(4.0 * assumedMu * ((1-mask_fraction) * maxLen)) 140 | rhoHi=assumedMu*args.upRTR 141 | if demHist: 142 | MspD = convert_demHist(args.dem, nSamps, args.genTime, demHist) 143 | dg_params = { 144 | 'priorLowsRho':0.0, 145 | 'priorHighsRho':rhoHi, 146 | 'priorLowsMu':assumedMu * 0.66, 147 | 'priorHighsMu':assumedMu * 1.33, 148 | 'ChromosomeLength':maxLen, 149 | 'winMasks':win_masks, 150 | 'maskThresh':args.maskThresh, 151 | 'MspDemographics':MspD, 152 | 'seed':args.seed 153 | } 154 | 155 | else: 156 | dg_params = {'N':nSamps, 157 | 'Ne':Ne, 158 | 'priorLowsRho':0.0, 159 | 'priorHighsRho':rhoHi, 160 | 'priorLowsMu':assumedMu * 0.66, 161 | 'priorHighsMu':assumedMu * 1.33, 162 | 'ChromosomeLength':maxLen, 163 | 'winMasks':win_masks, 164 | 'maskThresh':args.maskThresh, 165 | 'seed':args.seed 166 | } 167 | 168 | 169 | # Assign pars for each simulation 170 | dg_train = Simulator(**dg_params) 171 | dg_vali = Simulator(**dg_params) 172 | dg_test = Simulator(**dg_params) 173 | 174 | 175 | ## Dump simulation pars for use with parametric bootstrap 176 | simParsFILE=os.path.join(networkDir,"simPars.p") 177 | with open(simParsFILE, "wb") as fOUT: 178 | dg_params["bn"]=os.path.basename(args.pool).replace(".pool","") 179 | pickle.dump(dg_params,fOUT) 180 | 181 | 182 | ## Simulate data 183 | print("\nTraining set:") 184 | dg_train.simulateAndProduceTrees(numReps=args.nTrain,direc=trainDir,simulator="msprime",nProc=nProc) 185 | print("Validation set:") 186 | dg_vali.simulateAndProduceTrees(numReps=args.nVali,direc=valiDir,simulator="msprime",nProc=nProc) 187 | print("Test set:") 188 | 
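# A worked sketch of the Ne heuristic computed above (numbers are hypothetical):
# with nSamps = 20 chromosomes, a = sum_{i=1}^{19} 1/i ~= 3.548, so maxS = 1000
# segregating sites gives thetaW = 1000 / 3.548 ~= 281.9 per window. With
# assumedMu = 1e-8 and a fully accessible window of 1,000,000 bp,
# Ne = thetaW / (4 * mu * L) = 281.9 / (4 * 1e-8 * 1e6) ~= 7,047.
# Recombination rates are then drawn from [0, assumedMu * upperRhoThetaRatio]
# for the dg_train/dg_vali/dg_test Simulator objects.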
dg_test.simulateAndProduceTrees(numReps=args.nTest,direc=testDir,simulator="msprime",nProc=nProc) 189 | print("\nSIMULATIONS FINISHED!\n") 190 | 191 | 192 | ## Count number of segregating sites in simulation 193 | SS=[] 194 | maxSegSites = 0 195 | minSegSites = float("inf") 196 | for ds in [trainDir,valiDir,testDir]: 197 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 198 | SS.extend(DsInfoDir["segSites"]) 199 | segSitesInDs = max(DsInfoDir["segSites"]) 200 | segSitesInDsMin = min(DsInfoDir["segSites"]) 201 | maxSegSites = max(maxSegSites,segSitesInDs) 202 | minSegSites = min(minSegSites,segSitesInDsMin) 203 | 204 | 205 | ## Compare counts of segregating sites between simulations and input pool file 206 | print("SANITY CHECK") 207 | print("====================") 208 | print("numSegSites\t\t\tMin\tMean\tMax") 209 | print("Simulated:\t\t\t%s\t%s\t%s" %(minSegSites, int(sum(SS)/float(len(SS))), maxSegSites)) 210 | for i in range(len(wins)): 211 | print("Input %s:\t\t%s\t%s\t%s" %(wins[i][0],wins[i][3],wins[i][4],wins[i][5])) 212 | print("\n\n***ReLERNN_SIMULATE_POOL.py FINISHED!***\n") 213 | 214 | 215 | if __name__ == "__main__": 216 | main() 217 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_SIMULATE: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Reads a VCF file, estimates some simulation parameters, and simulates via msprime. 4 | NOTE: This assumes that the user has previously QC'd and filtered the VCF. 5 | """ 6 | 7 | from ReLERNN.imports import * 8 | from ReLERNN.helpers import * 9 | from ReLERNN.manager import * 10 | from ReLERNN.simulator import * 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-v','--vcf',dest='vcf',help='Filtered and QC-checked VCF file. Important: Every row must correspond to a biallelic SNP') 16 | parser.add_argument('-g','--genome',dest='genome',help='BED-formatted (i.e. zero-based) file corresponding to chromosomes and positions to evaluate') 17 | parser.add_argument('-m','--mask',dest='mask',help='BED-formatted file corresponding to inaccessible bases', default=None) 18 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. 
NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 19 | parser.add_argument('-n','--demographicHistory',dest='dem',help='Output file from either stairwayplot, SMC++, or MSMC',default=None) 20 | parser.add_argument('-u','--assumedMu',dest='mu',help='Assumed per-base mutation rate',type=float,default=1e-8) 21 | parser.add_argument('-l','--assumedGenTime',dest='genTime',help='Assumed generation time (in years)',type=float) 22 | parser.add_argument('-r','--upperRhoThetaRatio',dest='upRTR',help='Assumed upper bound for the ratio of rho to theta',type=float,default=1.0) 23 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=None) 24 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 25 | parser.add_argument('--phased',help='Treat genotypes as phased',default=False, action='store_true') 26 | parser.add_argument('--unphased',dest='phased',help='Treat genotypes as unphased',action='store_false') 27 | parser.add_argument('--forceDiploid',help='Treat all samples as diploids with missing data (bad idea; see README)',default=False, action='store_true') 28 | parser.add_argument('--phaseError',dest='phaseError',help='Fraction of bases simulated with incorrect phasing',type=float,default=0.0) 29 | parser.add_argument('--maxSites',dest='winSizeMx',help='Max number of sites per window to train on. Important: too many sites causes problems in training (see README)!',type=int,default=1750) 30 | parser.add_argument('--forceWinSize',dest='forceWinSize',help='USED ONLY FOR TESTING, LEAVE AS DEFAULT',type=int,default=0) 31 | parser.add_argument('--maskThresh',dest='maskThresh',help='Discard windows where >= maskThresh percent of sites are inaccessible',type=float,default=1.0) 32 | parser.add_argument('--nTrain',dest='nTrain',help='Number of training examples to simulate',type=int,default=100000) 33 | parser.add_argument('--nVali',dest='nVali',help='Number of validation examples to simulate',type=int,default=1000) 34 | parser.add_argument('--nTest',dest='nTest',help='Number of test examples to simulate',type=int,default=1000) 35 | args = parser.parse_args() 36 | 37 | 38 | ## Set seed 39 | if args.seed: 40 | os.environ['PYTHONHASHSEED']=str(args.seed) 41 | random.seed(args.seed) 42 | np.random.seed(args.seed) 43 | 44 | 45 | ## Set number of cores 46 | if args.nCPU: 47 | nProc = args.nCPU 48 | else: 49 | nProc = mp.cpu_count() 50 | 51 | 52 | ## Ensure all required arguments are provided 53 | if not args.vcf.endswith(".vcf"): 54 | print('Error: VCF file must end in extension ".vcf"') 55 | sys.exit(1) 56 | if not args.outDir: 57 | print("Warning: No project directory found, using current working directory.") 58 | projectDir = os.getcwd() 59 | else: 60 | projectDir = args.outDir 61 | if not args.mask: 62 | print("Warning: no accessibility mask found. All sites in the genome are assumed to be accessible.") 63 | if args.dem: 64 | demHist = check_demHist(args.dem) 65 | if demHist == -9: 66 | print("Error: demographicHistory file must be raw output from either stairwayplot, SMC++, or MSMC") 67 | print("If using SMC++, file must be in *.csv format (use option -c in SMC++)") 68 | sys.exit(1) 69 | if not args.genTime: 70 | print("Error: assumed generation time must be supplied when simulating under stairwayplot, SMC++, or MSMC") 71 | sys.exit(1) 72 | else: 73 | print("Warning: no demographic history file found. 
All training data will be simulated under demographic equilibrium.") 74 | demHist = 0 75 | if not args.phased and args.phaseError != 0.0: 76 | print("Error: non-zero 'phaseError' cannot be used in conjunction with '--unphased'") 77 | sys.exit(1) 78 | if args.forceDiploid: 79 | print("Warning: all haploid/hemizygous samples will be treated as diploid samples with missing data!\n", 80 | "If you want to treat haploid/hemizygous samples and haploids without missing data, quit now, ensure no diploid samples are found in this VCF, and rerun without the option `--forceDiploid`.") 81 | time.sleep(10) 82 | else: 83 | time.sleep(5) 84 | 85 | 86 | ## Set up the directory structure to store the simulations data. 87 | trainDir = os.path.join(projectDir,"train") 88 | valiDir = os.path.join(projectDir,"vali") 89 | testDir = os.path.join(projectDir,"test") 90 | networkDir = os.path.join(projectDir,"networks") 91 | vcfDir = os.path.join(projectDir,"splitVCFs") 92 | 93 | 94 | ## Make directories if they do not exist 95 | for p in [projectDir,trainDir,valiDir,testDir,networkDir,vcfDir]: 96 | if not os.path.exists(p): 97 | os.makedirs(p) 98 | 99 | 100 | ## Read the genome file 101 | chromosomes = [] 102 | with open(args.genome, "r") as fIN: 103 | for line in fIN: 104 | ar = line.split() 105 | if len(ar)!=3: 106 | print("Error: genome file must be formatted as a bed file (i.e.'chromosome start end')") 107 | sys.exit(1) 108 | chromosomes.append("{}:{}-{}".format(ar[0],ar[1],ar[2])) 109 | 110 | 111 | ## Pass params to the vcf manager 112 | manager_params = { 113 | 'vcf':args.vcf, 114 | 'mask':args.mask, 115 | 'winSizeMx':args.winSizeMx, 116 | 'forceWinSize':args.forceWinSize, 117 | 'forceDiploid':args.forceDiploid, 118 | 'chromosomes':chromosomes, 119 | 'vcfDir':vcfDir, 120 | 'projectDir':projectDir, 121 | 'networkDir':networkDir, 122 | 'seed':args.seed 123 | } 124 | vcf_manager = Manager(**manager_params) 125 | 126 | 127 | ## Split the VCF file 128 | vcf_manager.splitVCF(nProc=nProc) 129 | 130 | 131 | ## Calculate nSites per window 132 | wins, nSamps, maxS, maxLen = vcf_manager.countSites(nProc=nProc) 133 | 134 | 135 | ## Prepare the accessibility mask 136 | if args.mask: 137 | mask_fraction, win_masks = vcf_manager.maskWins(wins=wins, maxLen=maxLen, nProc=nProc) 138 | else: 139 | mask_fraction, win_masks = 0.0, None 140 | 141 | 142 | ## Prepare the missing data mask 143 | md_mask, mask_files = None, [] 144 | for FILE in glob.glob(os.path.join(vcfDir, "*_md_mask.hdf5")): 145 | mask_files.append(FILE) 146 | md_mask = [] 147 | for FILE in mask_files: 148 | print("Reading HDF5 mask: {}...".format(FILE)) 149 | with h5py.File(FILE, "r") as hf: 150 | md_mask.append(hf["mask"][:]) 151 | if md_mask: 152 | md_mask = np.concatenate(md_mask) 153 | 154 | 155 | ## Define parameters for msprime simulation 156 | print("Simulating with window size = {} bp.".format(maxLen)) 157 | a=0 158 | for i in range(nSamps-1): 159 | a+=1/(i+1) 160 | thetaW=maxS/a 161 | assumedMu = args.mu 162 | Ne=thetaW/(4.0 * assumedMu * ((1-mask_fraction) * maxLen)) 163 | rhoHi=assumedMu*args.upRTR 164 | if demHist: 165 | MspD = convert_demHist(args.dem, nSamps, args.genTime, demHist, assumedMu) 166 | dg_params = { 167 | 'priorLowsRho':0.0, 168 | 'priorHighsRho':rhoHi, 169 | 'priorLowsMu':assumedMu * 0.66, 170 | 'priorHighsMu':assumedMu * 1.33, 171 | 'ChromosomeLength':maxLen, 172 | 'winMasks':win_masks, 173 | 'mdMask':md_mask, 174 | 'maskThresh':args.maskThresh, 175 | 'phased':args.phased, 176 | 'phaseError':args.phaseError, 177 | 
'MspDemographics':MspD, 178 | 'seed':args.seed 179 | } 180 | 181 | else: 182 | dg_params = {'N':nSamps, 183 | 'Ne':Ne, 184 | 'priorLowsRho':0.0, 185 | 'priorHighsRho':rhoHi, 186 | 'priorLowsMu':assumedMu * 0.66, 187 | 'priorHighsMu':assumedMu * 1.33, 188 | 'ChromosomeLength':maxLen, 189 | 'winMasks':win_masks, 190 | 'mdMask':md_mask, 191 | 'maskThresh':args.maskThresh, 192 | 'phased':args.phased, 193 | 'phaseError':args.phaseError, 194 | 'seed':args.seed 195 | } 196 | 197 | 198 | # Assign pars for each simulation 199 | dg_train = Simulator(**dg_params) 200 | dg_vali = Simulator(**dg_params) 201 | dg_test = Simulator(**dg_params) 202 | 203 | 204 | ## Dump simulation pars for use with parametric bootstrap 205 | simParsFILE=os.path.join(networkDir,"simPars.p") 206 | with open(simParsFILE, "wb") as fOUT: 207 | dg_params["bn"]=os.path.basename(args.vcf).replace(".vcf","") 208 | pickle.dump(dg_params,fOUT) 209 | 210 | 211 | ## Simulate data 212 | print("Training set:") 213 | dg_train.simulateAndProduceTrees(numReps=args.nTrain,direc=trainDir,simulator="msprime",nProc=nProc) 214 | print("Validation set:") 215 | dg_vali.simulateAndProduceTrees(numReps=args.nVali,direc=valiDir,simulator="msprime",nProc=nProc) 216 | print("Test set:") 217 | dg_test.simulateAndProduceTrees(numReps=args.nTest,direc=testDir,simulator="msprime",nProc=nProc) 218 | print("\nSIMULATIONS FINISHED!\n") 219 | 220 | 221 | ## Count number of segregating sites in simulation 222 | SS=[] 223 | maxSegSites = 0 224 | minSegSites = float("inf") 225 | for ds in [trainDir,valiDir,testDir]: 226 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 227 | SS.extend(DsInfoDir["segSites"]) 228 | segSitesInDs = max(DsInfoDir["segSites"]) 229 | segSitesInDsMin = min(DsInfoDir["segSites"]) 230 | maxSegSites = max(maxSegSites,segSitesInDs) 231 | minSegSites = min(minSegSites,segSitesInDsMin) 232 | 233 | 234 | ## Compare counts of segregating sites between simulations and input VCF 235 | print("SANITY CHECK") 236 | print("====================") 237 | print("numSegSites\t\t\tMin\tMean\tMax") 238 | print("Simulated:\t\t\t%s\t%s\t%s" %(minSegSites, int(sum(SS)/float(len(SS))), maxSegSites)) 239 | for i in range(len(wins)): 240 | print("InputVCF %s:\t\t%s\t%s\t%s" %(wins[i][0],wins[i][3],wins[i][4],wins[i][5])) 241 | print("\n\n***ReLERNN_SIMULATE.py FINISHED!***\n") 242 | 243 | 244 | if __name__ == "__main__": 245 | main() 246 | -------------------------------------------------------------------------------- /ReLERNN/simulator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Jared Galloway, Jeff Adrion 3 | 4 | ''' 5 | 6 | from ReLERNN.imports import * 7 | from ReLERNN.helpers import * 8 | 9 | MAX_SEED = int(2 ** 32 - 1) # maximum allowed seed in msprime 10 | 11 | class Simulator(object): 12 | ''' 13 | 14 | The simulator class is a framework for running N simulations 15 | using Either msprime (coalescent) or SLiM (forward-moving) 16 | in parallel using python's multithreading package. 17 | 18 | With Specified parameters, the class Simulator() populates 19 | a directory with training, validation, and testing datasets. 20 | It stores the the treeSequences resulting from each simulation 21 | in a subdirectory respectfully labeled 'i.trees' where i is the 22 | i^th simulation. 23 | 24 | Included with each dataset this class produces an info.p 25 | in the subdirectory. 
This uses pickle to store a dictionary 26 | containing all the information for each simulation including the random 27 | target parameter which will be extracted for training. 28 | 29 | ''' 30 | 31 | def __init__(self, 32 | N = 2, 33 | Ne = 1e2, 34 | priorLowsRho = 0.0, 35 | priorLowsMu = 0.0, 36 | priorHighsRho = 1e-7, 37 | priorHighsMu = 1e-8, 38 | ChromosomeLength = 1e5, 39 | MspDemographics = None, 40 | winMasks = None, 41 | mdMask = None, 42 | maskThresh = 1.0, 43 | phased = None, 44 | phaseError = None, 45 | hotspots = False, 46 | nHotWins = 10, 47 | seed = None 48 | ): 49 | 50 | self.N = N 51 | self.Ne = Ne 52 | self.priorLowsRho = priorLowsRho 53 | self.priorHighsRho = priorHighsRho 54 | self.priorLowsMu = priorLowsMu 55 | self.priorHighsMu = priorHighsMu 56 | self.ChromosomeLength = ChromosomeLength 57 | self.MspDemographics = MspDemographics 58 | self.rho = None 59 | self.hotWin = None 60 | self.mu = None 61 | self.segSites = None 62 | self.winMasks = winMasks 63 | self.mdMask = mdMask 64 | self.maskThresh = maskThresh 65 | self.phased = phased 66 | self.phaseError = phaseError 67 | self.hotspots = hotspots 68 | self.nHotWins = nHotWins 69 | self.seed = seed 70 | 71 | 72 | if self.seed: 73 | os.environ['PYTHONHASHSEED']=str(self.seed) 74 | random.seed(self.seed) 75 | np.random.seed(self.seed) 76 | 77 | 78 | def runOneMsprimeSim(self,simNum,direc): 79 | ''' 80 | run one msprime simulation and put the corresponding treeSequence in treesOutputFilePath 81 | 82 | (str,float,float)->None 83 | ''' 84 | 85 | MR = self.mu[simNum] 86 | RR = self.rho[simNum] 87 | SEED = self.seed[simNum] 88 | os.environ['PYTHONHASHSEED']=str(SEED) 89 | random.seed(SEED) 90 | np.random.seed(SEED) 91 | 92 | if self.hotspots: 93 | hotspotMultiplier = self.hotWin[simNum] 94 | 95 | mapName = str(simNum) + "_map.txt" 96 | mapPath = os.path.join(direc,mapName) 97 | 98 | nWins = self.nHotWins 99 | hotSpotWin = np.random.randint(nWins) 100 | 101 | winRates = np.empty(nWins) 102 | 103 | breaks = np.linspace(0,self.ChromosomeLength, num = nWins + 1) 104 | with open(mapPath, "w") as fOUT: 105 | fOUT.write("Chromosome\tstartPos\tRate\n") 106 | for i in range(len(breaks)): 107 | if i == hotSpotWin: 108 | baseRate = RR * hotspotMultiplier * 10**8 109 | winRates[i] = baseRate 110 | elif i == nWins: 111 | baseRate = 0.0 112 | else: 113 | baseRate = RR * 10**8 114 | winRates[i] = baseRate 115 | fOUT.write("{}\t{}\t{}\n".format("chr",int(breaks[i]),baseRate)) 116 | 117 | recomb_map = msp.RecombinationMap.read_hapmap(mapPath) 118 | 119 | if self.MspDemographics: 120 | DE = self.MspDemographics["demographic_events"] 121 | PC = self.MspDemographics["population_configurations"] 122 | MM = self.MspDemographics["migration_matrix"] 123 | ts = msp.simulate( 124 | random_seed=SEED, 125 | mutation_rate=MR, 126 | population_configurations = PC, 127 | migration_matrix = MM, 128 | demographic_events = DE, 129 | recombination_map = recomb_map 130 | ) 131 | 132 | else: 133 | ts = msp.simulate( 134 | random_seed = SEED, 135 | sample_size = self.N, 136 | Ne = self.Ne, 137 | mutation_rate=MR, 138 | recombination_map = recomb_map 139 | ) 140 | 141 | else: 142 | if self.MspDemographics: 143 | DE = self.MspDemographics["demographic_events"] 144 | PC = self.MspDemographics["population_configurations"] 145 | MM = self.MspDemographics["migration_matrix"] 146 | ts = msp.simulate( 147 | random_seed=SEED, 148 | length=self.ChromosomeLength, 149 | mutation_rate=MR, 150 | recombination_rate=RR, 151 | population_configurations = PC, 152 | migration_matrix 
= MM, 153 | demographic_events = DE 154 | ) 155 | else: 156 | ts = msp.simulate( 157 | random_seed = SEED, 158 | sample_size = self.N, 159 | Ne = self.Ne, 160 | length=self.ChromosomeLength, 161 | mutation_rate=MR, 162 | recombination_rate=RR 163 | ) 164 | 165 | # Convert tree sequence to genotype matrix, and position matrix 166 | H = ts.genotype_matrix() 167 | P = np.array([s.position for s in ts.sites()],dtype='float32') 168 | 169 | # "Unphase" genotypes 170 | if not self.phased: 171 | np.random.shuffle(np.transpose(H)) 172 | 173 | # Simulate phasing error 174 | if self.phaseError: 175 | H = self.phaseErrorer(H,self.phaseError) 176 | 177 | # If there is a missing data mask, sample from the mask and apply to haps 178 | if not self.mdMask is None: 179 | mdMask = self.mdMask[np.random.choice(self.mdMask.shape[0], H.shape[0], replace=True)] 180 | H = np.ma.masked_array(H, mask=mdMask) 181 | H = np.ma.filled(H,2) 182 | 183 | # Sample from the genome-wide distribution of masks and mask both positions and genotypes 184 | if self.winMasks: 185 | while True: 186 | rand_mask = self.winMasks[random.randint(0,len(self.winMasks)-1)] 187 | if rand_mask[0] < self.maskThresh: 188 | break 189 | if rand_mask[0] > 0.0: 190 | H,P = self.maskGenotypes(H, P, rand_mask) 191 | 192 | # Dump 193 | Hname = str(simNum) + "_haps.npy" 194 | Hpath = os.path.join(direc,Hname) 195 | Pname = str(simNum) + "_pos.npy" 196 | Ppath = os.path.join(direc,Pname) 197 | np.save(Hpath,H) 198 | np.save(Ppath,P) 199 | 200 | # Return number of sites 201 | return H.shape[0] 202 | 203 | 204 | def maskGenotypes(self, H, P, rand_mask): 205 | """ 206 | Return the genotype and position matrices where masked sites have been removed 207 | """ 208 | mask_wins = np.array(rand_mask[1]) 209 | mask_wins = np.reshape(mask_wins, 2 * mask_wins.shape[0]) 210 | mask = np.digitize(P, mask_wins) % 2 == 0 211 | return H[mask], P[mask] 212 | 213 | 214 | def phaseErrorer(self, H, rate): 215 | """ 216 | Returns the genotype matrix where some fraction of sites have shuffled samples 217 | """ 218 | H_shuf = copy.deepcopy(H) 219 | np.random.shuffle(np.transpose(H_shuf)) 220 | H_mask = np.random.choice([True,False], H.shape[0], p = [1-rate,rate]) 221 | H_mask = np.repeat(H_mask, H.shape[1]) 222 | H_mask = H_mask.reshape(H.shape) 223 | return np.where(H_mask,H,H_shuf) 224 | 225 | 226 | def simulateAndProduceTrees(self,direc,numReps,simulator,nProc=1): 227 | ''' 228 | determine which simulator to use then populate 229 | 230 | (str,str) -> None 231 | ''' 232 | 233 | if self.hotspots: 234 | self.hotWin=np.zeros(numReps) 235 | for i in range(int(numReps/2.0)): 236 | randomTargetParameter = np.random.uniform(50,50) 237 | self.hotWin[i] = randomTargetParameter 238 | for i in range(int(numReps/2.0),numReps): 239 | randomTargetParameter = np.random.uniform(1,1) 240 | self.hotWin[i] = randomTargetParameter 241 | 242 | self.rho=np.empty(numReps) 243 | for i in range(numReps): 244 | randomTargetParameter = np.random.uniform(self.priorLowsRho,self.priorHighsRho) 245 | self.rho[i] = randomTargetParameter 246 | 247 | self.mu=np.empty(numReps) 248 | for i in range(numReps): 249 | randomTargetParameter = np.random.uniform(self.priorLowsMu,self.priorHighsMu) 250 | self.mu[i] = randomTargetParameter 251 | 252 | if self.seed is None: 253 | self.seed=np.repeat(self.seed, numReps) 254 | else: 255 | self.seed=np.random.randint(0, MAX_SEED, size=(numReps,)) 256 | 257 | try: 258 | assert((simulator=='msprime') | (simulator=='SLiM')) 259 | except: 260 | print("Sorry, only 'msprime' & 
'SLiM' are supported simulators") 261 | exit() 262 | 263 | #Pretty straitforward, create the directory passed if it doesn't exits 264 | if not os.path.exists(direc): 265 | print("directory '",direc,"' does not exist, creating it") 266 | os.makedirs(direc) 267 | 268 | # partition data for multiprocessing 269 | mpID = range(numReps) 270 | task_q = mp.JoinableQueue() 271 | result_q = mp.Queue() 272 | params=[simulator, direc] 273 | 274 | # do the work 275 | print("Simulate...") 276 | pids = create_procs(nProc, task_q, result_q, params, self.worker_simulate) 277 | assign_task(mpID, task_q, nProc) 278 | try: 279 | task_q.join() 280 | except KeyboardInterrupt: 281 | print("KeyboardInterrupt") 282 | sys.exit(0) 283 | 284 | self.segSites=np.empty(numReps,dtype="int64") 285 | for i in range(result_q.qsize()): 286 | item = result_q.get() 287 | self.segSites[item[0]]=item[1] 288 | 289 | self.__dict__["numReps"] = numReps 290 | infofile = open(os.path.join(direc,"info.p"),"wb") 291 | pickle.dump(self.__dict__,infofile) 292 | infofile.close() 293 | 294 | for p in pids: 295 | p.terminate() 296 | return None 297 | 298 | 299 | def worker_simulate(self, task_q, result_q, params): 300 | while True: 301 | try: 302 | mpID, nth_job = task_q.get() 303 | #unpack parameters 304 | simulator, direc = params 305 | for i in mpID: 306 | result_q.put([i,self.runOneMsprimeSim(i,direc)]) 307 | finally: 308 | task_q.task_done() 309 | -------------------------------------------------------------------------------- /ReLERNN/manager.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Jeff Adrion 3 | 4 | ''' 5 | 6 | from ReLERNN.imports import * 7 | from ReLERNN.helpers import * 8 | 9 | class Manager(object): 10 | ''' 11 | 12 | The manager class is a framework for handling both VCFs and masks 13 | and can multi-process many of the functions orginally found in ReLERNN_SIMULATE 14 | 15 | ''' 16 | 17 | 18 | def __init__(self, 19 | vcf = None, 20 | pool = None, 21 | chromosomes = None, 22 | mask = None, 23 | winSizeMx = None, 24 | forceWinSize = None, 25 | forceDiploid = None, 26 | vcfDir = None, 27 | poolDir = None, 28 | projectDir = None, 29 | networkDir = None, 30 | seed = None 31 | ): 32 | 33 | self.vcf = vcf 34 | self.pool = pool 35 | self.chromosomes = chromosomes 36 | self.mask = mask 37 | self.winSizeMx = winSizeMx 38 | self.forceWinSize = forceWinSize 39 | self.forceDiploid = forceDiploid 40 | self.vcfDir = vcfDir 41 | self.poolDir = poolDir 42 | self.projectDir = projectDir 43 | self.networkDir = networkDir 44 | self.seed = seed 45 | 46 | 47 | if self.seed: 48 | os.environ['PYTHONHASHSEED']=str(self.seed) 49 | random.seed(self.seed) 50 | np.random.seed(self.seed) 51 | 52 | 53 | def splitVCF(self,nProc=1): 54 | ''' 55 | split the vcf into seperate files by chromosome 56 | ''' 57 | # partition for multiprocessing 58 | mpID = range(len(self.chromosomes)) 59 | task_q = mp.JoinableQueue() 60 | result_q = mp.Queue() 61 | params=self.vcfDir, self.vcf, self.chromosomes 62 | 63 | # do the work 64 | pids = create_procs(nProc, task_q, result_q, params, self.worker_splitVCF) 65 | assign_task(mpID, task_q, nProc) 66 | try: 67 | task_q.join() 68 | except KeyboardInterrupt: 69 | print("KeyboardInterrupt") 70 | sys.exit(0) 71 | 72 | return None 73 | 74 | 75 | def worker_splitVCF(self, task_q, result_q, params): 76 | while True: 77 | try: 78 | mpID, nth_job = task_q.get() 79 | vcfDir, vcf, chroms = params 80 | for i in mpID: 81 | chrom = chroms[i].split(":")[0] 82 | start = 
int(chroms[i].split(":")[1].split("-")[0])+1 83 | end = int(chroms[i].split(":")[1].split("-")[1])+1 84 | splitVCF=os.path.join(vcfDir, os.path.basename(vcf).replace(".vcf","_%s.vcf" %(chroms[i]))) 85 | print("Split chromosome: %s..." %(chrom)) 86 | with open(vcf, "r") as fIN, open(splitVCF, "w") as fOUT: 87 | for line in fIN: 88 | if line.startswith("#"): 89 | fOUT.write(line) 90 | if line.startswith("%s\t" %(chrom)): 91 | pos = int(line.split()[1]) 92 | if start <= pos <= end: 93 | fOUT.write(line) 94 | print("Converting %s to HDF5..." %(splitVCF)) 95 | h5FILE=splitVCF.replace(".vcf",".hdf5") 96 | allel.vcf_to_hdf5(splitVCF,h5FILE,fields="*",overwrite=True) 97 | os.system("rm %s" %(splitVCF)) 98 | finally: 99 | task_q.task_done() 100 | 101 | 102 | def splitPOOL(self,nProc=1): 103 | ''' 104 | split the pool file into seperate files by chromosome 105 | ''' 106 | # partition for multiprocessing 107 | mpID = range(len(self.chromosomes)) 108 | task_q = mp.JoinableQueue() 109 | result_q = mp.Queue() 110 | params=self.poolDir, self.pool, self.chromosomes 111 | 112 | # do the work 113 | pids = create_procs(nProc, task_q, result_q, params, self.worker_splitPOOL) 114 | assign_task(mpID, task_q, nProc) 115 | try: 116 | task_q.join() 117 | except KeyboardInterrupt: 118 | print("KeyboardInterrupt") 119 | sys.exit(0) 120 | 121 | return None 122 | 123 | 124 | def worker_splitPOOL(self, task_q, result_q, params): 125 | while True: 126 | try: 127 | mpID, nth_job = task_q.get() 128 | poolDir, pool, chroms = params 129 | for i in mpID: 130 | chrom = chroms[i].split(":")[0] 131 | start = int(chroms[i].split(":")[1].split("-")[0])+1 132 | end = int(chroms[i].split(":")[1].split("-")[1])+1 133 | splitPOOL=os.path.join(poolDir, os.path.basename(pool).replace(".pool","_%s.pool" %(chroms[i]))) 134 | print("Split chromosome: %s..." 
%(chrom)) 135 | with open(pool, "r") as fIN, open(splitPOOL, "w") as fOUT: 136 | for line in fIN: 137 | if line.startswith("%s\t" %(chrom)): 138 | pos = int(line.split()[1]) 139 | if start <= pos <= end: 140 | fOUT.write(line) 141 | finally: 142 | task_q.task_done() 143 | 144 | 145 | def countSites(self, nProc=1): 146 | # partition for multiprocessing 147 | mpID = range(len(self.chromosomes)) 148 | task_q = mp.JoinableQueue() 149 | result_q = mp.Queue() 150 | params=self.chromosomes 151 | 152 | # do the work 153 | pids = create_procs(nProc, task_q, result_q, params, self.worker_countSites) 154 | assign_task(mpID, task_q, nProc) 155 | try: 156 | task_q.join() 157 | except KeyboardInterrupt: 158 | print("KeyboardInterrupt") 159 | sys.exit(0) 160 | 161 | wins = [] 162 | for i in range(result_q.qsize()): 163 | item = result_q.get() 164 | wins.append(item) 165 | 166 | nSamps,maxS,maxLen = [],0,0 167 | sorted_wins = [] 168 | winFILE=os.path.join(self.networkDir,"windowSizes.txt") 169 | with open(winFILE, "w") as fOUT: 170 | for chrom in self.chromosomes: 171 | for win in wins: 172 | if win[0] == chrom: 173 | maxS = max(maxS,win[4]) 174 | maxLen = max(maxLen,win[2]) 175 | nSamps.append(win[1]) 176 | sorted_wins.append(win) 177 | fOUT.write("\t".join([str(x) for x in win])+"\n") 178 | if len(set(nSamps)) != 1: 179 | print("\nError: chromosomes have different sample sizes!\n") 180 | print("chromosome\t\tnum_samples (-9 when n varies between samples)") 181 | for chrom in self.chromosomes: 182 | for win in wins: 183 | if win[0] == chrom: 184 | print("%s\t\t%s"%(chrom.split(":")[0],win[1])) 185 | print("\nAll samples can be treated as 'diploids with missing data' by rerunning with the option `--forceDiploid`, however this is probably a bad idea (see README.md).") 186 | sys.exit(1) 187 | 188 | return sorted_wins, nSamps[0], maxS, maxLen 189 | 190 | 191 | def worker_countSites(self, task_q, result_q, params): 192 | while True: 193 | try: 194 | mpID, nth_job = task_q.get() 195 | chromosomes = params 196 | for i in mpID: 197 | h5FILE=os.path.join(self.vcfDir, os.path.basename(self.vcf).replace(".vcf","_%s.hdf5" %(chromosomes[i]))) 198 | print("""Reading HDF5: "%s"...""" %(h5FILE)) 199 | callset=h5py.File(h5FILE, mode="r") 200 | var=allel.VariantChunkedTable(callset["variants"],names=["CHROM","POS"], index="POS") 201 | chroms=var["CHROM"] 202 | pos=var["POS"] 203 | genos=allel.GenotypeChunkedArray(callset["calldata"]["GT"]) 204 | GT=genos.to_haplotypes() 205 | diploid_check=[] 206 | for n in range(1,len(genos[0]),2): 207 | GTB=GT[:,n:n+1] 208 | if np.unique(GTB).shape[0] == 1 and np.unique(GTB)[0] == -1: 209 | diploid_check.append(0) 210 | else: 211 | diploid_check.append(1) 212 | if 1 in diploid_check or self.forceDiploid: 213 | GT=np.array(GT) 214 | nSamps=len(genos[0])*2 215 | else: 216 | nSamps=len(genos[0]) 217 | GT=GT[:,::2] #Select only the first of the genotypes 218 | if 0 in diploid_check and 1 in diploid_check and not self.forceDiploid: 219 | print("\nError: Both haploid and diploid samples present in %s!"%(chromosomes[i].split(":")[0])) 220 | nSamps=-9 221 | 222 | ## if there is any missing data write a missing data boolean mask to hdf5 223 | md_mask = GT < 0 224 | if md_mask.any(): 225 | md_maskFile=os.path.join(self.vcfDir, os.path.basename(self.vcf).replace(".vcf","_%s_md_mask.hdf5" %(chromosomes[i]))) 226 | with h5py.File(md_maskFile, "w") as hf: 227 | hf.create_dataset("mask", data=md_mask) 228 | 229 | ## Find best window size 230 | if self.forceWinSize != 0: 231 | ip = 
force_win_size(self.forceWinSize,pos) 232 | result_q.put([chromosomes[i],nSamps,ip[0],ip[1],ip[2],ip[3],ip[4]]) 233 | else: 234 | lo, hi = 0, round(int(chromosomes[i].split(":")[-1].split("-")[-1]),-3) 235 | D = hi - lo 236 | target = lo + int((hi - lo)/2.0) 237 | while D > 10: 238 | ip = find_win_size(target,pos,self.winSizeMx) 239 | if len(ip) != 5: 240 | if ip[0] < 0: 241 | hi = target 242 | if ip[0] > 0: 243 | lo = target 244 | target = lo + int((hi - lo)/2.0) 245 | else: 246 | break 247 | D = hi - lo 248 | ip = force_win_size(round(target, -3), pos) 249 | result_q.put([chromosomes[i],nSamps,ip[0],ip[1],ip[2],ip[3],ip[4]]) 250 | finally: 251 | task_q.task_done() 252 | 253 | 254 | def countSitesPOOL(self, samD=0, nProc=1): 255 | # partition for multiprocessing 256 | mpID = range(len(self.chromosomes)) 257 | task_q = mp.JoinableQueue() 258 | result_q = mp.Queue() 259 | params=self.chromosomes 260 | 261 | # do the work 262 | pids = create_procs(nProc, task_q, result_q, params, self.worker_countSitesPOOL) 263 | assign_task(mpID, task_q, nProc) 264 | try: 265 | task_q.join() 266 | except KeyboardInterrupt: 267 | print("KeyboardInterrupt") 268 | sys.exit(0) 269 | 270 | wins = [] 271 | for i in range(result_q.qsize()): 272 | item = result_q.get() 273 | wins.append(item) 274 | 275 | nSamps,maxS,maxLen = [],0,0 276 | sorted_wins = [] 277 | winFILE=os.path.join(self.networkDir,"windowSizes.txt") 278 | with open(winFILE, "w") as fOUT: 279 | for chrom in self.chromosomes: 280 | for win in wins: 281 | if win[0] == chrom: 282 | maxS = max(maxS,win[3]) 283 | maxLen = max(maxLen,win[1]) 284 | win.insert(1,samD) 285 | nSamps.append(samD) 286 | sorted_wins.append(win) 287 | fOUT.write("\t".join([str(x) for x in win])+"\n") 288 | return sorted_wins, nSamps[0], maxS, maxLen 289 | 290 | 291 | def worker_countSitesPOOL(self, task_q, result_q, params): 292 | while True: 293 | try: 294 | mpID, nth_job = task_q.get() 295 | chromosomes = params 296 | for i in mpID: 297 | pos = [] 298 | poolFILE=os.path.join(self.poolDir, os.path.basename(self.pool).replace(".pool","_%s.pool" %(chromosomes[i]))) 299 | print("poolFILE:",poolFILE) 300 | with open(poolFILE, "r") as fIN: 301 | for line in fIN: 302 | pos.append(int(line.split()[1])) 303 | pos=np.array(pos) 304 | 305 | ## Find best window size 306 | if self.forceWinSize != 0: 307 | ip = force_win_size(self.forceWinSize,pos) 308 | result_q.put([chromosomes[i],ip[0],ip[1],ip[2],ip[3],ip[4]]) 309 | else: 310 | lo, hi = 0, round(int(chromosomes[i].split(":")[-1].split("-")[-1]),-3) 311 | D = hi - lo 312 | target = lo + int((hi - lo)/2.0) 313 | while D > 10: 314 | ip = find_win_size(target,pos,self.winSizeMx) 315 | if len(ip) != 5: 316 | if ip[0] < 0: 317 | hi = target 318 | if ip[0] > 0: 319 | lo = target 320 | target = lo + int((hi - lo)/2.0) 321 | else: 322 | break 323 | D = hi - lo 324 | ip = force_win_size(round(target, -2), pos) 325 | result_q.put([chromosomes[i],ip[0],ip[1],ip[2],ip[3],ip[4]]) 326 | finally: 327 | task_q.task_done() 328 | 329 | 330 | def maskWins(self, wins=None, maxLen=None, nProc=1): 331 | ## Read accessability mask 332 | print("\nAccessibility mask found: calculating the proportion of the genome that is masked...") 333 | genome = [x[0].split(":")[0] for x in wins] 334 | mask={} 335 | with open(self.mask, "r") as fIN: 336 | for line in fIN: 337 | ar = line.split() 338 | try: 339 | if int(ar[1]) < mask[ar[0]][-1][1]: 340 | print("Error: positions in accessibility mask are required to be non-overlapping and ascending!") 341 | sys.exit(1) 342 | else: 
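# chromosome seen before and this interval starts at or after the previous one ended,
# so simply extend that chromosome's running list of [start, end] intervals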
343 | mask[ar[0]].append([int(pos) for pos in ar[1:]]) 344 | except KeyError: 345 | if ar[0] in genome: 346 | mask[ar[0]] = [[int(pos) for pos in ar[1:]]] 347 | 348 | ## Combine genomic windows 349 | genomic_wins = [] 350 | for win in wins: 351 | win_chrom = win[0] 352 | win_len = win[2] 353 | win_ct = win[6] 354 | start = 0 355 | for i in range(win_ct): 356 | genomic_wins.append([win_chrom, start, win_len]) 357 | start += win_len 358 | 359 | # partition for multiprocessing 360 | mpID = range(len(genomic_wins)) 361 | task_q = mp.JoinableQueue() 362 | result_q = mp.Queue() 363 | params=genomic_wins, mask, maxLen 364 | 365 | # do the work 366 | pids = create_procs(nProc, task_q, result_q, params, self.worker_maskWins) 367 | assign_task(mpID, task_q, nProc) 368 | try: 369 | task_q.join() 370 | except KeyboardInterrupt: 371 | print("KeyboardInterrupt") 372 | sys.exit(0) 373 | 374 | masks = [] 375 | for i in range(result_q.qsize()): 376 | item = result_q.get() 377 | masks.append(item) 378 | 379 | mask_fraction, win_masks = [], [] 380 | for mask in masks: 381 | mask_fraction.append(mask[0]) 382 | win_masks.append(mask) 383 | 384 | mean_mask_fraction = sum(mask_fraction)/float(len(mask_fraction)) 385 | print("{}% of genome inaccessible\n".format(round(mean_mask_fraction * 100,1))) 386 | return mean_mask_fraction, win_masks 387 | 388 | 389 | def worker_maskWins(self, task_q, result_q, params): 390 | while True: 391 | try: 392 | mpID, nth_job = task_q.get() 393 | genomic_wins, mask, maxLen = params 394 | last_win = 0 395 | last_chrom = genomic_wins[0][0].split(":")[0] 396 | for i in mpID: 397 | if genomic_wins[i][0].split(":")[0] != last_chrom: 398 | last_win = 0 399 | last_chrom = genomic_wins[i][0].split(":")[0] 400 | M = maskStats(genomic_wins[i], last_win, mask, maxLen) 401 | last_win = M[2] 402 | result_q.put(M) 403 | finally: 404 | task_q.task_done() 405 | 406 | -------------------------------------------------------------------------------- /ReLERNN/sequenceBatchGenerator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Authors: Jared Galloway, Jeff Adrion 3 | ''' 4 | 5 | from ReLERNN.imports import * 6 | 7 | class SequenceBatchGenerator(tf.keras.utils.Sequence): 8 | 9 | ''' 10 | This class, SequenceBatchGenerator, extends tf.keras.utils.Sequence. 11 | So as to multithread the batch preparation in tandum with network training 12 | for maximum effeciency on the hardware provided. 13 | 14 | It generated batches of genotype matrices from a given .trees directory 15 | (which is generated most effeciently from the Simulator class) 16 | which have been prepped according to the given parameters. 17 | 18 | It also offers a range of data prepping heuristics as well as normalizing 19 | the targets. 
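A minimal, illustrative instantiation (the values below are placeholders rather than
the settings ReLERNN_TRAIN actually passes in, and treesDirectory is assumed to have
been populated by Simulator so that info.p and the *_haps.npy / *_pos.npy files exist):

    gen = SequenceBatchGenerator(treesDirectory='./example_output/train',
                                 batchSize=64, maxLen=1750, frameWidth=5,
                                 shuffleInds=True)
    [haps, pos], targets = gen[0]  # padded genotype/position tensors, z-scored rho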
20 | 21 | def __getitem__(self, idx): 22 | 23 | def __data_generation(self, batchTreeIndices): 24 | 25 | ''' 26 | 27 | #Initialize the member variables which largely determine the data prepping heuristics 28 | #in addition to the .trees directory containing the data from which to generate the batches 29 | def __init__(self, 30 | treesDirectory, 31 | targetNormalization = 'zscore', 32 | batchSize=64, 33 | maxLen=None, 34 | frameWidth=0, 35 | center=False, 36 | shuffleInds=False, 37 | sortInds=False, 38 | ancVal = -1, 39 | padVal = -1, 40 | derVal = 1, 41 | realLinePos = True, 42 | posPadVal = 0, 43 | shuffleExamples = True, 44 | splitFLAG = False, 45 | seqD = None, 46 | maf = None, 47 | hotspots = False, 48 | seed = None 49 | ): 50 | 51 | self.treesDirectory = treesDirectory 52 | self.targetNormalization = targetNormalization 53 | infoFilename = os.path.join(self.treesDirectory,"info.p") 54 | self.infoDir = pickle.load(open(infoFilename,"rb")) 55 | self.batch_size = batchSize 56 | self.maxLen = maxLen 57 | self.frameWidth = frameWidth 58 | self.center = center 59 | self.shuffleInds = shuffleInds 60 | self.sortInds=sortInds 61 | self.ancVal = ancVal 62 | self.padVal = padVal 63 | self.derVal = derVal 64 | self.realLinePos = realLinePos 65 | self.posPadVal = posPadVal 66 | self.indices = np.arange(self.infoDir["numReps"]) 67 | self.shuffleExamples = shuffleExamples 68 | self.splitFLAG = splitFLAG 69 | self.seqD = seqD 70 | self.maf = maf 71 | self.hotspots = hotspots 72 | self.seed = seed 73 | 74 | if self.seed: 75 | os.environ['PYTHONHASHSEED']=str(self.seed) 76 | random.seed(self.seed) 77 | np.random.seed(self.seed) 78 | 79 | if(targetNormalization != None): 80 | if self.hotspots: 81 | self.normalizedTargets = self.normalizeTargetsBinaryClass() 82 | else: 83 | self.normalizedTargets = self.normalizeTargets() 84 | 85 | if(shuffleExamples): 86 | np.random.shuffle(self.indices) 87 | 88 | def sort_min_diff(self,amat): 89 | '''this function takes in a SNP matrix with indv on rows and returns the same matrix with indvs sorted by genetic similarity. 90 | this problem is NP, so here we use a nearest neighbors approx. it's not perfect, but it's fast and generally performs ok. 
91 | assumes your input matrix is a numpy array''' 92 | 93 | mb = NearestNeighbors(len(amat), metric='manhattan').fit(amat) 94 | v = mb.kneighbors(amat) 95 | smallest = np.argmin(v[0].sum(axis=1)) 96 | return amat[v[1][smallest]] 97 | 98 | def pad_HapsPos(self,haplotypes,positions,maxSNPs=None,frameWidth=0,center=False): 99 | ''' 100 | pads the haplotype and positions tensors 101 | to be uniform with the largest tensor 102 | ''' 103 | 104 | haps = haplotypes 105 | pos = positions 106 | 107 | #Normalize the shape of all haplotype vectors with padding 108 | for i in range(len(haps)): 109 | numSNPs = haps[i].shape[0] 110 | paddingLen = maxSNPs - numSNPs 111 | if(center): 112 | prior = paddingLen // 2 113 | post = paddingLen - prior 114 | haps[i] = np.pad(haps[i],((prior,post),(0,0)),"constant",constant_values=2.0) 115 | pos[i] = np.pad(pos[i],(prior,post),"constant",constant_values=-1.0) 116 | 117 | else: 118 | if(paddingLen < 0): 119 | haps[i] = np.pad(haps[i],((0,0),(0,0)),"constant",constant_values=2.0)[:paddingLen] 120 | pos[i] = np.pad(pos[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 121 | else: 122 | haps[i] = np.pad(haps[i],((0,paddingLen),(0,0)),"constant",constant_values=2.0) 123 | pos[i] = np.pad(pos[i],(0,paddingLen),"constant",constant_values=-1.0) 124 | 125 | haps = np.array(haps,dtype='float32') 126 | pos = np.array(pos,dtype='float32') 127 | 128 | if(frameWidth): 129 | fw = frameWidth 130 | haps = np.pad(haps,((0,0),(fw,fw),(fw,fw)),"constant",constant_values=2.0) 131 | pos = np.pad(pos,((0,0),(fw,fw)),"constant",constant_values=-1.0) 132 | 133 | return haps,pos 134 | 135 | def padAlleleFqs(self,haplotypes,positions,maxSNPs=None,frameWidth=0,center=False): 136 | ''' 137 | convert haps to allele frequencies, normalize, and 138 | pad the haplotype and positions tensors 139 | to be uniform with the largest tensor 140 | ''' 141 | 142 | haps = haplotypes 143 | positions = positions 144 | fqs, pos = [], [] 145 | 146 | # Resample to sequencing depth and convert to allele frequencies 147 | for i in range(len(haps)): 148 | tmp_freqs = [] 149 | tmp_pos = [] 150 | fqs_list = haps[i].tolist() 151 | for j in range(len(fqs_list)): 152 | 153 | if self.seqD != -9: 154 | ## Resample 155 | z = resample(fqs_list[j], n_samples=self.seqD, replace=True) 156 | raw_freq = round(np.count_nonzero(z)/float(len(z)),3) 157 | if self.maf <= raw_freq < 1.0: 158 | tmp_freqs.append(raw_freq) 159 | tmp_pos.append(positions[i][j]) 160 | else: 161 | ## Don't resample 162 | raw_freq = round(np.count_nonzero(fqs_list[j])/float(len(fqs_list[j])),3) 163 | tmp_freqs.append(raw_freq) 164 | tmp_pos.append(positions[i][j]) 165 | 166 | fqs.append(np.array(tmp_freqs)) 167 | pos.append(np.array(tmp_pos)) 168 | 169 | # Normalize 170 | fqs = self.normalizeAlleleFqs(fqs) 171 | 172 | # Pad 173 | for i in range(len(fqs)): 174 | numSNPs = fqs[i].shape[0] 175 | paddingLen = maxSNPs - numSNPs 176 | if(center): 177 | prior = paddingLen // 2 178 | post = paddingLen - prior 179 | fqs[i] = np.pad(fqs[i],(prior,post),"constant",constant_values=-1.0) 180 | pos[i] = np.pad(pos[i],(prior,post),"constant",constant_values=-1.0) 181 | 182 | else: 183 | if(paddingLen < 0): 184 | fqs[i] = np.pad(fqs[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 185 | pos[i] = np.pad(pos[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 186 | else: 187 | fqs[i] = np.pad(fqs[i],(0,paddingLen),"constant",constant_values=-1.0) 188 | pos[i] = np.pad(pos[i],(0,paddingLen),"constant",constant_values=-1.0) 189 | 190 | fqs = 
np.array(fqs,dtype='float32') 191 | pos = np.array(pos,dtype='float32') 192 | 193 | if(frameWidth): 194 | fw = frameWidth 195 | fqs = np.pad(fqs,((0,0),(fw,fw)),"constant",constant_values=-1.0) 196 | pos = np.pad(pos,((0,0),(fw,fw)),"constant",constant_values=-1.0) 197 | 198 | return fqs,pos 199 | 200 | def normalizeTargets(self): 201 | 202 | ''' 203 | We want to normalize all targets. 204 | ''' 205 | 206 | norm = self.targetNormalization 207 | nTargets = copy.deepcopy(self.infoDir['rho']) 208 | 209 | if(norm == 'zscore'): 210 | tar_mean = np.mean(nTargets,axis=0) 211 | tar_sd = np.std(nTargets,axis=0) 212 | nTargets -= tar_mean 213 | nTargets = np.divide(nTargets,tar_sd,out=np.zeros_like(nTargets),where=tar_sd!=0) 214 | 215 | elif(norm == 'divstd'): 216 | tar_sd = np.std(nTargets,axis=0) 217 | nTargets = np.divide(nTargets,tar_sd,out=np.zeros_like(nTargets),where=tar_sd!=0) 218 | 219 | return nTargets 220 | 221 | def normalizeTargetsBinaryClass(self): 222 | 223 | ''' 224 | We want to normalize all targets. 225 | ''' 226 | 227 | norm = self.targetNormalization 228 | nTargets = copy.deepcopy(self.infoDir['hotWin']) 229 | 230 | nTargets[nTargets<5] = 0 231 | nTargets[nTargets>=5] = 1 232 | 233 | return nTargets.astype(np.uint8) 234 | 235 | def normalizeAlleleFqs(self, fqs): 236 | 237 | ''' 238 | normalize the allele frequencies for the batch 239 | ''' 240 | 241 | norm = self.targetNormalization 242 | 243 | if(norm == 'zscore'): 244 | allVals = np.concatenate([a.flatten() for a in fqs]) 245 | fqs_mean = np.mean(allVals) 246 | fqs_sd = np.std(allVals) 247 | for i in range(len(fqs)): 248 | fqs[i] = np.subtract(fqs[i],fqs_mean) 249 | fqs[i] = np.divide(fqs[i],fqs_sd,out=np.zeros_like(fqs[i]),where=fqs_sd!=0) 250 | 251 | elif(norm == 'divstd'): 252 | allVals = np.concatenate([a.flatten() for a in fqs]) 253 | fqs_sd = np.std(allVals) 254 | for i in range(len(fqs)): 255 | fqs[i] = np.divide(fqs[i],fqs_sd,out=np.zeros_like(fqs[i]),where=fqs_sd!=0) 256 | 257 | return fqs 258 | 259 | def on_epoch_end(self): 260 | 261 | if(self.shuffleExamples): 262 | np.random.shuffle(self.indices) 263 | 264 | def __len__(self): 265 | 266 | return int(np.floor(self.infoDir["numReps"]/self.batch_size)) 267 | 268 | def __getitem__(self, idx): 269 | 270 | indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size] 271 | X, y = self.__data_generation(indices) 272 | return X,y 273 | 274 | def shuffleIndividuals(self,x): 275 | 276 | t = np.arange(x.shape[1]) 277 | np.random.shuffle(t) 278 | return x[:,t] 279 | 280 | def __data_generation(self, batchTreeIndices): 281 | 282 | haps = [] 283 | pos = [] 284 | for treeIndex in batchTreeIndices: 285 | Hfilepath = os.path.join(self.treesDirectory,str(treeIndex) + "_haps.npy") 286 | Pfilepath = os.path.join(self.treesDirectory,str(treeIndex) + "_pos.npy") 287 | H = np.load(Hfilepath) 288 | P = np.load(Pfilepath) 289 | haps.append(H) 290 | pos.append(P) 291 | respectiveNormalizedTargets = [[t] for t in self.normalizedTargets[batchTreeIndices]] 292 | targets = np.array(respectiveNormalizedTargets) 293 | 294 | if(self.realLinePos): 295 | for p in range(len(pos)): 296 | pos[p] = pos[p] / self.infoDir["ChromosomeLength"] 297 | 298 | if(self.sortInds): 299 | for i in range(len(haps)): 300 | haps[i] = np.transpose(self.sort_min_diff(np.transpose(haps[i]))) 301 | 302 | if(self.shuffleInds): 303 | for i in range(len(haps)): 304 | haps[i] = self.shuffleIndividuals(haps[i]) 305 | 306 | if self.seqD: 307 | # simulate pool-sequencing 308 | if(self.maxLen != None): 309 | # convert the 
haps to allele frequecies and then pad 310 | haps,pos = self.padAlleleFqs(haps,pos, 311 | maxSNPs=self.maxLen, 312 | frameWidth=self.frameWidth, 313 | center=self.center) 314 | 315 | haps=np.where(haps == -1.0, self.posPadVal,haps) 316 | pos=np.where(pos == -1.0, self.posPadVal,pos) 317 | z = np.stack((haps,pos), axis=-1) 318 | 319 | return z, targets 320 | else: 321 | if(self.maxLen != None): 322 | # pad 323 | haps,pos = self.pad_HapsPos(haps,pos, 324 | maxSNPs=self.maxLen, 325 | frameWidth=self.frameWidth, 326 | center=self.center) 327 | 328 | pos=np.where(pos == -1.0, self.posPadVal,pos) 329 | haps=np.where(haps < 1.0, self.ancVal, haps) 330 | haps=np.where(haps > 1.0, self.padVal, haps) 331 | haps=np.where(haps == 1.0, self.derVal, haps) 332 | 333 | return [haps,pos], targets 334 | 335 | 336 | class VCFBatchGenerator(tf.keras.utils.Sequence): 337 | """Basically same as SequenceBatchGenerator Class except for VCF files""" 338 | def __init__(self, 339 | INFO, 340 | CHROM, 341 | winLen, 342 | numWins, 343 | IDs, 344 | GT, 345 | POS, 346 | batchSize=64, 347 | maxLen=None, 348 | frameWidth=0, 349 | center=False, 350 | sortInds=False, 351 | ancVal = -1, 352 | padVal = -1, 353 | derVal = 1, 354 | realLinePos = True, 355 | posPadVal = 0, 356 | phase=None, 357 | seed=None 358 | ): 359 | 360 | self.INFO=INFO 361 | self.CHROM=CHROM 362 | self.winLen=winLen 363 | self.numWins=numWins 364 | self.indices=np.arange(self.numWins) 365 | self.IDs=IDs 366 | self.GT=GT 367 | self.POS=POS 368 | self.batch_size = batchSize 369 | self.maxLen = maxLen 370 | self.frameWidth = frameWidth 371 | self.center = center 372 | self.sortInds=sortInds 373 | self.ancVal = ancVal 374 | self.padVal = padVal 375 | self.derVal = derVal 376 | self.realLinePos = realLinePos 377 | self.posPadVal = posPadVal 378 | self.phase=phase 379 | self.seed=seed 380 | 381 | if self.seed: 382 | os.environ['PYTHONHASHSEED']=str(self.seed) 383 | random.seed(self.seed) 384 | np.random.seed(self.seed) 385 | 386 | 387 | def pad_HapsPosVCF(self,haplotypes,positions,maxSNPs=None,frameWidth=0,center=False): 388 | ''' 389 | pads the haplotype and positions tensors 390 | to be uniform with the largest tensor 391 | ''' 392 | 393 | haps = haplotypes 394 | pos = positions 395 | 396 | nSNPs=[] 397 | 398 | #Normalize the shape of all haplotype vectors with padding 399 | for i in range(len(haps)): 400 | numSNPs = haps[i].shape[0] 401 | nSNPs.append(numSNPs) 402 | paddingLen = maxSNPs - numSNPs 403 | if(center): 404 | prior = paddingLen // 2 405 | post = paddingLen - prior 406 | haps[i] = np.pad(haps[i],((prior,post),(0,0)),"constant",constant_values=2.0) 407 | pos[i] = np.pad(pos[i],(prior,post),"constant",constant_values=-1.0) 408 | 409 | else: 410 | haps[i] = np.pad(haps[i],((0,paddingLen),(0,0)),"constant",constant_values=2.0) 411 | pos[i] = np.pad(pos[i],(0,paddingLen),"constant",constant_values=-1.0) 412 | 413 | haps = np.array(haps,dtype='float32') 414 | pos = np.array(pos,dtype='float32') 415 | 416 | if(frameWidth): 417 | fw = frameWidth 418 | haps = np.pad(haps,((0,0),(fw,fw),(fw,fw)),"constant",constant_values=2.0) 419 | pos = np.pad(pos,((0,0),(fw,fw)),"constant",constant_values=-1.0) 420 | return haps,pos,nSNPs 421 | 422 | 423 | def __getitem__(self, idx): 424 | 425 | indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size] 426 | X, nSNPs = self.__data_generation(indices) 427 | 428 | return X, self.CHROM, self.winLen, self.INFO, nSNPs 429 | 430 | 431 | def __data_generation(self, indices): 432 | 433 | if self.seed: 434 | 
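# re-seeding on every call makes the genotype shuffling below (used to "unphase" the data)
# deterministic for a fixed seed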
os.environ['PYTHONHASHSEED']=str(self.seed) 435 | random.seed(self.seed) 436 | np.random.seed(self.seed) 437 | 438 | #def __getitem__(self, idx): 439 | genos=self.GT 440 | GT=self.GT.to_haplotypes() 441 | diploid_check=[] 442 | for n in range(1,len(genos[0]),2): 443 | GTB=GT[:,n:n+1] 444 | if np.unique(GTB).shape[0] == 1 and np.unique(GTB)[0] == -1: 445 | diploid_check.append(0) 446 | else: 447 | diploid_check.append(1) 448 | break 449 | if 1 in diploid_check: 450 | GT=np.array(GT) 451 | else: 452 | GT=GT[:,::2] #Select only the first of the genotypes 453 | GT = np.where(GT == -1, 2, GT) # Code missing data as 2, these will ultimately end up being transformed to the pad value 454 | 455 | if not self.phase: 456 | np.random.shuffle(np.transpose(GT)) 457 | 458 | haps,pos=[],[] 459 | for i in indices: 460 | haps.append(GT[self.IDs[i][0]:self.IDs[i][1]]) 461 | pos.append(self.POS[self.IDs[i][0]:self.IDs[i][1]]) 462 | 463 | if(self.realLinePos): 464 | for i in range(len(pos)): 465 | pos[i] = (pos[i]-(self.winLen*indices[i])) / self.winLen 466 | 467 | if(self.sortInds): 468 | for i in range(len(haps)): 469 | haps[i] = np.transpose(sort_min_diff(np.transpose(haps[i]))) 470 | 471 | if(self.maxLen != None): 472 | haps,pos,nSNPs = self.pad_HapsPosVCF(haps,pos, 473 | maxSNPs=self.maxLen, 474 | frameWidth=self.frameWidth, 475 | center=self.center) 476 | 477 | pos=np.where(pos == -1.0, self.posPadVal,pos) 478 | haps=np.where(haps < 1.0, self.ancVal, haps) 479 | haps=np.where(haps > 1.0, self.padVal, haps) 480 | haps=np.where(haps == 1.0, self.derVal, haps) 481 | 482 | return [haps,pos], nSNPs 483 | 484 | 485 | class POOLBatchGenerator(tf.keras.utils.Sequence): 486 | """Basically same as SequenceBatchGenerator Class except for POOL files""" 487 | def __init__(self, 488 | INFO, 489 | CHROM, 490 | winLen, 491 | numWins, 492 | IDs, 493 | GT, 494 | POS, 495 | batchSize=64, 496 | maxLen=None, 497 | frameWidth=0, 498 | center=False, 499 | sortInds=False, 500 | ancVal = -1, 501 | padVal = -1, 502 | derVal = 1, 503 | realLinePos = True, 504 | posPadVal = 0, 505 | normType = 'zscore', 506 | seed = None 507 | ): 508 | 509 | self.INFO=INFO 510 | self.normType = normType 511 | self.CHROM=CHROM 512 | self.winLen=winLen 513 | self.numWins=numWins 514 | self.indices=np.arange(self.numWins) 515 | self.IDs=IDs 516 | self.GT=GT 517 | self.POS=POS 518 | self.batch_size = batchSize 519 | self.maxLen = maxLen 520 | self.frameWidth = frameWidth 521 | self.center = center 522 | self.sortInds=sortInds 523 | self.ancVal = ancVal 524 | self.padVal = padVal 525 | self.derVal = derVal 526 | self.realLinePos = realLinePos 527 | self.posPadVal = posPadVal 528 | self.seed = seed 529 | 530 | if self.seed: 531 | os.environ['PYTHONHASHSEED']=str(self.seed) 532 | random.seed(self.seed) 533 | np.random.seed(self.seed) 534 | 535 | def padFqs(self,haplotypes,positions,maxSNPs=None,frameWidth=0,center=False): 536 | ''' 537 | normalize, and pad the haplotype and positions tensors 538 | to be uniform with the largest tensor 539 | ''' 540 | 541 | fqs = haplotypes 542 | pos = positions 543 | 544 | # Normalize 545 | fqs = self.normalizeAlleleFqs(fqs) 546 | 547 | nSNPs=[] 548 | # Pad 549 | for i in range(len(fqs)): 550 | numSNPs = fqs[i].shape[0] 551 | nSNPs.append(numSNPs) 552 | paddingLen = maxSNPs - numSNPs 553 | if(center): 554 | prior = paddingLen // 2 555 | post = paddingLen - prior 556 | fqs[i] = np.pad(fqs[i],(prior,post),"constant",constant_values=-1.0) 557 | pos[i] = np.pad(pos[i],(prior,post),"constant",constant_values=-1.0) 558 | 559 | 
else: 560 | if(paddingLen < 0): 561 | fqs[i] = np.pad(fqs[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 562 | pos[i] = np.pad(pos[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 563 | else: 564 | fqs[i] = np.pad(fqs[i],(0,paddingLen),"constant",constant_values=-1.0) 565 | pos[i] = np.pad(pos[i],(0,paddingLen),"constant",constant_values=-1.0) 566 | 567 | fqs = np.array(fqs,dtype='float32') 568 | pos = np.array(pos,dtype='float32') 569 | 570 | if(frameWidth): 571 | fw = frameWidth 572 | fqs = np.pad(fqs,((0,0),(fw,fw)),"constant",constant_values=-1.0) 573 | pos = np.pad(pos,((0,0),(fw,fw)),"constant",constant_values=-1.0) 574 | 575 | return fqs,pos,nSNPs 576 | 577 | 578 | def normalizeAlleleFqs(self, fqs): 579 | 580 | ''' 581 | normalize the allele frequencies for the batch 582 | ''' 583 | 584 | norm = self.normType 585 | 586 | if(norm == 'zscore'): 587 | allVals = np.concatenate([a.flatten() for a in fqs]) 588 | fqs_mean = np.mean(allVals) 589 | fqs_sd = np.std(allVals) 590 | for i in range(len(fqs)): 591 | fqs[i] = np.subtract(fqs[i],fqs_mean) 592 | fqs[i] = np.divide(fqs[i],fqs_sd,out=np.zeros_like(fqs[i]),where=fqs_sd!=0) 593 | 594 | elif(norm == 'divstd'): 595 | allVals = np.concatenate([a.flatten() for a in fqs]) 596 | fqs_sd = np.std(allVals) 597 | for i in range(len(fqs)): 598 | fqs[i] = np.divide(fqs[i],fqs_sd,out=np.zeros_like(fqs[i]),where=fqs_sd!=0) 599 | 600 | return fqs 601 | 602 | 603 | def __getitem__(self, idx): 604 | 605 | indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size] 606 | X, nSNPs = self.__data_generation(indices) 607 | 608 | return X, self.CHROM, self.winLen, self.INFO, nSNPs 609 | 610 | 611 | def __data_generation(self, indices): 612 | 613 | if self.seed: 614 | os.environ['PYTHONHASHSEED']=str(self.seed) 615 | random.seed(self.seed) 616 | np.random.seed(self.seed) 617 | 618 | GT=self.GT 619 | 620 | haps,pos=[],[] 621 | for i in indices: 622 | haps.append(GT[self.IDs[i][0]:self.IDs[i][1]]) 623 | pos.append(self.POS[self.IDs[i][0]:self.IDs[i][1]]) 624 | 625 | if(self.realLinePos): 626 | for i in range(len(pos)): 627 | pos[i] = (pos[i]-(self.winLen*indices[i])) / self.winLen 628 | 629 | if(self.sortInds): 630 | for i in range(len(haps)): 631 | haps[i] = np.transpose(sort_min_diff(np.transpose(haps[i]))) 632 | 633 | # pad the allele freqs and positions 634 | if(self.maxLen != None): 635 | haps,pos,nSNPs = self.padFqs(haps,pos, 636 | maxSNPs=self.maxLen, 637 | frameWidth=self.frameWidth, 638 | center=self.center) 639 | 640 | haps=np.where(haps == -1.0, self.posPadVal,haps) 641 | pos=np.where(pos == -1.0, self.posPadVal,pos) 642 | np.set_printoptions(threshold=sys.maxsize) 643 | z = np.stack((haps,pos), axis=-1) 644 | 645 | return z, nSNPs 646 | 647 | 648 | -------------------------------------------------------------------------------- /ReLERNN/helpers.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | Authors: Jared Galloway, Jeff Adrion 4 | ''' 5 | 6 | from ReLERNN.imports import * 7 | 8 | #------------------------------------------------------------------------------------------- 9 | 10 | def assign_task(mpID, task_q, nProcs): 11 | c,i,nth_job=0,0,1 12 | while (i+1)*nProcs <= len(mpID): 13 | i+=1 14 | nP1=nProcs-(len(mpID)%nProcs) 15 | for j in range(nP1): 16 | task_q.put((mpID[c:c+i], nth_job)) 17 | nth_job += 1 18 | c=c+i 19 | for j in range(nProcs-nP1): 20 | task_q.put((mpID[c:c+i+1], nth_job)) 21 | nth_job += 1 22 | c=c+i+1 23 | 24 | 
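#-------------------------------------------------------------------------------------------

# A minimal, self-contained sketch (not part of ReLERNN itself) of how the task-queue
# helpers in this module are wired together by Manager and Simulator: assign_task() chunks
# a range of job indices onto a JoinableQueue, create_procs() starts daemon workers that
# drain it, and the caller joins the queue before collecting results and terminating the
# workers. It relies on the `mp` alias imported at the top of this module; the names
# `_example_worker` and `_example_run` are hypothetical and appear nowhere else in the
# package. Call _example_run() from inside an `if __name__ == "__main__":` guard if your
# platform uses the "spawn" start method.

def _example_worker(task_q, result_q, params):
    offset = params
    while True:
        try:
            mpID, nth_job = task_q.get()
            for i in mpID:
                result_q.put([i, i + offset])  # any per-index computation goes here
        finally:
            task_q.task_done()

def _example_run(nJobs=8, nProc=2):
    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    pids = create_procs(nProc, task_q, result_q, 100, _example_worker)
    assign_task(range(nJobs), task_q, nProc)
    task_q.join()
    results = [result_q.get() for _ in range(result_q.qsize())]
    for p in pids:
        p.terminate()
    return results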
#------------------------------------------------------------------------------------------- 25 | 26 | def create_procs(nProcs, task_q, result_q, params, worker): 27 | pids = [] 28 | for _ in range(nProcs): 29 | p = mp.Process(target=worker, args=(task_q, result_q, params)) 30 | p.daemon = True 31 | p.start() 32 | pids.append(p) 33 | return pids 34 | 35 | #------------------------------------------------------------------------------------------- 36 | 37 | def get_corrected_index(L,N): 38 | idx,outN="","" 39 | dist=float("inf") 40 | for i in range(len(L)): 41 | D=abs(N-L[i]) 42 | if D < dist: 43 | idx=i 44 | outN=L[i] 45 | dist=D 46 | return [idx,outN] 47 | 48 | #------------------------------------------------------------------------------------------- 49 | 50 | def get_corrected(rate,bs): 51 | idx=get_corrected_index(bs["Q2"],rate) 52 | CI95LO=bs["CI95LO"][idx[0]] 53 | CI95HI=bs["CI95HI"][idx[0]] 54 | cRATE=relu(rate+(bs["rho"][idx[0]]-idx[1])) 55 | ciHI=relu(cRATE+(CI95HI-idx[1])) 56 | ciLO=relu(cRATE+(CI95LO-idx[1])) 57 | return [cRATE,ciLO,ciHI] 58 | 59 | #------------------------------------------------------------------------------------------- 60 | 61 | def get_index(pos, winSize): 62 | y=snps_per_win(pos,winSize) 63 | st=0 64 | indices=[] 65 | for i in range(len(y)): 66 | indices.append([st,st+y[i]]) 67 | st+=y[i] 68 | return indices 69 | 70 | #------------------------------------------------------------------------------------------- 71 | 72 | def snps_per_win(pos, window_size): 73 | bins = np.arange(1, pos.max()+window_size, window_size) #use 1-based coordinates, per VCF standard 74 | y,x = np.histogram(pos,bins=bins) 75 | return y 76 | 77 | #------------------------------------------------------------------------------------------- 78 | 79 | def find_win_size(winSize, pos, winSizeMx): 80 | snpsWin=snps_per_win(pos,winSize) 81 | mn,u,mx = snpsWin.min(), int(snpsWin.mean()), snpsWin.max() 82 | if mx > winSizeMx: 83 | return [-1] 84 | elif mx < winSizeMx: 85 | return [1] 86 | else: 87 | return [winSize,mn,u,mx,len(snpsWin)] 88 | 89 | #------------------------------------------------------------------------------------------- 90 | 91 | def force_win_size(winSize, pos): 92 | snpsWin=snps_per_win(pos,winSize) 93 | mn,u,mx = snpsWin.min(), int(snpsWin.mean()), snpsWin.max() 94 | return [winSize,mn,u,mx,len(snpsWin)] 95 | 96 | #------------------------------------------------------------------------------------------- 97 | 98 | def maskStats(wins, last_win, mask, maxLen): 99 | """ 100 | return a three-element list with the first element being the total proportion of the window that is masked, 101 | the second element being a list of masked positions that are relative to the windown start=0 and the window end = window length, 102 | and the third being the last window before breaking to expidite the next loop 103 | """ 104 | chrom = wins[0].split(":")[0] 105 | a = wins[1] 106 | L = wins[2] 107 | b = a + L 108 | prop = [0.0,[],0] 109 | try: 110 | for i in range(last_win, len(mask[chrom])): 111 | x, y = mask[chrom][i][0], mask[chrom][i][1] 112 | if y < a: 113 | continue 114 | if b < x: 115 | return prop 116 | else: # i.e. 
[a--b] and [x--y] overlap 117 | if a >= x and b <= y: 118 | return [1.0, [[0,maxLen]], i] 119 | elif a >= x and b > y: 120 | win_prop = (y-a)/float(b-a) 121 | prop[0] += win_prop 122 | prop[1].append([0,int(win_prop * maxLen)]) 123 | prop[2] = i 124 | elif b <= y and a < x: 125 | win_prop = (b-x)/float(b-a) 126 | prop[0] += win_prop 127 | prop[1].append([int((1-win_prop)*maxLen),maxLen]) 128 | prop[2] = i 129 | else: 130 | win_prop = (y-x)/float(b-a) 131 | prop[0] += win_prop 132 | prop[1].append([int(((x-a)/float(b-a))*maxLen), int(((y-a)/float(b-a))*maxLen)]) 133 | prop[2] = i 134 | return prop 135 | except KeyError: 136 | return prop 137 | 138 | #------------------------------------------------------------------------------------------- 139 | 140 | def check_demHist(path): 141 | fTypeFlag = -9 142 | with open(path, "r") as fIN: 143 | for line in fIN: 144 | if line.startswith("mutation_per_site"): 145 | fTypeFlag = 1 146 | break 147 | if line.startswith("label"): 148 | fTypeFlag = 2 149 | break 150 | if line.startswith("time_index"): 151 | fTypeFlag = 3 152 | break 153 | return fTypeFlag 154 | 155 | #------------------------------------------------------------------------------------------- 156 | 157 | def convert_msmc_output(results_file, mutation_rate, generation_time): 158 | """ 159 | This function converts the output from msmc into a csv the will be read in for 160 | plotting comparison. 161 | 162 | MSMC outputs times and rates scaled by the mutation rate per basepair per generation. 163 | First, scaled times are given in units of the per-generation mutation rate. 164 | This means that in order to convert scaled times to generations, 165 | divide them by the mutation rate. In humans, we used mu=1e-8 per basepair per generation. 166 | To convert generations into years, multiply by the generation time, for which we used 10 years. 167 | 168 | To get population sizes out of coalescence rates, first take the inverse of the coalescence rate, 169 | scaledPopSize = 1 / lambda00. 
Then divide this scaled population size by 2*mu 170 | """ 171 | outfile = results_file+".csv" 172 | out_fp = open(outfile, "w") 173 | in_fp = open(results_file, "r") 174 | header = in_fp.readline() 175 | out_fp.write("label,x,y\n") 176 | for line in in_fp: 177 | result = line.split() 178 | time = float(result[1]) 179 | time_generation = time / mutation_rate 180 | time_years = time_generation * generation_time 181 | lambda00 = float(result[3]) 182 | scaled_pop_size = 1 / lambda00 183 | size = scaled_pop_size / (2*mutation_rate) 184 | out_fp.write(f"pop0,{time_years},{size}\n") 185 | out_fp.close 186 | return None 187 | 188 | #------------------------------------------------------------------------------------------- 189 | 190 | def convert_demHist(path, nSamps, gen, fType, mu): 191 | swp, PC, DE = [],[],[] 192 | # Convert stairwayplot to msp demographic_events 193 | if fType == 1: 194 | with open(path, "r") as fIN: 195 | flag=0 196 | lCt=0 197 | for line in fIN: 198 | if flag == 1: 199 | if lCt % 2 == 0: 200 | swp.append(line.split()) 201 | lCt+=1 202 | if line.startswith("mutation_per_site"): 203 | flag=1 204 | N0 = int(float(swp[0][6])) 205 | for i in range(len(swp)): 206 | if i == 0: 207 | PC.append(msp.PopulationConfiguration(sample_size=nSamps, initial_size=N0)) 208 | else: 209 | DE.append(msp.PopulationParametersChange(time=int(float(swp[i][5])/float(gen)), initial_size=int(float(swp[i][6])), population=0)) 210 | ## Convert MSMC to similar format to smc++ 211 | if fType == 3: 212 | convert_msmc_output(path, mu, gen) 213 | path+=".csv" 214 | ## Convert smc++ or MSMC results to msp demographic_events 215 | if fType == 2 or fType == 3: 216 | with open(path, "r") as fIN: 217 | fIN.readline() 218 | for line in fIN: 219 | ar=line.split(",") 220 | swp.append([int(float(ar[1])/gen),int(float(ar[2]))]) 221 | N0 = swp[0][1] 222 | for i in range(len(swp)): 223 | if i == 0: 224 | PC.append(msp.PopulationConfiguration(sample_size=nSamps, initial_size=N0)) 225 | else: 226 | DE.append(msp.PopulationParametersChange(time=swp[i][0], initial_size=swp[i][1], population=0)) 227 | dd=msp.DemographyDebugger(population_configurations=PC, 228 | demographic_events=DE) 229 | print("Simulating under the following population size history:") 230 | dd.print_history() 231 | MspD = {"population_configurations" : PC, 232 | "migration_matrix" : None, 233 | "demographic_events" : DE} 234 | if MspD: 235 | return MspD 236 | else: 237 | print("Error in converting demographic history file.") 238 | sys.exit(1) 239 | 240 | #------------------------------------------------------------------------------------------- 241 | 242 | def relu(x): 243 | return max(0,x) 244 | 245 | #------------------------------------------------------------------------------------------- 246 | 247 | def zscoreTargets(self): 248 | norm = self.targetNormalization 249 | nTargets = copy.deepcopy(self.infoDir['y']) 250 | if(norm == 'zscore'): 251 | tar_mean = np.mean(nTargets,axis=0) 252 | tar_sd = np.std(nTargets,axis=0) 253 | nTargets -= tar_mean 254 | nTargets = np.divide(nTargets,tar_sd,out=np.zeros_like(nTargets),where=tar_sd!=0) 255 | 256 | #------------------------------------------------------------------------------------------- 257 | 258 | def load_and_predictVCF(VCFGenerator, 259 | resultsFile=None, 260 | network=None, 261 | chromStr=None, 262 | minS = 50, 263 | numWins = None, 264 | batchSize = None, 265 | gpuID = 0, 266 | hotspots = False): 267 | 268 | if hotspots: 269 | print("Error: hotspot detection under construction") 270 | sys.exit(1) 
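# The remainder of this function: pin TensorFlow to the requested GPU, allow GPU memory
# growth via the tf.compat.v1 Session workaround, rebuild the trained model from its JSON
# architecture plus the saved best weights, then write one BED-style row per genomic window.
# Each prediction is back-transformed from z-score space as relu(sd * pred + u), where u and
# sd are the mean and standard deviation of the simulated training rho values, and windows
# with fewer than minS segregating sites are skipped.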
271 | 272 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpuID) 273 | 274 | ## The following code block appears necessary for running with tf2 and cudnn 275 | from tensorflow.compat.v1 import ConfigProto 276 | from tensorflow.compat.v1 import Session 277 | config = ConfigProto() 278 | config.gpu_options.allow_growth = True 279 | Session(config=config) 280 | ### 281 | 282 | # load json and create model 283 | if(network != None): 284 | jsonFILE = open(network[0],"r") 285 | loadedModel = jsonFILE.read() 286 | jsonFILE.close() 287 | model=model_from_json(loadedModel) 288 | model.load_weights(network[1]) 289 | else: 290 | print("Error: no pretrained network found!") 291 | sys.exit(1) 292 | 293 | num_batches = int(np.ceil(numWins / batchSize)) 294 | 295 | with open(resultsFile, "w") as fOUT: 296 | ct=0 297 | last = int(chromStr.split(":")[-1].split("-")[-1]) 298 | fOUT.write("\t".join([str(head) for head in ["chrom","start","end","nSites","recombRate"]])+"\n") 299 | for i in range(num_batches): 300 | X,chrom,win,info,nSNPs = VCFGenerator.__getitem__(i) 301 | predictions = model.predict(X) 302 | u=np.mean(info["rho"]) 303 | sd=np.std(info["rho"]) 304 | for j in range(len(predictions)): 305 | if nSNPs[j] >= minS: 306 | fOUT.write("%s\t%s\t%s\t%s\t%s\n" %(chrom,ct,min(ct+win,last),nSNPs[j],relu(sd*predictions[j][0]+u))) 307 | ct+=win 308 | 309 | return None 310 | 311 | #------------------------------------------------------------------------------------------- 312 | 313 | def runModels(ModelFuncPointer, 314 | ModelName, 315 | TrainDir, 316 | TrainGenerator, 317 | ValidationGenerator, 318 | TestGenerator, 319 | resultsFile=None, 320 | numEpochs=10, 321 | epochSteps=100, 322 | validationSteps=1, 323 | network=None, 324 | nCPU = 1, 325 | gpuID = 0): 326 | 327 | 328 | os.environ["CUDA_VISIBLE_DEVICES"]=str(gpuID) 329 | 330 | ## The following code block appears necessary for running with tf2 and cudnn 331 | from tensorflow.compat.v1 import ConfigProto 332 | from tensorflow.compat.v1 import Session 333 | config = ConfigProto() 334 | config.gpu_options.allow_growth = True 335 | Session(config=config) 336 | ### 337 | 338 | if(resultsFile == None): 339 | 340 | resultsFilename = os.path.basename(trainFile)[:-4] + ".p" 341 | resultsFile = os.path.join("./results/",resultsFilename) 342 | 343 | x,y = TrainGenerator.__getitem__(0) 344 | model = ModelFuncPointer(x,y) 345 | 346 | # Early stopping and saving the best weights 347 | callbacks_list = [ 348 | EarlyStopping( 349 | monitor='val_loss', 350 | verbose=1, 351 | min_delta=0.01, 352 | patience=100), 353 | ModelCheckpoint( 354 | filepath=network[1], 355 | monitor='val_loss', 356 | save_best_only=True), 357 | TerminateOnNaN() 358 | ] 359 | 360 | if nCPU > 1: 361 | history = model.fit(TrainGenerator, 362 | steps_per_epoch=epochSteps, 363 | epochs=numEpochs, 364 | validation_data=ValidationGenerator, 365 | callbacks=callbacks_list, 366 | use_multiprocessing=True, 367 | max_queue_size=nCPU, 368 | workers=nCPU) 369 | else: 370 | history = model.fit(TrainGenerator, 371 | steps_per_epoch=epochSteps, 372 | epochs=numEpochs, 373 | validation_data=ValidationGenerator, 374 | callbacks=callbacks_list, 375 | use_multiprocessing=False) 376 | 377 | # Write the network 378 | if(network != None): 379 | ##serialize model to JSON 380 | model_json = model.to_json() 381 | with open(network[0], "w") as json_file: 382 | json_file.write(model_json) 383 | 384 | # Load json and create model 385 | if(network != None): 386 | jsonFILE = open(network[0],"r") 387 | loadedModel = jsonFILE.read() 
388 | jsonFILE.close() 389 | model=model_from_json(loadedModel) 390 | model.load_weights(network[1]) 391 | else: 392 | print("Error: model and weights not loaded") 393 | sys.exit(1) 394 | 395 | x,y = TestGenerator.__getitem__(0) 396 | predictions = model.predict(x) 397 | 398 | history.history['loss'] = np.array(history.history['loss']) 399 | history.history['val_loss'] = np.array(history.history['val_loss']) 400 | history.history['predictions'] = np.array(predictions) 401 | history.history['Y_test'] = np.array(y) 402 | history.history['name'] = ModelName 403 | 404 | print("results written to: ",resultsFile) 405 | pickle.dump(history.history, open( resultsFile, "wb" )) 406 | 407 | return None 408 | 409 | #------------------------------------------------------------------------------------------- 410 | 411 | #def indicesGenerator(batchSize,numReps): 412 | # ''' 413 | # Generate indices randomly from range (0,numReps) in batches of size batchSize 414 | # without replacement. 415 | # 416 | # This is for the batch generator to randomly choose trees from a directory 417 | # but make sure 418 | # ''' 419 | # availableIndices = np.arange(numReps) 420 | # np.random.shuffle(availableIndices) 421 | # ci = 0 422 | # while 1: 423 | # if((ci+batchSize) > numReps): 424 | # ci = 0 425 | # np.random.shuffle(availableIndices) 426 | # batchIndices = availableIndices[ci:ci+batchSize] 427 | # ci = ci+batchSize 428 | # 429 | # yield batchIndices 430 | 431 | #------------------------------------------------------------------------------------------- 432 | 433 | def getHapsPosLabels(direc,simulator,shuffle=False): 434 | ''' 435 | loops through a trees directory created by the data generator class 436 | and returns the repsective genotype matrices, positions, and labels 437 | ''' 438 | haps = [] 439 | positions = [] 440 | infoFilename = os.path.join(direc,"info.p") 441 | infoDict = pickle.load(open(infoFilename,"rb")) 442 | labels = infoDict["y"] 443 | 444 | #how many trees files are in this directory. 
445 |     li = os.listdir(direc)
446 |     numReps = len(li) - 1 #minus one for the 'info.p' file
447 | 
448 |     for i in range(numReps):
449 |         filename = str(i) + ".trees"
450 |         filepath = os.path.join(direc,filename)
451 |         treeSequence = msp.load(filepath)
452 |         haps.append(treeSequence.genotype_matrix())
453 |         positions.append(np.array([s.position for s in treeSequence.sites()]))
454 | 
455 | 
456 |     haps = np.array(haps)
457 |     positions = np.array(positions)
458 | 
459 |     return haps,positions,labels
460 | 
461 | #-------------------------------------------------------------------------------------------
462 | 
463 | def simplifyTreeSequenceOnSubSampleSet_stub(ts,numSamples):
464 |     '''
465 |     This function should take in a tree sequence, generate
466 |     a subset the size of numSamples, and return the tree sequence simplified on
467 |     that subset of individuals
468 |     '''
469 | 
470 |     ts = ts.simplify() #is this necessary?
471 |     inds = [ind.id for ind in ts.individuals()]
472 |     sample_subset = np.sort(np.random.choice(inds,numSamples,replace=False))
473 |     sample_nodes = []
474 |     for i in sample_subset:
475 |         ind = ts.individual(i)
476 |         sample_nodes.append(ind.nodes[0])
477 |         sample_nodes.append(ind.nodes[1])
478 | 
479 |     ts = ts.simplify(sample_nodes)
480 | 
481 |     return ts
482 | 
483 | #-------------------------------------------------------------------------------------------
484 | 
485 | def sort_min_diff(amat):
486 |     '''this function takes in a SNP matrix with indv on rows and returns the same matrix with indvs sorted by genetic similarity.
487 |     this problem is NP, so here we use a nearest neighbors approx. it's not perfect, but it's fast and generally performs ok.
488 |     assumes your input matrix is a numpy array'''
489 | 
490 |     mb = NearestNeighbors(n_neighbors=len(amat), metric='manhattan').fit(amat)
491 |     v = mb.kneighbors(amat)
492 |     smallest = np.argmin(v[0].sum(axis=1))
493 |     return amat[v[1][smallest]]
494 | 
495 | #-------------------------------------------------------------------------------------------
496 | 
497 | def mutateTrees(treesDirec,outputDirec,muLow,muHigh,numMutsPerTree=1,simulator="msprime"):
498 |     '''
499 |     read in .trees files from treesDirec, mutate each tree numMutsPerTree separate times
500 |     using a mutation rate pulled from a uniform distribution between muLow and muHigh
501 | 
502 |     also, re-write the info file to reflect the new labels and replicate count.
503 |     '''
504 |     if(numMutsPerTree > 1):
505 |         assert(treesDirec != outputDirec)
506 | 
507 |     if not os.path.exists(outputDirec):
508 |         print("directory '",outputDirec,"' does not exist, creating it")
509 |         os.makedirs(outputDirec)
510 | 
511 |     infoFilename = os.path.join(treesDirec,"info.p")
512 |     infoDict = pickle.load(open(infoFilename,"rb"))
513 |     labels = infoDict["y"]
514 | 
515 |     newLabels = []
516 |     newMaxSegSites = 0
517 | 
518 |     #how many trees files are in this directory.
519 |     li = os.listdir(treesDirec)
520 |     numReps = len(li) - 1 #minus one for the 'info.p' file
521 | 
522 |     for i in range(numReps):
523 |         filename = str(i) + ".trees"
524 |         filepath = os.path.join(treesDirec,filename)
525 |         treeSequence = msp.load(filepath)
526 |         blankTreeSequence = msp.mutate(treeSequence,0)
527 |         rho = labels[i]
528 |         for mut in range(numMutsPerTree):
529 |             simNum = (i*numMutsPerTree) + mut
530 |             simFileName = os.path.join(outputDirec,str(simNum)+".trees")
531 |             mutationRate = np.random.uniform(muLow,muHigh)
532 |             mutatedTreeSequence = msp.mutate(blankTreeSequence,mutationRate)
533 |             mutatedTreeSequence.dump(simFileName)
534 |             newMaxSegSites = max(newMaxSegSites,mutatedTreeSequence.num_sites)
535 |             newLabels.append(rho)
536 | 
537 |     infoCopy = copy.deepcopy(infoDict)
538 |     infoCopy["maxSegSites"] = newMaxSegSites
539 |     if(numMutsPerTree > 1):
540 |         infoCopy["y"] = np.array(newLabels,dtype="float32")
541 |         infoCopy["numReps"] = numReps * numMutsPerTree
542 |     outInfoFilename = os.path.join(outputDirec,"info.p")
543 |     pickle.dump(infoCopy,open(outInfoFilename,"wb"))
544 | 
545 |     return None
546 | 
547 | #-------------------------------------------------------------------------------------------
548 | 
549 | def segSitesStats(treesDirec):
550 |     '''
551 |     DEPRECATED
552 |     '''
553 | 
554 |     infoFilename = os.path.join(treesDirec,"info.p")
555 |     infoDict = pickle.load(open(infoFilename,"rb"))
556 | 
557 |     newLabels = []
558 |     newMaxSegSites = 0
559 | 
560 |     #how many trees files are in this directory.
561 |     li = os.listdir(treesDirec)
562 |     numReps = len(li) - 1 #minus one for the 'info.p' file
563 | 
564 |     segSites = []
565 | 
566 |     for i in range(numReps):
567 |         filename = str(i) + ".trees"
568 |         filepath = os.path.join(treesDirec,filename)
569 |         treeSequence = msp.load(filepath)
570 |         segSites.append(treeSequence.num_sites)
571 | 
572 |     return segSites
573 | 
574 | #-------------------------------------------------------------------------------------------
575 | 
576 | def mae(x,y):
577 |     '''
578 |     Compute mean absolute error between predictions and targets
579 | 
580 |     float[],float[] -> float
581 |     '''
582 |     assert(len(x) == len(y))
583 |     summ = 0.0
584 |     length = len(x)
585 |     for i in range(length):
586 |         summ += abs(x[i] - y[i])
587 |     return summ/length
588 | 
589 | #-------------------------------------------------------------------------------------------
590 | 
591 | def mse(x,y):
592 |     '''
593 |     Compute mean squared error between predictions and targets
594 | 
595 |     float[],float[] -> float
596 |     '''
597 | 
598 |     assert(len(x) == len(y))
599 |     summ = 0.0
600 |     length = len(x)
601 |     for i in range(length):
602 |         summ += (x[i] - y[i])**2
603 |     return summ/length
604 | 
605 | #-------------------------------------------------------------------------------------------
606 | 
607 | def plotResults(resultsFile,saveas):
608 | 
609 |     '''
610 |     plotting code for testing a model on simulation.
611 |     using the resulting pickle file on a training run (resultsFile).
612 |     This function plots the results of the final test set predictions,
613 |     as well as validation loss as a function of Epochs during training.
614 | 615 | ''' 616 | 617 | plt.rc('font', family='serif', serif='Times') 618 | plt.rc('xtick', labelsize=6) 619 | plt.rc('ytick', labelsize=6) 620 | plt.rc('axes', labelsize=6) 621 | 622 | results = pickle.load(open( resultsFile , "rb" )) 623 | 624 | fig,axes = plt.subplots(2,1) 625 | plt.subplots_adjust(hspace=0.5) 626 | 627 | predictions = np.array([float(Y) for Y in results["predictions"]]) 628 | realValues = np.array([float(X) for X in results["Y_test"]]) 629 | 630 | r_2 = round((np.corrcoef(predictions,realValues)[0,1])**2,5) 631 | 632 | mae_0 = round(mae(realValues,predictions),4) 633 | mse_0 = round(mse(realValues,predictions),4) 634 | labels = "$R^{2} = $"+str(r_2)+"\n"+"$mae = $" + str(mae_0)+" | "+"$mse = $" + str(mse_0) 635 | 636 | axes[0].scatter(realValues,predictions,marker = "o", color = 'tab:purple',s=5.0,alpha=0.6) 637 | 638 | lims = [ 639 | np.min([axes[0].get_xlim(), axes[0].get_ylim()]), # min of both axes 640 | np.max([axes[0].get_xlim(), axes[0].get_ylim()]), # max of both axes 641 | ] 642 | axes[0].set_xlim(lims) 643 | axes[0].set_ylim(lims) 644 | axes[0].plot(lims, lims, 'k-', alpha=0.75, zorder=0) 645 | axes[0].set_title(results["name"]+"\n"+labels,fontsize=6) 646 | 647 | lossRowIndex = 1 648 | axes[1].plot(results["loss"],label = "mae loss",color='tab:cyan') 649 | axes[1].plot(results["val_loss"], label= "mae validation loss",color='tab:pink') 650 | 651 | #axes[1].plot(results["mean_squared_error"],label = "mse loss",color='tab:green') 652 | #axes[1].plot(results["val_mean_squared_error"], label= "mse validation loss",color='tab:olive') 653 | 654 | axes[1].legend(frameon = False,fontsize = 6) 655 | axes[1].set_ylabel("mse") 656 | 657 | axes[0].set_ylabel(str(len(predictions))+" msprime predictions") 658 | axes[0].set_xlabel(str(len(realValues))+" msprime real values") 659 | fig.subplots_adjust(left=.15, bottom=.16, right=.85, top=.92,hspace = 0.5,wspace=0.4) 660 | height = 7.00 661 | width = 7.00 662 | 663 | axes[0].grid() 664 | fig.set_size_inches(height, width) 665 | fig.savefig(saveas) 666 | 667 | #------------------------------------------------------------------------------------------- 668 | 669 | def getMeanSDMax(trainDir): 670 | ''' 671 | get the mean and standard deviation of rho from training set 672 | 673 | str -> int,int,int 674 | 675 | ''' 676 | info = pickle.load(open(trainDir+"/info.p","rb")) 677 | rho = info["rho"] 678 | segSites = info["segSites"] 679 | tar_mean = np.mean(rho,axis=0) 680 | tar_sd = np.std(rho,axis=0) 681 | return tar_mean,tar_sd,max(segSites) 682 | 683 | #------------------------------------------------------------------------------------------- 684 | 685 | def unNormalize(mean,sd,data): 686 | ''' 687 | un-zcore-ify. 
do the inverse to get real value predictions 688 | 689 | float,float,float[] -> float[] 690 | ''' 691 | 692 | data *= sd 693 | data += mean ##comment this line out for GRU_TUNED84_RELU 694 | return data 695 | 696 | #------------------------------------------------------------------------------------------- 697 | 698 | def plotParametricBootstrap(results,saveas): 699 | 700 | ''' 701 | Use the location of "out" paramerter to parametric bootstrap 702 | as input to plot the results of said para-boot 703 | ''' 704 | 705 | stats = pickle.load(open(results,'rb')) 706 | x = stats["rho"] 707 | 708 | fig, ax = plt.subplots() 709 | 710 | 711 | for i,s in enumerate(stats): 712 | if(i == 0): 713 | continue 714 | 715 | ax.plot(x,stats[s]) 716 | 717 | lims = [ 718 | np.min([ax.get_xlim(), ax.get_ylim()]), # min of both axes 719 | np.max([ax.get_xlim(), ax.get_ylim()]), # max of both axes 720 | ] 721 | ax.set_xlim(lims) 722 | ax.set_ylim(lims) 723 | ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0) 724 | 725 | fig.savefig(saveas) 726 | 727 | return None 728 | 729 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # *ReLERNN* 2 | ## *Recombination Landscape Estimation using Recurrent Neural Networks* 3 | ==================================================================== 4 | 5 | ReLERNN uses deep learning to infer the genome-wide landscape of recombination from as few as four individually sequenced chromosomes, or from allele frequencies inferred by pooled sequencing. 6 | This repository contains the code and instructions required to run ReLERNN, and includes example files to ensure everything is working properly. The manuscript detailing ReLERNN can be found [here](https://academic.oup.com/mbe/advance-article/doi/10.1093/molbev/msaa038/5741419). 7 | 8 | ## Recommended installation on linux 9 | Install `tensorflow 2` on your system. Directions can be found [here](https://www.tensorflow.org/install). You will also need to install the CUDA toolkit and CuDNN. 10 | ReLERNN requires the use of a CUDA-Enabled NVIDIA GPU. The current version of ReLERNN has been successfully tested with tensorflow/2.2.0, cudatoolkit/10.1.243, and cudnn/7.6.5. 11 | 12 | Further dependencies for ReLERNN can be installed with pip. 13 | This is done with the following commands: 14 | 15 | ```bash 16 | $ git clone https://github.com/kr-colab/ReLERNN.git 17 | $ cd ReLERNN 18 | $ pip install . 19 | ``` 20 | 21 | It should be as simple as that. 22 | 23 | ### Installing `CUDA` 24 | 25 | We are asked often about installing `CUDA` and the NVIDIA requirements. This can be quite finicky depending on your hardware setup, but many users 26 | have had luck installing the `tensorflow`/`cuda` requirements using `mamba` with the following recipe 27 | 28 | ```bash 29 | $ mamba create -n relearnn-1.0.0 -c conda-forge -c nvidia python=3.10 tensorflow=2.15.0 cuda-toolkit h5py -y 30 | # then install ReLERNN as above 31 | $ git clone https://github.com/kr-colab/ReLERNN.git 32 | $ cd ReLERNN 33 | $ pip install . 34 | ``` 35 | 36 | ## Testing ReLERNN 37 | An example VCF file (5 contigs; 10 haploid chromosomes) and a shell script for running ReLERNN's four modules is located in `$/ReLERNN/examples`. 
38 | To test the functionality of ReLERNN simply use the following commands: 39 | 40 | ```bash 41 | $ cd examples 42 | $ ./example_pipeline.sh 43 | ``` 44 | 45 | Provided everything worked as planned, `$ReLERNN/examples/example_output/` should be populated with a few directories along with the files: `example.PREDICT.txt` and `example.PREDICT.BSCORRECT.txt`. 46 | The latter is the finalized output file with your recombination rate predictions and estimates of uncertainty. 47 | 48 | The above example took 57 seconds to complete on a Xeon machine using four CPUs and one NVIDIA 2070 GPU. 49 | Note that the parameters used for this example were designed only to test the success of the installation, not to make accurate predictions. 50 | Please use the guidelines below for the best results when analyzing real data. 51 | 52 | You can now test the functionality of ReLERNN for use with pool-seq data by using the following commands: 53 | 54 | ```bash 55 | $ cd examples 56 | $ ./example_pipeline_pool.sh 57 | ``` 58 | 59 | ## Estimating a recombination landscape from individually sequenced chromosomes 60 | 61 | The ReLERNN pipeline is executed using four commands: `ReLERNN_SIMULATE`, `ReLERNN_TRAIN`, `ReLERNN_PREDICT`, and the optional `ReLERNN_BSCORRECT` (see the [Method flow diagram](./methodFlow.png)). 62 | 63 | ### Before running ReLERNN 64 | ReLERNN takes as input a VCF file of biallelic variants. Users should use appropriate QC techniques (filtering low-quality variants, etc.) and remove non-biallelic variants before running ReLERNN. Small contigs (<< 250 SNPs) should not be included in the genome file `--genome`, though these do not need to be removed from the VCF. 65 | ReLERNN also requires that the number of sampled chromosomes is identical across all contigs, and VCFs should be filtered accordingly. Hemizygous chromosomes or haploid samples in an otherwise diploid dataset 66 | should ideally be run separately using a separate VCF. It is possible to treat hemizygous chromosomes as "diploids with missing data" using the `--forceDiploid` option, however this is not recommended. 67 | It is now possible to run ReLERNN on VCFs with missing genotypes (coded as a `.`). 68 | 69 | If you want to make predictions based on equilibrium simulations, you can skip ahead to executing `ReLERNN_SIMULATE`. 70 | While ReLERNN is generally robust to demographic model misspecification, prediction accuracy may potentially be improved by simulating the training set under a demographic history that accurately matches that of your sample. ReLERNN optionally takes the output files from three popular demographic history inference programs ([stairwayplot_v1](https://sites.google.com/site/jpopgen/stairway-plot), [SMC++](https://github.com/popgenmethods/smcpp), and [MSMC](https://github.com/stschiff/msmc)), and simulates a training set under these histories. Note: for SMC++ use the .csv output (option -c in SMC++). It is up to the user to perform the proper due diligence to ensure that the population size histories reported by these programs are sound. In our opinion, unless you know exactly how these programs work and you expect your data to represent a history dramatically different from equilibrium, you are better off skipping this step and training ReLERNN on equilibrium simulations. Once you have run one of the demographic history inference programs listed above, you simply provide the raw output file from that program to ReLERNN_SIMULATE using the `--demographicHistory` option. 
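As a rough illustration of the pre-filtering and history-aware simulation described above, the sketch below restricts a VCF to biallelic SNPs and then passes the result, together with an inferred demographic history, to `ReLERNN_SIMULATE`. It assumes `bcftools` is installed; `raw.vcf.gz`, `filtered.vcf`, `my_smcpp.csv`, and `./my_project/` are placeholder names, and the parameter values are illustrative only, not recommended settings.

```bash
# Keep only biallelic SNPs (adjust filters to match your own QC pipeline)
bcftools view -m2 -M2 -v snps raw.vcf.gz -O v -o filtered.vcf

# Simulate a training set under an inferred demographic history (illustrative values)
ReLERNN_SIMULATE \
    --vcf filtered.vcf \
    --genome genome.bed \
    --projectDir ./my_project/ \
    --demographicHistory my_smcpp.csv \
    --assumedMu 1e-8 \
    --assumedGenTime 1 \
    --upperRhoThetaRatio 1
```

Remember that for SMC++ the `.csv` output (option `-c`) is the expected input for `--demographicHistory`, as noted above.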
71 | 72 | 73 | ### Step 1) ReLERNN_SIMULATE 74 | `ReLERNN_SIMULATE` reads your VCF file and splits it by chromosome. The chromosomes to be evaluated must be specified by providing a BED file of said positions using the `--genome` argument. A BED-formatted accessibility mask (with non-overlapping ascending windows) may be optionally provided using the `--mask` option. Use the `--phased` or `--unphased` flag to train using phased or unphased genotypes (the default is unphased). It is required that the VCF file use the extension `.vcf`. The prefix of that file will serve as the prefix used for all output files (e.g. running ReLERNN on the file `population7.vcf` will generate the result file `population7.PREDICT.txt`). It is strongly recommended that you use the default setting for `--maxWinSize`, larger values can cause training to fail and smaller values can result in lower accuracy. Users are required to provide an estimate of the per-base mutation rate for your sample, along with an estimate for generation time (in years). If you previously ran one of the demographic history inference programs listed above, just use the same values that you used for them. This is also where you will point to the output from said program, using `--demographicHistory`. If you are not simulating under an inferred history, simply do not include this option. Importantly, you can also set a value for the maximum recombination rate to be simulated using `--upperRhoThetaRatio`. If you have an a priori estimate for an upper bound to the ratio of rho to theta go ahead and set this here. Keep in mind that higher values will dramatically slow the coalescent simulations. We recommend using the default number of train/test/validation simulation examples, but if you want to simulate more examples, go right ahead. `ReLERNN_SIMULATE` then uses msprime to simulate 100k training examples and 1k validation and test examples. All output files will be generated in subdirectories within the path provided to `--projectDir`. It is required that you use the same projectDir for all four ReLERNN commands. If you want to run ReLERNN of multiple populations/taxa, you can run them independently using a unique projectDir for each. This step is simulation heavy and runtimes will strongly depend on the inferred population size. 75 | 76 | The complete list of arguments used in `ReLERNN_SIMULATE` is found below: 77 | ``` 78 | ReLERNN_SIMULATE -h 79 | 80 | usage: ReLERNN_SIMULATE [-h] [-v VCF] [-g GENOME] [-m MASK] [-d OUTDIR] 81 | [-n DEM] [-u MU] [-l GENTIME] [-r UPRTR] [-t NCPU] [-s SEED] 82 | [--phased] [--unphased] [--forceDiploid] [--phaseError PHASEERROR] 83 | [--maxWinSize WINSIZEMX] [--maskThresh MASKTHRESH] 84 | [--nTrain NTRAIN] [--nVali NVALI] [--nTest NTEST] 85 | 86 | optional arguments: 87 | -h, --help show this help message and exit 88 | -v VCF, --vcf VCF Filtered and QC-checked VCF file. Important: Every row 89 | must correspond to a biallelic SNP with no missing 90 | data! 91 | -g GENOME, --genome GENOME 92 | BED-formatted (i.e. zero-based) file corresponding to 93 | chromosomes and positions to consider 94 | -m MASK, --mask MASK BED-formatted file corresponding to inaccessible bases 95 | -d OUTDIR, --projectDir OUTDIR 96 | Directory for all project output. 
NOTE: the same
97 |                         projectDir must be used for all functions of ReLERNN
98 |   -n DEM, --demographicHistory DEM
99 |                         Output file from either stairwayplot, SMC++, or MSMC
100 |   -u MU, --assumedMu MU
101 |                         Assumed per-base mutation rate
102 |   -l GENTIME, --assumedGenTime GENTIME
103 |                         Assumed generation time (in years)
104 |   -r UPRTR, --upperRhoThetaRatio UPRTR
105 |                         Assumed upper bound for the ratio of rho to theta
106 |   -t NCPU, --nCPU NCPU  Number of CPUs to use (defaults to total available cores)
107 |   -s SEED, --seed SEED  Random seed
108 |   --phased              VCF file is phased
109 |   --unphased            VCF file is unphased
110 |   --forceDiploid        Treats all samples as diploids
111 |                         with missing data (bad idea; see README)
112 |   --phaseError PHASEERROR
113 |                         Fraction of bases simulated with incorrect phasing
114 |   --maxWinSize WINSIZEMX
115 |                         Max number of sites per window to train on. Important:
116 |                         too many sites causes problems in training
117 |   --maskThresh MASKTHRESH
118 |                         Discard windows where >= maskThresh percent of sites
119 |                         are inaccessible
120 |   --nTrain NTRAIN       Number of training examples to simulate
121 |   --nVali NVALI         Number of validation examples to simulate
122 |   --nTest NTEST         Number of test examples to simulate
123 | ```
124 | 
125 | 
126 | ### Step 2) ReLERNN_TRAIN
127 | `ReLERNN_TRAIN` takes the simulations created by `ReLERNN_SIMULATE` and uses them to train a recurrent neural network. Again, we recommend using the defaults for `--nEpochs` and `--nValSteps`, but if you would like to do more training, feel free. To set the GPU to be used for machines with multiple dedicated GPUs use `--gpuID` (e.g. if running an analysis on two populations simultaneously, set `--gpuID 0` for the first population and `--gpuID 1` for the second). `ReLERNN_TRAIN` outputs some basic metrics of the training results for you, generating the figure `$/projectDir/networks/vcfprefix.pdf`. The default value of `--nCPU` is 1 for this step, as this often produces the shortest training times per epoch (depending on missing data and the mask). Feel free to test training times using multiple cores, and set `--nCPU` to whatever works best for your data/machine.
128 | 
129 | The complete list of arguments used in `ReLERNN_TRAIN` is found below:
130 | ```
131 | ReLERNN_TRAIN -h
132 | 
133 | usage: ReLERNN_TRAIN [-h] [-d OUTDIR] [--nEpochs NEPOCHS]
134 |                      [-t NCPU] [-s SEED]
135 |                      [--nValSteps NVALSTEPS] [--gpuID GPUID]
136 | 
137 | optional arguments:
138 |   -h, --help            show this help message and exit
139 |   -d OUTDIR, --projectDir OUTDIR
140 |                         Directory for all project output. NOTE: the same
141 |                         projectDir must be used for all functions of ReLERNN
142 |   -t NCPU, --nCPU NCPU  Number of CPUs to use (defaults to 1)
143 |   -s SEED, --seed SEED  Random seed
144 |   --nEpochs NEPOCHS     Number of epochs to train over
145 |   --nValSteps NVALSTEPS
146 |                         Number of validation steps
147 |   --gpuID GPUID         Identifier specifying which GPU to use
148 | ```
149 | 
150 | 
151 | 
152 | ### Step 3) ReLERNN_PREDICT
153 | `ReLERNN_PREDICT` now takes the same VCF file you used in `ReLERNN_SIMULATE` and predicts per-base recombination rates in non-overlapping windows across the genome. The output file of predictions will be created as `$/projectDir/vcfprefix.PREDICT.txt`. It is important to note that the window size used for predictions might be different for different chromosomes. A complete list of the window sizes used for each chromosome can be found in the third column of `$/projectDir/networks/windowSizes.txt`.
Use the optional `--minSites` argument to exclude windows with fewer than the desired number of SNPs. If you are not interested in estimating confidence intervals around the predictions, your ReLERNN analysis is now finished. If you are getting OOM errors at this step you can try setting `--batchSizeOverride` to a value significantly less than the total number of windows along a chromosome (found in the last column of `$/projectDir/networks/windowSizes.txt`).
154 | 
155 | 
156 | The complete list of arguments used in `ReLERNN_PREDICT` is found below:
157 | ```
158 | ReLERNN_PREDICT -h
159 | 
160 | usage: ReLERNN_PREDICT [-h] [-v VCF] [-d OUTDIR] [--minSites MINS]
161 |                        [--gpuID GPUID] [--batchSizeOverride BSO] [-s SEED]
162 | 
163 | optional arguments:
164 |   -h, --help            show this help message and exit
165 |   -v VCF, --vcf VCF     Filtered and QC-checked VCF file. Important: Every row
166 |                         must correspond to a biallelic SNP with no missing
167 |                         data!
168 |   -d OUTDIR, --projectDir OUTDIR
169 |                         Directory for all project output. NOTE: the same
170 |                         projectDir must be used for all functions of ReLERNN
171 |   --phased              VCF file is phased
172 |   --unphased            VCF file is unphased
173 |   --minSites MINS       Minimum number of SNPs in a genomic window required to
174 |                         return a prediction
175 |   --gpuID GPUID         Identifier specifying which GPU to use
176 |   --batchSizeOverride BSO
177 |                         Batch size to use for low memory applications
178 |   -s SEED, --seed SEED  Random seed
179 | 
180 | ```
181 | 
182 | ### Optional Step 4) ReLERNN_BSCORRECT
183 | However, you might want to have an idea of the uncertainty around your predictions. This is where `ReLERNN_BSCORRECT` comes in. `ReLERNN_BSCORRECT` generates 95% confidence intervals around each prediction, and additionally attempts to correct for systematic bias ([see Materials and Methods](https://www.biorxiv.org/content/biorxiv/early/2019/08/16/662247.full.pdf)). It does this by simulating a set of `--nReps` examples at each of `--nSlice` recombination rate bins. It then uses the network that was trained in `ReLERNN_TRAIN` and estimates the distribution of predictions around each known recombination rate. The result is both an estimate of uncertainty, and a prediction that has been slightly corrected to account for biases in how the network predicts in this area of parameter space. The resulting file is created as `$/projectDir/vcfprefix.PREDICT.BSCORRECT.txt`, and is formatted similarly to `$/projectDir/vcfprefix.PREDICT.txt`, with the addition of columns for the low and high 95CI bounds. Note that this step is simulation heavy and runtimes can be slow.
184 | 
185 | The complete list of arguments used in `ReLERNN_BSCORRECT` is found below:
186 | ```
187 | ReLERNN_BSCORRECT -h
188 | 
189 | usage: ReLERNN_BSCORRECT [-h] [-d OUTDIR] [-t NCPU] [-s SEED] [--gpuID GPUID]
190 |                          [--nSlice NSLICE] [--nReps NREPS]
191 | 
192 | optional arguments:
193 |   -h, --help            show this help message and exit
194 |   -d OUTDIR, --projectDir OUTDIR
195 |                         Directory for all project output.
NOTE: the same 196 | projectDir must be used for all functions of ReLERNN 197 | -t NCPU, --nCPU NCPU Number of CPUs to use (defaults to total available cores) 198 | -s SEED, --seed SEED Random seed 199 | --gpuID GPUID Identifier specifying which GPU to use 200 | --nSlice NSLICE Number of recombination rate bins to simulate over 201 | --nReps NREPS Number of simulations per step 202 | ``` 203 | 204 | ## Estimating a recombination landscape from pool-seq data 205 | 206 | Similar to the directions above, the ReLERNN pipeline for pool-seq data is executed using four commands: `ReLERNN_SIMULATE_POOL`, `ReLERNN_TRAIN_POOL`, `ReLERNN_PREDICT_POOL`, and the optional `ReLERNN_BSCORRECT`. 207 | 208 | ### Before running ReLERNN 209 | ReLERNN for pool-seq analyses takes as input a file of genomic positions and allele frequencies (herein a 'POOLFILE'; see example file). 210 | 211 | Similar to ReLERNN for individually sequenced chromosomes, if you want to make predictions based on equilibrium simulations, you can skip ahead to executing `ReLERNN_SIMULATE_POOL`. 212 | While ReLERNN is generally robust to demographic model misspecification, prediction accuracy may potentially be improved by simulating the training set under a demographic history that accurately matches that of your sample. ReLERNN optionally takes the raw output files from three popular demographic history inference programs ([stairwayplot_v1](https://sites.google.com/site/jpopgen/stairway-plot), [SMC++](https://github.com/popgenmethods/smcpp), and [MSMC](https://github.com/stschiff/msmc)), and simulates a training set under these histories. It is up to the user to perform the proper due diligence to ensure that the population size histories reported by these programs are sound. In our opinion, unless you know exactly how these programs work and you expect your data to represent a history dramatically different from equilibrium, you are better off skipping this step and training ReLERNN on equilibrium simulations. Once you have run one of the demographic history inference programs listed above, you simply provide the raw output file from that program to ReLERNN_SIMULATE_POOL using the `--demographicHistory` option. 213 | 214 | 215 | ### Step 1) ReLERNN_SIMULATE_POOL 216 | `ReLERNN_SIMULATE_POOL` reads your POOLFILE and splits it by chromosome. The number of chromosomes in the pool must be specified using the `--sampleDepth` argument. The genomic chromosomes to be evaluated must be specified by providing a BED file of said positions using the `--genome` argument. A BED-formatted accessibility mask (with non-overlapping ascending windows) may be optionally provided using the `--mask` option. It is required that the POOLFILE use the extension `.pool`. The prefix of that file will serve as the prefix used for all output files (e.g. running ReLERNN on the file `population7.pool` will generate the result file `population7.PREDICT.txt`). It is strongly recommended that you use the default setting for `--maxSites`, larger values can cause training to fail and smaller values can result in lower accuracy. Users are required to provide an estimate of the per-base mutation rate for your sample, along with an estimate for generation time (in years). If you previously ran one of the demographic history inference programs listed above, just use the same values that you used for them. This is also where you will point to the output from said program, using `--demographicHistory`. 
If you are not simulating under an inferred history, simply do not include this option. Importantly, you can also set a value for the maximum recombination rate to be simulated using `--upperRhoThetaRatio`. If you have an a priori estimate for an upper bound to the ratio of rho to theta go ahead and set this here. Keep in mind that higher values will dramatically slow the coalescent simulations. We recommend using the default number of train/test/validation simulation examples, but if you want to simulate more examples, go right ahead. `ReLERNN_SIMULATE_POOL` then uses msprime to simulate 100k training examples and 1k validation and test examples. All output files will be generated in subdirectories within the path provided to `--projectDir`. It is required that you use the same projectDir for all four ReLERNN commands. If you want to run ReLERNN of multiple populations/taxa, you can run them independently using a unique projectDir for each. This step is simulation heavy and runtimes will strongly depend on the inferred population size. 217 | 218 | The complete list of arguments used in `ReLERNN_SIMULATE_POOL` is found below: 219 | ``` 220 | ReLERNN_SIMULATE_POOL -h 221 | 222 | usage: ReLERNN_SIMULATE_POOL [-h] [-p POOL] [--sampleDepth SAMD] [-g GENOME] [-m MASK] [-d OUTDIR] 223 | [-n DEM] [-u MU] [-l GENTIME] [-r UPRTR] [-t NCPU] [-s SEED] 224 | [--maxSites WINSIZEMX] [--maskThresh MASKTHRESH] 225 | [--nTrain NTRAIN] [--nVali NVALI] [--nTest NTEST] 226 | 227 | optional arguments: 228 | -h, --help show this help message and exit 229 | -p POOL, --pool POOL Filtered and QC-checked POOL file. 230 | --sampleDepth SAMD Number of chromosomes in pool 231 | -g GENOME, --genome GENOME 232 | BED-formatted (i.e. zero-based) file corresponding to 233 | chromosomes and positions to consider 234 | -m MASK, --mask MASK BED-formatted file corresponding to inaccessible bases 235 | -d OUTDIR, --projectDir OUTDIR 236 | Directory for all project output. NOTE: the same 237 | projectDir must be used for all functions of ReLERNN 238 | -n DEM, --demographicHistory DEM 239 | Output file from either stairwayplot, SMC++, or MSMC 240 | -u MU, --assumedMu MU 241 | Assumed per-base mutation rate 242 | -l GENTIME, --assumedGenTime GENTIME 243 | Assumed generation time (in years) 244 | -r UPRTR, --upperRhoThetaRatio UPRTR 245 | Assumed upper bound for the ratio of rho to theta 246 | -t NCPU, --nCPU NCPU Number of CPUs to use (defaults to total available cores) 247 | -s SEED, --seed SEED Random seed 248 | --maxSites WINSIZEMX 249 | Max number of sites per window to train on. Important: 250 | too many sites causes problems in training 251 | --maskThresh MASKTHRESH 252 | Discard windows where >= maskThresh percent of sites 253 | are inaccessible 254 | --nTrain NTRAIN Number of training examples to simulate 255 | --nVali NVALI Number of validation examples to simulate 256 | --nTest NTEST Number of test examples to simulate 257 | ``` 258 | 259 | 260 | ### Step 2) ReLERNN_TRAIN_POOL 261 | `ReLERNN_TRAIN_POOL` takes the simulations created by `ReLERNN_SIMULATE_POOL` and uses them to train a recurrent neural network. The only difference here is that the mean read depth of the pool must be specified using the `--readDepth` argument. You can also specify a minor allele frequency threshold (`--maf`), if a similar threshold was used to generate your POOLFILE. Again, we recommend using the defaults for `--nEpochs` and `--nValSteps`, but if you would like to do more training, feel free. 
To set the GPU to be used for machines with multiple dedicated GPUs use `--gpuID` (e.g. if running an analysis on two populations simultaneously, set `--gpuID 0` for the first population and `--gpuID 1` for the second). `ReLERNN_TRAIN_POOL` outputs some basic metrics of the training results for you, generating the figure `$/projectDir/networks/poolprefix.pdf`. The default value of `-nCPU` for this step is the max number of available cores, as training on pooled data with a single core can be very slow. 262 | 263 | The complete list of arguments used in `ReLERNN_TRAIN_POOL` is found below: 264 | ``` 265 | ReLERNN_TRAIN_POOL -h 266 | 267 | usage: ReLERNN_TRAIN_POOL [-h] [-d OUTDIR] [--readDepth SEQD] [--maf MAF] [--nEpochs NEPOCHS] 268 | [--nValSteps NVALSTEPS] [-t NCPU] [-s SEED] [--gpuID GPUID] 269 | 270 | optional arguments: 271 | -h, --help show this help message and exit 272 | -d OUTDIR, --projectDir OUTDIR 273 | Directory for all project output. NOTE: the same 274 | projectDir must be used for all functions of ReLERNN 275 | --readDepth SEQD Mean read depth of the pool 276 | --maf MAF discard simulated sites with allele frequencies < maf 277 | --nEpochs NEPOCHS Number of epochs to train over 278 | --nValSteps NVALSTEPS 279 | Number of validation steps 280 | -t NCPU, --nCPU NCPU Number of CPUs to use (defaults to total available cores) 281 | -s SEED, --seed SEED Random seed 282 | --gpuID GPUID Identifier specifying which GPU to use 283 | ``` 284 | 285 | 286 | 287 | ### Step 3) ReLERNN_PREDICT_POOL 288 | `ReLERNN_PREDICT_POOL` now takes the same POOL file you used in `ReLERNN_SIMULATE_POOL` and predicts per-base recombination rates in non-overlapping windows across the genome. The output file of predictions will be created as `$/projectDir/poolprefix.PREDICT.txt`. It is important to note that the window size used for predictions might be different for different chromosomes. A complete list of the window sizes used for each chromosome can be found in third column of `$/projectDir/networks/windowSizes.txt`. Use the optional `--minSites` argument to exclude windows with fewer than the desired number of SNPs. If you are not interested in estimating confidence intervals around the predictions, your ReLERNN analysis is now finished. If you are getting OOM errors at this step you can try setting `--batchSizeOverride` to a value significantly less than the total number of windows along a chromosome (found in the last column of `$/projectDir/networks/windowSizes.txt`). 289 | 290 | 291 | The complete list of arguments used in `ReLERNN_PREDICT_POOL` is found below: 292 | ``` 293 | ReLERNN_PREDICT_POOL -h 294 | 295 | usage: ReLERNN_PREDICT [-h] [-p POOL] [-d OUTDIR] [--minSites MINS] 296 | [--batchSizeOverride BSO] [--gpuID GPUID] [-s SEED] 297 | 298 | optional arguments: 299 | -h, --help show this help message and exit 300 | -p POOL, --pool POOL Filtered and QC-checked POOL file. 301 | -d OUTDIR, --projectDir OUTDIR 302 | Directory for all project output. NOTE: the same 303 | projectDir must be used for all functions of ReLERNN 304 | --minSites MINS Minimum number of SNPs in a genomic window required to 305 | return a prediction 306 | --batchSizeOverride BSO 307 | Batch size to use for low memory applications 308 | --gpuID GPUID Identifier specifying which GPU to use 309 | -s SEED, --seed SEED Random seed 310 | ``` 311 | 312 | ### Optional Step 4) ReLERNN_BSCORRECT 313 | This step is exactly the same as in ReLERNN for individually sequenced chromosomes (above). 
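For reference, a minimal sketch of this step for a pool-seq project is shown below; the project directory is a placeholder, and the bin and replicate counts are illustrative values rather than recommended defaults.

```bash
# Correct predictions and add 95% CIs for a pool-seq project (illustrative values)
ReLERNN_BSCORRECT \
    --projectDir ./my_pool_project/ \
    --nSlice 100 \
    --nReps 100 \
    --seed 42
```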
314 | 315 | The complete list of arguments used in `ReLERNN_BSCORRECT` is found below: 316 | ``` 317 | ReLERNN_BSCORRECT -h 318 | 319 | usage: ReLERNN_BSCORRECT [-h] [-d OUTDIR] [-t NCPU] [-s SEED] [--gpuID GPUID] 320 | [--nSlice NSLICE] [--nReps NREPS] 321 | 322 | optional arguments: 323 | -h, --help show this help message and exit 324 | -d OUTDIR, --projectDir OUTDIR 325 | Directory for all project output. NOTE: the same 326 | projectDir must be used for all functions of ReLERNN 327 | -t NCPU, --nCPU NCPU Number of CPUs to use (defaults to total available cores) 328 | -s SEED, --seed SEED Random seed 329 | --gpuID GPUID Identifier specifying which GPU to use 330 | --nSlice NSLICE Number of recombination rate bins to simulate over 331 | --nReps NREPS Number of simulations per step 332 | ``` 333 | --------------------------------------------------------------------------------