├── methodFlow.png ├── requirements.txt ├── examples ├── accessibility_mask.bed ├── genome.bed ├── example_pipeline.sh └── example_pipeline_pool.sh ├── ReLERNN ├── __init__.py ├── imports.py ├── networks.py ├── ReLERNN_TRAIN ├── ReLERNN_TRAIN_POOL ├── ReLERNN_PREDICT_POOL ├── ReLERNN_PREDICT ├── ReLERNN_BSCORRECT ├── ReLERNN_SIMULATE_POOL ├── ReLERNN_SIMULATE ├── simulator.py ├── manager.py ├── sequenceBatchGenerator.py └── helpers.py ├── manuscript └── README.md ├── LICENSE ├── setup.py ├── .gitignore └── README.md /methodFlow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kr-colab/ReLERNN/HEAD/methodFlow.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | msprime>=0.7.4 2 | scikit-learn>=0.22.1 3 | matplotlib>=3.1.3 4 | scikit-allel>=1.2.1 5 | -------------------------------------------------------------------------------- /examples/accessibility_mask.bed: -------------------------------------------------------------------------------- 1 | 2L 0 7000 2 | 2R 0 9000 3 | 3L 0 35000 4 | 3R 0 4000 5 | X 0 7300 6 | -------------------------------------------------------------------------------- /examples/genome.bed: -------------------------------------------------------------------------------- 1 | 2L 0 840000 2 | 2R 0 1669000 3 | 3L 0 742000 4 | 3R 0 1963000 5 | X 0 1250000 6 | -------------------------------------------------------------------------------- /ReLERNN/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from ReLERNN.imports import * 4 | from ReLERNN.helpers import * 5 | from ReLERNN.networks import * 6 | from ReLERNN.sequenceBatchGenerator import * 7 | from ReLERNN.simulator import * 8 | -------------------------------------------------------------------------------- /manuscript/README.md: -------------------------------------------------------------------------------- 1 | # Predicting the landscape of recombination using deep learning 2 | ## Jeffrey R. Adrion, Jared G. Galloway, and Andrew D. Kern 3 | ==================================================================== 4 | 5 | This repository will host code for analyses found in [Adrion, Galloway, and Kern (2020)](https://academic.oup.com/mbe/advance-article/doi/10.1093/molbev/msaa038/5741419). 
6 | -------------------------------------------------------------------------------- /ReLERNN/imports.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pickle 3 | import sys 4 | import msprime as msp 5 | import numpy as np 6 | import os 7 | import multiprocessing as mp 8 | import shutil 9 | import random 10 | import copy 11 | import argparse 12 | import h5py 13 | import allel 14 | import time 15 | 16 | from sklearn.neighbors import NearestNeighbors 17 | from sklearn.utils import resample 18 | 19 | import matplotlib as mpl 20 | mpl.use('pdf') 21 | import matplotlib.pyplot as plt 22 | 23 | import tensorflow as tf 24 | from tensorflow.keras import layers 25 | from tensorflow.keras.models import Model, model_from_json 26 | from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TerminateOnNaN 27 | -------------------------------------------------------------------------------- /examples/example_pipeline.sh: -------------------------------------------------------------------------------- 1 | SIMULATE="ReLERNN_SIMULATE" 2 | TRAIN="ReLERNN_TRAIN" 3 | PREDICT="ReLERNN_PREDICT" 4 | BSCORRECT="ReLERNN_BSCORRECT" 5 | SEED="42" 6 | MU="1e-8" 7 | URTR="1" 8 | DIR="./example_output/" 9 | VCF="./example.vcf" 10 | GENOME="./genome.bed" 11 | MASK="./accessibility_mask.bed" 12 | 13 | # Simulate data 14 | ${SIMULATE} \ 15 | --vcf ${VCF} \ 16 | --genome ${GENOME} \ 17 | --mask ${MASK} \ 18 | --projectDir ${DIR} \ 19 | --assumedMu ${MU} \ 20 | --upperRhoThetaRatio ${URTR} \ 21 | --nTrain 13000 \ 22 | --nVali 2000 \ 23 | --nTest 100 \ 24 | --seed ${SEED} 25 | 26 | # Train network 27 | ${TRAIN} \ 28 | --projectDir ${DIR} \ 29 | --seed ${SEED} 30 | 31 | # Predict 32 | ${PREDICT} \ 33 | --vcf ${VCF} \ 34 | --projectDir ${DIR} \ 35 | --seed ${SEED} 36 | 37 | # Parametric Bootstrapping 38 | ${BSCORRECT} \ 39 | --projectDir ${DIR} \ 40 | --nSlice 2 \ 41 | --nReps 2 \ 42 | --seed ${SEED} 43 | -------------------------------------------------------------------------------- /examples/example_pipeline_pool.sh: -------------------------------------------------------------------------------- 1 | SIMULATE="ReLERNN_SIMULATE_POOL" 2 | TRAIN="ReLERNN_TRAIN_POOL" 3 | PREDICT="ReLERNN_PREDICT_POOL" 4 | BSCORRECT="ReLERNN_BSCORRECT" 5 | SEED="42" 6 | MU="1e-8" 7 | URTR="1" 8 | DIR="./example_output_pool/" 9 | POOL="./example.pool" 10 | GENOME="./genome.bed" 11 | MASK="./accessibility_mask.bed" 12 | 13 | # Simulate data 14 | ${SIMULATE} \ 15 | --pool ${POOL} \ 16 | --sampleDepth 20 \ 17 | --genome ${GENOME} \ 18 | --mask ${MASK} \ 19 | --projectDir ${DIR} \ 20 | --assumedMu ${MU} \ 21 | --upperRhoThetaRatio ${URTR} \ 22 | --nTrain 13000 \ 23 | --nVali 2000 \ 24 | --nTest 100 \ 25 | --seed ${SEED} 26 | 27 | # Train network 28 | ${TRAIN} \ 29 | --projectDir ${DIR} \ 30 | --readDepth 20 \ 31 | --maf 0.05 \ 32 | --seed ${SEED} 33 | 34 | # Predict 35 | ${PREDICT} \ 36 | --pool ${POOL} \ 37 | --projectDir ${DIR} \ 38 | --seed ${SEED} 39 | 40 | # Parametric Bootstrapping 41 | ${BSCORRECT} \ 42 | --projectDir ${DIR} \ 43 | --seed ${SEED} 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kern Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without 
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup, find_packages 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | setup(name='ReLERNN', 8 | version='0.2', 9 | requires_python="<3.11", 10 | description='ReLERNN: Recombination Landscape Estimation using Recurrent Neural Networks', 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | url='https://github.com/kern-lab/ReLERNN/', 14 | author='Jeffrey Adrion, Jared Galloway, Andrew Kern', 15 | author_email='jeffreyadrion@gmail.com, jaredgalloway07@gmail.com, adk@uoregon.edu', 16 | license='MIT', 17 | packages=find_packages(exclude=[]), 18 | install_requires=[ 19 | "msprime>=0.7.4", 20 | "scikit-learn>=0.22.1", 21 | "matplotlib>=3.1.3", 22 | "scikit-allel>=1.2.1", 23 | "tensorflow==2.15.0"], 24 | scripts=[ 25 | "ReLERNN/ReLERNN_SIMULATE", 26 | "ReLERNN/ReLERNN_SIMULATE_POOL", 27 | "ReLERNN/ReLERNN_TRAIN", 28 | "ReLERNN/ReLERNN_TRAIN_POOL", 29 | "ReLERNN/ReLERNN_PREDICT", 30 | "ReLERNN/ReLERNN_PREDICT_POOL", 31 | "ReLERNN/ReLERNN_BSCORRECT"], 32 | zip_safe=False, 33 | setup_requires=[], 34 | ) 35 | 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vim 107 | *.swp 108 | *.swo 109 | -------------------------------------------------------------------------------- /ReLERNN/networks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Authors: Jeff Adrion, Andrew Kern, Jared Galloway 3 | ''' 4 | 5 | from ReLERNN.imports import * 6 | 7 | def GRU_TUNED84(x,y): 8 | ''' 9 | Same as GRU_VANILLA but with dropout AFTER each dense layer. 10 | ''' 11 | 12 | haps,pos = x 13 | 14 | numSNPs = haps[0].shape[0] 15 | numSamps = haps[0].shape[1] 16 | numPos = pos[0].shape[0] 17 | 18 | genotype_inputs = layers.Input(shape=(numSNPs,numSamps)) 19 | model = layers.Bidirectional(layers.GRU(84,return_sequences=False))(genotype_inputs) 20 | model = layers.Dense(256)(model) 21 | model = layers.Dropout(0.35)(model) 22 | 23 | #---------------------------------------------------- 24 | 25 | position_inputs = layers.Input(shape=(numPos,)) 26 | m2 = layers.Dense(256)(position_inputs) 27 | 28 | #---------------------------------------------------- 29 | 30 | 31 | model = layers.concatenate([model,m2]) 32 | model = layers.Dense(64)(model) 33 | model = layers.Dropout(0.35)(model) 34 | output = layers.Dense(1)(model) 35 | 36 | #---------------------------------------------------- 37 | 38 | model = Model(inputs=[genotype_inputs,position_inputs], outputs=[output]) 39 | model.compile(optimizer='Adam', loss='mse') 40 | model.summary() 41 | 42 | return model 43 | 44 | 45 | def GRU_POOLED(x,y): 46 | 47 | sites=x.shape[1] 48 | features=x.shape[2] 49 | 50 | genotype_inputs = layers.Input(shape=(sites,features)) 51 | model = layers.Bidirectional(layers.GRU(84,return_sequences=False))(genotype_inputs) 52 | model = layers.Dense(256)(model) 53 | model = layers.Dropout(0.35)(model) 54 | output = layers.Dense(1)(model) 55 | 56 | model = Model(inputs=[genotype_inputs], outputs=[output]) 57 | model.compile(optimizer='Adam', loss='mse') 58 | model.summary() 59 | 60 | return model 61 | 62 | 63 | def HOTSPOT_CLASSIFY(x,y): 64 | 65 | haps,pos = x 66 | 67 | numSNPs = haps[0].shape[0] 68 | numSamps = haps[0].shape[1] 69 | numPos = pos[0].shape[0] 70 | 71 | genotype_inputs = layers.Input(shape=(numSNPs,numSamps)) 72 | model = layers.Bidirectional(layers.GRU(84,return_sequences=False))(genotype_inputs) 73 | model = layers.Dense(256)(model) 74 | model = layers.Dropout(0.35)(model) 75 | 76 | 
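# As in GRU_TUNED84 above, the genotype branch (bidirectional GRU -> dense -> dropout)
# is joined by a second input branch below that embeds the vector of SNP positions.
# The two branches are then concatenated and passed through a final dense stack;
# the sigmoid output and binary cross-entropy loss make this a hotspot classifier
# rather than a recombination-rate regressor.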
#---------------------------------------------------- 77 | 78 | position_inputs = layers.Input(shape=(numPos,)) 79 | m2 = layers.Dense(256)(position_inputs) 80 | 81 | #---------------------------------------------------- 82 | 83 | 84 | model = layers.concatenate([model,m2]) 85 | model = layers.Dense(64)(model) 86 | model = layers.Dropout(0.35)(model) 87 | output = layers.Dense(1,activation='sigmoid')(model) 88 | 89 | #---------------------------------------------------- 90 | 91 | model = Model(inputs=[genotype_inputs,position_inputs], outputs=[output]) 92 | model.compile(optimizer='adam', loss='binary_crossentropy') 93 | model.summary() 94 | 95 | return model 96 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_TRAIN: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Trains a network on data simulated by ReLERNN_SIMULATE.py""" 3 | 4 | from ReLERNN.imports import * 5 | from ReLERNN.helpers import * 6 | from ReLERNN.sequenceBatchGenerator import * 7 | from ReLERNN.networks import * 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 13 | parser.add_argument('--nEpochs',dest='nEpochs',help='Maximum number of epochs to train (EarlyStopping is implemented for validation accuracy)', type=int, default=1000) 14 | parser.add_argument('--nValSteps',dest='nValSteps',help='Number of validation steps', type=int, default=20) 15 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=1) 16 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 17 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default=0) 18 | args = parser.parse_args() 19 | 20 | 21 | ## Set seed 22 | if args.seed: 23 | os.environ['PYTHONHASHSEED']=str(args.seed) 24 | random.seed(args.seed) 25 | np.random.seed(args.seed) 26 | 27 | 28 | ## Set number of cores 29 | nProc = args.nCPU 30 | 31 | 32 | ## Set up the directory structure to store the simulations data. 
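# The projectDir is expected to already contain the output of ReLERNN_SIMULATE:
# train/, vali/, and test/ (each with an info.p recording per-replicate
# segregating-site counts) and networks/ (containing windowSizes.txt).
# The trained model (model.json, weights.h5) and test results are written to networks/.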
33 | if not args.outDir: 34 | print("Warning: No project directory found, using current working directory.") 35 | projectDir = os.getcwd() 36 | else: 37 | projectDir = args.outDir 38 | trainDir = os.path.join(projectDir,"train") 39 | valiDir = os.path.join(projectDir,"vali") 40 | testDir = os.path.join(projectDir,"test") 41 | networkDir = os.path.join(projectDir,"networks") 42 | 43 | 44 | ## Define output files 45 | test_resultFile = os.path.join(networkDir,"testResults.p") 46 | test_resultFig = os.path.join(networkDir,"testResults.pdf") 47 | modelSave = os.path.join(networkDir,"model.json") 48 | weightsSave = os.path.join(networkDir,"weights.h5") 49 | 50 | 51 | ## Identify padding required 52 | maxSimS = 0 53 | winFILE=os.path.join(networkDir,"windowSizes.txt") 54 | with open(winFILE, "r") as fIN: 55 | for line in fIN: 56 | maxSimS=max([maxSimS, int(line.split()[5])]) 57 | maxSegSites = 0 58 | for ds in [trainDir,valiDir,testDir]: 59 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 60 | segSitesInDs = max(DsInfoDir["segSites"]) 61 | maxSegSites = max(maxSegSites,segSitesInDs) 62 | maxSegSites = max(maxSegSites, maxSimS) 63 | 64 | 65 | ## Set network parameters 66 | bds_train_params = { 67 | 'treesDirectory':trainDir, 68 | 'targetNormalization':"zscore", 69 | 'batchSize': 64, 70 | 'maxLen': maxSegSites, 71 | 'frameWidth': 5, 72 | 'shuffleInds':True, 73 | 'sortInds':False, 74 | 'center':False, 75 | 'ancVal':-1, 76 | 'padVal':0, 77 | 'derVal':1, 78 | 'realLinePos':True, 79 | 'posPadVal':0, 80 | 'seqD':None, 81 | 'seed':args.seed 82 | } 83 | 84 | 85 | ## Dump batch pars for bootstrap 86 | batchParsFILE=os.path.join(networkDir,"batchPars.p") 87 | with open(batchParsFILE, "wb") as fOUT: 88 | pickle.dump(bds_train_params,fOUT) 89 | 90 | 91 | bds_vali_params = copy.deepcopy(bds_train_params) 92 | bds_vali_params['treesDirectory'] = valiDir 93 | bds_vali_params['batchSize'] = 64 94 | 95 | bds_test_params = copy.deepcopy(bds_train_params) 96 | bds_test_params['treesDirectory'] = testDir 97 | DsInfoDir = pickle.load(open(os.path.join(testDir,"info.p"),"rb")) 98 | bds_test_params['batchSize'] = DsInfoDir["numReps"] 99 | bds_test_params['shuffleExamples'] = False 100 | 101 | 102 | ## Define sequence batch generator 103 | train_sequence = SequenceBatchGenerator(**bds_train_params) 104 | vali_sequence = SequenceBatchGenerator(**bds_vali_params) 105 | test_sequence = SequenceBatchGenerator(**bds_test_params) 106 | 107 | 108 | ## Train network 109 | runModels(ModelFuncPointer=GRU_TUNED84, 110 | ModelName="GRU_TUNED84", 111 | TrainDir=trainDir, 112 | TrainGenerator=train_sequence, 113 | ValidationGenerator=vali_sequence, 114 | TestGenerator=test_sequence, 115 | resultsFile=test_resultFile, 116 | network=[modelSave,weightsSave], 117 | numEpochs=args.nEpochs, 118 | validationSteps=args.nValSteps, 119 | nCPU=nProc, 120 | gpuID=args.gpuID) 121 | 122 | 123 | ## Plot results of predictions on test set 124 | plotResults(resultsFile=test_resultFile,saveas=test_resultFig) 125 | 126 | 127 | print("\n***ReLERNN_TRAIN.py FINISHED!***\n") 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_TRAIN_POOL: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Trains a network on data simulated by ReLERNN_SIMULATE_POOL.py""" 3 | 4 | from ReLERNN.imports import * 5 | from ReLERNN.helpers import * 6 | from ReLERNN.sequenceBatchGenerator import * 7 | 
from ReLERNN.networks import * 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 13 | parser.add_argument('--readDepth',dest='seqD',help='Mean read depth of the pool', type=int, default=0) 14 | parser.add_argument('--maf',dest='maf',help='discard simulated sites with allele frequencies < maf', type=float, default=0.05) 15 | parser.add_argument('--nEpochs',dest='nEpochs',help='Maximum number of epochs to train (EarlyStopping is implemented for validation accuracy)', type=int, default=1000) 16 | parser.add_argument('--nValSteps',dest='nValSteps',help='Number of validation steps', type=int, default=20) 17 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=None) 18 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 19 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default=0) 20 | args = parser.parse_args() 21 | 22 | 23 | ## Set seed 24 | if args.seed: 25 | os.environ['PYTHONHASHSEED']=str(args.seed) 26 | random.seed(args.seed) 27 | np.random.seed(args.seed) 28 | 29 | 30 | ## Set number of cores 31 | if args.nCPU: 32 | nProc = args.nCPU 33 | else: 34 | nProc = mp.cpu_count() 35 | 36 | 37 | print("Warning: training data to be treated as if generated by pool-seq") 38 | if args.seqD == 0: 39 | print("Error: assumed sequencing depth must be provided.") 40 | sys.exit(1) 41 | 42 | 43 | ## Set up the directory structure to store the simulations data. 44 | if not args.outDir: 45 | print("Warning: No project directory found, using current working directory.") 46 | projectDir = os.getcwd() 47 | else: 48 | projectDir = args.outDir 49 | trainDir = os.path.join(projectDir,"train") 50 | valiDir = os.path.join(projectDir,"vali") 51 | testDir = os.path.join(projectDir,"test") 52 | networkDir = os.path.join(projectDir,"networks") 53 | 54 | 55 | ## Define output files 56 | test_resultFile = os.path.join(networkDir,"testResults.p") 57 | test_resultFig = os.path.join(networkDir,"testResults.pdf") 58 | modelSave = os.path.join(networkDir,"model.json") 59 | weightsSave = os.path.join(networkDir,"weights.h5") 60 | 61 | 62 | ## Identify padding required 63 | maxSimS = 0 64 | winFILE=os.path.join(networkDir,"windowSizes.txt") 65 | with open(winFILE, "r") as fIN: 66 | for line in fIN: 67 | maxSimS=max([maxSimS, int(line.split()[5])]) 68 | maxSegSites = 0 69 | for ds in [trainDir,valiDir,testDir]: 70 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 71 | segSitesInDs = max(DsInfoDir["segSites"]) 72 | maxSegSites = max(maxSegSites,segSitesInDs) 73 | maxSegSites = max(maxSegSites, maxSimS) 74 | 75 | 76 | ## Set network parameters 77 | bds_train_params = { 78 | 'treesDirectory':trainDir, 79 | 'targetNormalization':"zscore", 80 | 'batchSize': 64, 81 | 'maxLen': maxSegSites, 82 | 'frameWidth': 5, 83 | 'shuffleInds':True, 84 | 'sortInds':False, 85 | 'center':False, 86 | 'ancVal':-1, 87 | 'padVal':0, 88 | 'derVal':1, 89 | 'realLinePos':True, 90 | 'posPadVal':0, 91 | 'seqD':args.seqD, 92 | 'maf':args.maf, 93 | 'seed':args.seed 94 | } 95 | 96 | 97 | ## Dump batch pars for bootstrap 98 | batchParsFILE=os.path.join(networkDir,"batchPars.p") 99 | with open(batchParsFILE, "wb") as fOUT: 100 | pickle.dump(bds_train_params,fOUT) 101 | 102 | 103 | bds_vali_params = 
copy.deepcopy(bds_train_params) 104 | bds_vali_params['treesDirectory'] = valiDir 105 | bds_vali_params['batchSize'] = 64 106 | 107 | bds_test_params = copy.deepcopy(bds_train_params) 108 | bds_test_params['treesDirectory'] = testDir 109 | DsInfoDir = pickle.load(open(os.path.join(testDir,"info.p"),"rb")) 110 | bds_test_params['batchSize'] = DsInfoDir["numReps"] 111 | bds_test_params['shuffleExamples'] = False 112 | 113 | 114 | ## Define sequence batch generator 115 | train_sequence = SequenceBatchGenerator(**bds_train_params) 116 | vali_sequence = SequenceBatchGenerator(**bds_vali_params) 117 | test_sequence = SequenceBatchGenerator(**bds_test_params) 118 | 119 | 120 | ## Train network 121 | runModels(ModelFuncPointer=GRU_POOLED, 122 | ModelName="GRU_POOLED", 123 | TrainDir=trainDir, 124 | TrainGenerator=train_sequence, 125 | ValidationGenerator=vali_sequence, 126 | TestGenerator=test_sequence, 127 | resultsFile=test_resultFile, 128 | network=[modelSave,weightsSave], 129 | numEpochs=args.nEpochs, 130 | validationSteps=args.nValSteps, 131 | nCPU=nProc, 132 | gpuID=args.gpuID) 133 | 134 | 135 | ## Plot results of predictions on test set 136 | plotResults(resultsFile=test_resultFile,saveas=test_resultFig) 137 | 138 | 139 | print("\n***ReLERNN_TRAIN_POOL.py FINISHED!***\n") 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_PREDICT_POOL: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Predicts the recombination rate for each genomic window in the POOL file 4 | using the network trained by ReLERNN_TRAIN_POOL.py 5 | """ 6 | 7 | from ReLERNN.imports import * 8 | from ReLERNN.helpers import * 9 | from ReLERNN.sequenceBatchGenerator import * 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-p','--pool',dest='pool',help='Filtered and QC-checked pool file') 15 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 16 | parser.add_argument('--minSites',dest='minS',help='Minimum number of SNPs in a genomic window required to return a prediction', type=int, default = 50) 17 | parser.add_argument('--batchSizeOverride',dest='bso',help='Batch size to use when number of windows along a chromosome for low memory applications', type=int, default = None) 18 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default = 0) 19 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 20 | args = parser.parse_args() 21 | 22 | 23 | ## Set seed 24 | if args.seed: 25 | os.environ['PYTHONHASHSEED']=str(args.seed) 26 | random.seed(args.seed) 27 | np.random.seed(args.seed) 28 | 29 | 30 | ## Set up the directory structure to store the simulations data. 
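# Predictions are made per chromosome from the split .pool files that
# ReLERNN_SIMULATE_POOL wrote to splitPOOLs/ (one whitespace-delimited line per
# site: chromosome, position, allele frequency), using the model.json and
# weights.h5 saved to networks/ by ReLERNN_TRAIN_POOL.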
31 | if not args.outDir: 32 | print("Warning: No project directory found, using current working directory.") 33 | projectDir = os.getcwd() 34 | else: 35 | projectDir = args.outDir 36 | trainDir = os.path.join(projectDir,"train") 37 | valiDir = os.path.join(projectDir,"vali") 38 | testDir = os.path.join(projectDir,"test") 39 | networkDir = os.path.join(projectDir,"networks") 40 | poolDir = os.path.join(projectDir,"splitPOOLs") 41 | modelSave = os.path.join(networkDir,"model.json") 42 | weightsSave = os.path.join(networkDir,"weights.h5") 43 | 44 | 45 | ## Read in the window sizes 46 | maxSimS = 0 47 | wins=[] 48 | winFILE=os.path.join(networkDir,"windowSizes.txt") 49 | with open(winFILE, "r") as fIN: 50 | for line in fIN: 51 | ar=line.split() 52 | wins.append([ar[0],int(ar[1]),int(ar[2]),int(ar[3]),int(ar[4]),int(ar[5]),int(ar[6])]) 53 | maxSimS=max([maxSimS, int(ar[5])]) 54 | 55 | 56 | ## Loop through chromosomes and predict 57 | for i in range(len(wins)): 58 | bn=os.path.basename(args.pool) 59 | poolFILE=os.path.join(poolDir,bn.replace(".pool","_%s.pool" %(wins[i][0]))) 60 | print("""Importing POOL: "%s"...""" %(poolFILE)) 61 | pos,fqs = [], [] 62 | with open(poolFILE, "r") as fIN: 63 | for line in fIN: 64 | ar = line.split() 65 | pos.append(int(ar[1])) 66 | fqs.append(float(ar[2])) 67 | chrom = ar[0] 68 | pos = np.array(pos) 69 | fqs = np.array(fqs) 70 | 71 | 72 | ## Identify padding required 73 | maxSegSites = 0 74 | for ds in [trainDir,valiDir,testDir]: 75 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 76 | segSitesInDs = max(DsInfoDir["segSites"]) 77 | maxSegSites = max(maxSegSites,segSitesInDs) 78 | maxSegSites = max(maxSegSites, maxSimS) 79 | 80 | 81 | ## Identify parameters used to train 82 | DsInfoDir = pickle.load(open(os.path.join(trainDir,"info.p"),"rb")) 83 | winLen=wins[i][2] 84 | numWins=wins[i][6] 85 | if args.bso: 86 | batchSize = args.bso 87 | else: 88 | batchSize = wins[i][6] 89 | 90 | 91 | batchPars = pickle.load(open(os.path.join(networkDir,"batchPars.p"),"rb")) 92 | normType = batchPars["targetNormalization"] 93 | 94 | 95 | ## Set network parameters 96 | bds_pred_params = { 97 | 'INFO':DsInfoDir, 98 | 'CHROM':chrom, 99 | 'winLen':winLen, 100 | 'numWins':numWins, 101 | 'IDs':get_index(pos,winLen), 102 | 'GT':fqs, 103 | 'POS':pos, 104 | 'batchSize': batchSize, 105 | 'maxLen': maxSegSites, 106 | 'frameWidth': 5, 107 | 'sortInds':False, 108 | 'center':False, 109 | 'ancVal':-1, 110 | 'padVal':0, 111 | 'derVal':1, 112 | 'realLinePos':True, 113 | 'posPadVal':0, 114 | 'normType':normType, 115 | 'seed':args.seed 116 | } 117 | 118 | 119 | ### Define sequence batch generator 120 | pool_gen = POOLBatchGenerator(**bds_pred_params) 121 | 122 | 123 | ## Load trained model and make predictions on pool data 124 | pred_resultFile = os.path.join(projectDir,wins[i][0]+".CHPREDICT.txt") 125 | load_and_predictVCF(VCFGenerator=pool_gen, 126 | resultsFile=pred_resultFile, 127 | network=[modelSave,weightsSave], 128 | chromStr=wins[i][0], 129 | minS=args.minS, 130 | numWins=numWins, 131 | batchSize=batchSize, 132 | gpuID=args.gpuID) 133 | 134 | 135 | ## Combine chromosome predictions in whole genome prediction file and rm chromosome files 136 | genPredFILE=os.path.join(projectDir,bn.replace(".pool",".PREDICT.txt")) 137 | files=[] 138 | for f in glob.glob(os.path.join(projectDir,"*.CHPREDICT.txt")): 139 | files.append(f) 140 | ct=0 141 | with open(genPredFILE, "w") as fOUT: 142 | for f in sorted(files): 143 | if ct==0: 144 | with open(f, "r") as fIN: 145 | for line in fIN: 146 | 
fOUT.write(line) 147 | else: 148 | with open(f, "r") as fIN: 149 | fIN.readline() 150 | for line in fIN: 151 | fOUT.write(line) 152 | ct+=1 153 | cmd="rm %s" %(f) 154 | os.system(cmd) 155 | 156 | 157 | print("\n***ReLERNN_PREDICT_POOL.py FINISHED!***\n") 158 | 159 | 160 | if __name__ == "__main__": 161 | main() 162 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_PREDICT: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Predicts the recombination rate for each genomic window in the VCF file 4 | using a GRU network trained in ReLERNN_TRAIN.py 5 | """ 6 | 7 | from ReLERNN.imports import * 8 | from ReLERNN.helpers import * 9 | from ReLERNN.sequenceBatchGenerator import * 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-v','--vcf',dest='vcf',help='Filtered and QC-checked VCF file. Important: Every row must correspond to a biallelic SNP with no missing data!') 15 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 16 | parser.add_argument('--phased',help='VCF file is phased',default=False, action='store_true') 17 | parser.add_argument('--unphased',dest='phased',help='VCF file is unphased',action='store_false') 18 | parser.add_argument('--minSites',dest='minS',help='Minimum number of SNPs in a genomic window required to return a prediction', type=int, default = 50) 19 | parser.add_argument('--batchSizeOverride',dest='bso',help='Batch size to use when number of windows along a chromosome for low memory applications', type=int, default = None) 20 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default = 0) 21 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 22 | args = parser.parse_args() 23 | 24 | 25 | ## Set seed 26 | if args.seed: 27 | os.environ['PYTHONHASHSEED']=str(args.seed) 28 | random.seed(args.seed) 29 | np.random.seed(args.seed) 30 | 31 | 32 | ## Set up the directory structure to store the simulations data. 
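# Predictions are made per chromosome from the HDF5 files in splitVCFs/ written
# during ReLERNN_SIMULATE (genotypes are read back with scikit-allel), and the
# per-window padding is matched to the training simulations so that input
# shapes agree with the network trained by ReLERNN_TRAIN.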
33 | if not args.outDir: 34 | print("Warning: No project directory found, using current working directory.") 35 | projectDir = os.getcwd() 36 | else: 37 | projectDir = args.outDir 38 | trainDir = os.path.join(projectDir,"train") 39 | valiDir = os.path.join(projectDir,"vali") 40 | testDir = os.path.join(projectDir,"test") 41 | networkDir = os.path.join(projectDir,"networks") 42 | vcfDir = os.path.join(projectDir,"splitVCFs") 43 | modelSave = os.path.join(networkDir,"model.json") 44 | weightsSave = os.path.join(networkDir,"weights.h5") 45 | 46 | 47 | ## Read in the window sizes 48 | maxSimS = 0 49 | wins=[] 50 | winFILE=os.path.join(networkDir,"windowSizes.txt") 51 | with open(winFILE, "r") as fIN: 52 | for line in fIN: 53 | ar=line.split() 54 | wins.append([ar[0],int(ar[1]),int(ar[2]),int(ar[3]),int(ar[4]),int(ar[5]),int(ar[6])]) 55 | maxSimS=max([maxSimS, int(ar[5])]) 56 | 57 | 58 | ## Loop through chromosomes and predict 59 | pred_resultFiles = [] 60 | for i in range(len(wins)): 61 | ## Read in the hdf5 62 | bn=os.path.basename(args.vcf) 63 | h5FILE=os.path.join(vcfDir,bn.replace(".vcf","_%s.hdf5" %(wins[i][0]))) 64 | print("""Importing HDF5: "%s"...""" %(h5FILE)) 65 | callset=h5py.File(h5FILE, mode="r") 66 | var=allel.VariantChunkedTable(callset["variants"],names=["CHROM","POS"], index="POS") 67 | chroms=var["CHROM"] 68 | pos=var["POS"] 69 | genos=allel.GenotypeChunkedArray(callset["calldata"]["GT"]) 70 | 71 | 72 | ## Identify padding required 73 | maxSegSites = 0 74 | for ds in [trainDir,valiDir,testDir]: 75 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 76 | segSitesInDs = max(DsInfoDir["segSites"]) 77 | maxSegSites = max(maxSegSites,segSitesInDs) 78 | maxSegSites = max(maxSegSites, maxSimS) 79 | 80 | 81 | ## Identify parameters used to train 82 | DsInfoDir = pickle.load(open(os.path.join(trainDir,"info.p"),"rb")) 83 | winLen=wins[i][2] 84 | numWins=wins[i][6] 85 | if args.bso: 86 | batchSize = args.bso 87 | else: 88 | batchSize = wins[i][6] 89 | 90 | 91 | ## Set network parameters 92 | bds_pred_params = { 93 | 'INFO':DsInfoDir, 94 | 'CHROM':chroms[0], 95 | 'winLen':winLen, 96 | 'numWins':numWins, 97 | 'IDs':get_index(pos,winLen), 98 | 'GT':genos, 99 | 'POS':pos, 100 | 'batchSize': batchSize, 101 | 'maxLen': maxSegSites, 102 | 'frameWidth': 5, 103 | 'sortInds':False, 104 | 'center':False, 105 | 'ancVal':-1, 106 | 'padVal':0, 107 | 'derVal':1, 108 | 'realLinePos':True, 109 | 'posPadVal':0, 110 | 'phase':args.phased, 111 | 'seed':args.seed 112 | } 113 | 114 | 115 | ### Define sequence batch generator 116 | vcf_gen = VCFBatchGenerator(**bds_pred_params) 117 | 118 | 119 | ## Load trained model and make predictions on VCF data 120 | pred_resultFile = os.path.join(projectDir,wins[i][0]+".CHPREDICT.txt") 121 | pred_resultFiles.append(pred_resultFile) 122 | load_and_predictVCF(VCFGenerator=vcf_gen, 123 | resultsFile=pred_resultFile, 124 | network=[modelSave,weightsSave], 125 | chromStr=wins[i][0], 126 | minS=args.minS, 127 | numWins=numWins, 128 | batchSize=batchSize, 129 | gpuID=args.gpuID) 130 | 131 | 132 | ## Combine chromosome predictions in whole genome prediction file and rm chromosome files 133 | genPredFILE=os.path.join(projectDir,bn.replace(".vcf",".PREDICT.txt")) 134 | ct=0 135 | with open(genPredFILE, "w") as fOUT: 136 | for f in pred_resultFiles: 137 | if ct==0: 138 | with open(f, "r") as fIN: 139 | for line in fIN: 140 | fOUT.write(line) 141 | else: 142 | with open(f, "r") as fIN: 143 | fIN.readline() 144 | for line in fIN: 145 | fOUT.write(line) 146 | ct+=1 147 | 
cmd="rm %s" %(f) 148 | os.system(cmd) 149 | 150 | 151 | print("\n***ReLERNN_PREDICT.py FINISHED!***\n") 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_BSCORRECT: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Performs a parametric bootstrap to assess any potential bias in recombination rate predictions. 4 | Corrects for this bias and adds 95% confidence intevals to the predictions 5 | """ 6 | 7 | 8 | from ReLERNN.imports import * 9 | from ReLERNN.helpers import * 10 | from ReLERNN.simulator import * 11 | from ReLERNN.sequenceBatchGenerator import * 12 | 13 | 14 | def ParametricBootStrap(simParameters, 15 | batchParameters, 16 | trainDir, 17 | network=None, 18 | slices=1000, 19 | repsPerSlice=1000, 20 | gpuID=0, 21 | tempDir="./Temp", 22 | out="./ParametricBootstrap.p", 23 | nCPU=1, 24 | seed=None): 25 | 26 | 27 | ''' 28 | This Function is for understanding network confidense 29 | over a range of rho, using a parametric bootstrap. 30 | 31 | SIDE NOTE: This will create a "temp" directory for filling 32 | writing and re-writing the test sets. 33 | after, it will destroy the tempDir. 34 | 35 | The basic idea being that we take a trained network, 36 | and iteritevly create test sets of simulation at steps which increase 37 | between fixed ranges of Rho. 38 | 39 | This function will output a pickle file containing 40 | a dictionary where the first 41 | 42 | This function will output a pickle file containing 43 | a dictionary where the ["rho"] key contains the slices 44 | between the values of rho where we simulate a test set, 45 | and test the trained model. 46 | 47 | The rest of the ket:value pairs in the dictionary contain 48 | the quartile information at each slice position for the 49 | distribution of test results 50 | ''' 51 | 52 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpuID) 53 | 54 | # load json and create model 55 | if(network != None): 56 | jsonFILE = open(network[0],"r") 57 | loadedModel = jsonFILE.read() 58 | jsonFILE.close() 59 | model=model_from_json(loadedModel) 60 | model.load_weights(network[1]) 61 | else: 62 | print("Error: no pretrained network found!") 63 | 64 | if not os.path.exists(tempDir): 65 | os.makedirs(tempDir) 66 | 67 | priorLowsRho = simParameters['priorLowsRho'] 68 | priorHighsRho = simParameters['priorHighsRho'] 69 | 70 | rhoDiff = (priorHighsRho - priorLowsRho)/slices 71 | IQR = {"rho":[],"Min":[],"CI95LO":[],"Q1":[],"Q2":[],"Q3":[],"CI95HI":[],"Max":[]} 72 | rho = [(priorLowsRho+(rhoDiff*i)) for i in range(slices)] 73 | IQR["rho"] = rho 74 | 75 | mean,sd,pad = getMeanSDMax(trainDir) 76 | 77 | for idx,r in enumerate(rho): 78 | print("Simulating slice ",idx," out of ",slices) 79 | 80 | params = copy.deepcopy(simParameters) 81 | params["priorLowsRho"] = r 82 | params["priorHighsRho"] = r 83 | params["seed"] = seed 84 | params.pop("bn", None) 85 | simulator = Simulator(**params) 86 | 87 | simulator.simulateAndProduceTrees(numReps=repsPerSlice, 88 | direc=tempDir, 89 | simulator="msprime", 90 | nProc=nCPU) 91 | 92 | batch_params = copy.deepcopy(batchParameters) 93 | batch_params['treesDirectory'] = tempDir 94 | batch_params['batchSize'] = repsPerSlice 95 | batch_params['shuffleExamples'] = False 96 | batch_params['seed'] = seed 97 | batchGenerator= SequenceBatchGenerator(**batch_params) 98 | 99 | x,y = batchGenerator.__getitem__(0) 100 | predictions = 
unNormalize(mean,sd,model.predict(x)) 101 | predictions = [p[0] for p in predictions] 102 | 103 | minP,maxP = min(predictions),max(predictions) 104 | quartiles = np.percentile(predictions,[2.5,25,50,75,97.5]) 105 | 106 | IQR["Min"].append(relu(minP)) 107 | IQR["Max"].append(relu(maxP)) 108 | IQR["CI95LO"].append(relu(quartiles[0])) 109 | IQR["Q1"].append(relu(quartiles[1])) 110 | IQR["Q2"].append(relu(quartiles[2])) 111 | IQR["Q3"].append(relu(quartiles[3])) 112 | IQR["CI95HI"].append(relu(quartiles[4])) 113 | 114 | del simulator 115 | del batchGenerator 116 | 117 | pickle.dump(IQR,open(out,"wb")) 118 | 119 | return rho,IQR 120 | 121 | 122 | def main(): 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 125 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=None) 126 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 127 | parser.add_argument('--gpuID',dest='gpuID',help='Identifier specifying which GPU to use', type=int, default=0) 128 | parser.add_argument('--nSlice',dest='nSlice',help='Number of recombination rate bins to simulate over', type=int, default=100) 129 | parser.add_argument('--nReps',dest='nReps',help='Number of simulations per step', type=int, default=1000) 130 | args = parser.parse_args() 131 | 132 | 133 | ## Set seed 134 | if args.seed: 135 | os.environ['PYTHONHASHSEED']=str(args.seed) 136 | random.seed(args.seed) 137 | np.random.seed(args.seed) 138 | 139 | 140 | ## Set number of cores 141 | if args.nCPU: 142 | nProc = args.nCPU 143 | else: 144 | nProc = mp.cpu_count() 145 | 146 | 147 | ## Set up the directory structure and output files 148 | if not args.outDir: 149 | print("Warning: No project directory found, using current working directory.") 150 | projectDir = os.getcwd() 151 | else: 152 | projectDir = args.outDir 153 | trainDir = os.path.join(projectDir,"train") 154 | valiDir = os.path.join(projectDir,"vali") 155 | testDir = os.path.join(projectDir,"test") 156 | networkDir = os.path.join(projectDir,"networks") 157 | bs_resultFile = os.path.join(networkDir,"bootstrapResults.p") 158 | bs_plotFile = os.path.join(networkDir,"bootstrapPlot.pdf") 159 | modelWeights = [os.path.join(networkDir,"model.json"),os.path.join(networkDir,"weights.h5")] 160 | bs_resultFile = os.path.join(networkDir,"bootstrapResults.p") 161 | bsDir = os.path.join(projectDir,"PBS") 162 | 163 | 164 | ## Load simulation and batch pars 165 | simParsFILE=os.path.join(networkDir,"simPars.p") 166 | batchParsFILE=os.path.join(networkDir,"batchPars.p") 167 | with open(simParsFILE, "rb") as fIN: 168 | simPars=pickle.load(fIN) 169 | with open(batchParsFILE, "rb") as fIN: 170 | batchPars=pickle.load(fIN) 171 | pred_resultFiles = [] 172 | for f in glob.glob(os.path.join(projectDir,"*.PREDICT.txt")): 173 | pred_resultFiles.append(f) 174 | if len(pred_resultFiles) < 1: 175 | print("Error: no .PREDICT.txt file found. 
You must run ReLERNN_PREDICT.py prior to running ReLERNN_BSCORRECT.py") 176 | sys.exit(1) 177 | elif len(pred_resultFiles) > 1: 178 | print("Error: multiple prediction files found.") 179 | sys.exit(1) 180 | pred_resultFile = pred_resultFiles[0] 181 | 182 | 183 | ## Run parametric bootstrap 184 | ParametricBootStrap( 185 | simPars, 186 | batchPars, 187 | trainDir, 188 | network=modelWeights, 189 | slices=args.nSlice, 190 | repsPerSlice=args.nReps, 191 | gpuID=args.gpuID, 192 | out=bs_resultFile, 193 | tempDir=bsDir, 194 | nCPU=nProc, 195 | seed=args.seed) 196 | 197 | 198 | ## Plot results from bootstrap 199 | plotParametricBootstrap(bs_resultFile,bs_plotFile) 200 | 201 | 202 | ## Load bootstrap values 203 | with open(bs_resultFile, "rb") as fIN: 204 | bs=pickle.load(fIN) 205 | 206 | 207 | ## Loop, correct, and write output 208 | correctedfile=pred_resultFile.replace(".txt", ".BSCORRECTED.txt") 209 | with open(correctedfile, "w") as fout, open(pred_resultFile, "r") as fin: 210 | for line in fin: 211 | if not line.startswith("chrom\t"): 212 | ar=line.split() 213 | rate=float(ar[4]) 214 | C=get_corrected(rate,bs) 215 | ar[4]=C[0] 216 | ar.extend([C[1],C[2]]) 217 | fout.write("\t".join([str(x) for x in ar])+"\n") 218 | else: 219 | #fout.write(line) 220 | fout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %("chrom","start","end","nSites","recombRate","CI95LO","CI95HI")) 221 | 222 | 223 | ## Remove the bootstrap tree files 224 | shutil.rmtree(bsDir) 225 | print("\n***ReLERNN_BSCORRECT.py FINISHED!***\n") 226 | 227 | 228 | if __name__ == "__main__": 229 | main() 230 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_SIMULATE_POOL: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Reads a POOL file, estimates some simulation parameters, and simulates via msprime. 4 | NOTE: This assumes that the user has previously QC'd and filtered the POOL. 5 | """ 6 | 7 | from ReLERNN.imports import * 8 | from ReLERNN.helpers import * 9 | from ReLERNN.manager import * 10 | from ReLERNN.simulator import * 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-p','--pool',dest='pool',help='Filtered and QC-checked pool file') 16 | parser.add_argument('--sampleDepth',dest='samD',help='Number of chromosomes in pool', type=int) 17 | parser.add_argument('-g','--genome',dest='genome',help='BED-formatted (i.e. zero-based) file corresponding to chromosomes and positions to evaluate') 18 | parser.add_argument('-m','--mask',dest='mask',help='BED-formatted file corresponding to inaccessible bases', default=None) 19 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. 
NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 20 | parser.add_argument('-n','--demographicHistory',dest='dem',help='Output file from either stairwayplot, SMC++, or MSMC',default=None) 21 | parser.add_argument('-u','--assumedMu',dest='mu',help='Assumed per-base mutation rate',type=float,default=1e-8) 22 | parser.add_argument('-l','--assumedGenTime',dest='genTime',help='Assumed generation time (in years)',type=float) 23 | parser.add_argument('-r','--upperRhoThetaRatio',dest='upRTR',help='Assumed upper bound for the ratio of rho to theta',type=float,default=1.0) 24 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=None) 25 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 26 | parser.add_argument('--maxSites',dest='winSizeMx',help='Max number of sites per window to train on. Important: too many sites causes problems in training (see README)!',type=int,default=1750) 27 | parser.add_argument('--forceWinSize',dest='forceWinSize',help='USED ONLY FOR TESTING, LEAVE AS DEFAULT',type=int,default=0) 28 | parser.add_argument('--maskThresh',dest='maskThresh',help='Discard windows where >= maskThresh percent of sites are inaccessible',type=float,default=1.0) 29 | parser.add_argument('--nTrain',dest='nTrain',help='Number of training examples to simulate',type=int,default=100000) 30 | parser.add_argument('--nVali',dest='nVali',help='Number of validation examples to simulate',type=int,default=1000) 31 | parser.add_argument('--nTest',dest='nTest',help='Number of test examples to simulate',type=int,default=1000) 32 | args = parser.parse_args() 33 | 34 | 35 | ## Set seed 36 | if args.seed: 37 | os.environ['PYTHONHASHSEED']=str(args.seed) 38 | random.seed(args.seed) 39 | np.random.seed(args.seed) 40 | 41 | 42 | ## Set number of cores 43 | if args.nCPU: 44 | nProc = args.nCPU 45 | else: 46 | nProc = mp.cpu_count() 47 | 48 | 49 | ## Ensure all required arguments are provided 50 | if not args.samD: 51 | print("Error: assumed sample depth must be provided") 52 | sys.exit(1) 53 | if not args.pool.endswith(".pool"): 54 | print('Error: POOL file must end in extension ".pool"') 55 | sys.exit(1) 56 | if not args.outDir: 57 | print("Warning: No project directory found, using current working directory.") 58 | projectDir = os.getcwd() 59 | else: 60 | projectDir = args.outDir 61 | if not args.mask: 62 | print("Warning: no accessibility mask found. All sites in the genome are assumed to be accessible.") 63 | if args.dem: 64 | demHist = check_demHist(args.dem) 65 | if demHist == -9: 66 | print("Error: demographicHistory file must be raw output from either stairwayplot, SMC++, or MSMC") 67 | sys.exit(1) 68 | if not args.genTime: 69 | print("Error: assumed generation time must be supplied when simulating under stairwayplot, SMC++, or MSMC") 70 | sys.exit(1) 71 | else: 72 | print("Warning: no demographic history file found. All training data will be simulated under demographic equilibrium.") 73 | demHist = 0 74 | 75 | 76 | ## Set up the directory structure to store the simulations data. 
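# The short pause below appears to give the user a moment to read the warnings
# printed above before setup continues. The directories created here mirror the
# VCF workflow: train/, vali/, test/, networks/, plus splitPOOLs/ for the
# per-chromosome pool files.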
77 | time.sleep(5) 78 | trainDir = os.path.join(projectDir,"train") 79 | valiDir = os.path.join(projectDir,"vali") 80 | testDir = os.path.join(projectDir,"test") 81 | networkDir = os.path.join(projectDir,"networks") 82 | poolDir = os.path.join(projectDir,"splitPOOLs") 83 | 84 | 85 | ## Make directories if they do not exist 86 | for p in [projectDir,trainDir,valiDir,testDir,networkDir,poolDir]: 87 | if not os.path.exists(p): 88 | os.makedirs(p) 89 | 90 | 91 | ## Read the genome file 92 | chromosomes = [] 93 | with open(args.genome, "r") as fIN: 94 | for line in fIN: 95 | ar = line.split() 96 | if len(ar)!=3: 97 | print("Error: genome file must be formatted as a bed file (i.e.'chromosome start end')") 98 | sys.exit(1) 99 | chromosomes.append("{}:{}-{}".format(ar[0],ar[1],ar[2])) 100 | 101 | 102 | ## Pass params to the manager 103 | manager_params = { 104 | 'pool':args.pool, 105 | 'mask':args.mask, 106 | 'winSizeMx':args.winSizeMx, 107 | 'forceWinSize':args.forceWinSize, 108 | 'chromosomes':chromosomes, 109 | 'poolDir':poolDir, 110 | 'projectDir':projectDir, 111 | 'networkDir':networkDir, 112 | 'seed':args.seed 113 | } 114 | pool_manager = Manager(**manager_params) 115 | 116 | 117 | ## Split the pool file 118 | pool_manager.splitPOOL(nProc=nProc) 119 | 120 | 121 | ## Calculate nSites per window 122 | wins, nSamps, maxS, maxLen = pool_manager.countSitesPOOL(samD=args.samD, nProc=nProc) 123 | 124 | 125 | ## Prepare the accessibility mask 126 | if args.mask: 127 | mask_fraction, win_masks = pool_manager.maskWins(wins=wins, maxLen=maxLen, nProc=nProc) 128 | else: 129 | mask_fraction, win_masks = 0.0, None 130 | 131 | 132 | ## Define parameters for msprime simulation 133 | print("Simulating with window size = {} bp.".format(maxLen)) 134 | a=0 135 | for i in range(nSamps-1): 136 | a+=1/(i+1) 137 | thetaW=maxS/a 138 | assumedMu = args.mu 139 | Ne=thetaW/(4.0 * assumedMu * ((1-mask_fraction) * maxLen)) 140 | rhoHi=assumedMu*args.upRTR 141 | if demHist: 142 | MspD = convert_demHist(args.dem, nSamps, args.genTime, demHist) 143 | dg_params = { 144 | 'priorLowsRho':0.0, 145 | 'priorHighsRho':rhoHi, 146 | 'priorLowsMu':assumedMu * 0.66, 147 | 'priorHighsMu':assumedMu * 1.33, 148 | 'ChromosomeLength':maxLen, 149 | 'winMasks':win_masks, 150 | 'maskThresh':args.maskThresh, 151 | 'MspDemographics':MspD, 152 | 'seed':args.seed 153 | } 154 | 155 | else: 156 | dg_params = {'N':nSamps, 157 | 'Ne':Ne, 158 | 'priorLowsRho':0.0, 159 | 'priorHighsRho':rhoHi, 160 | 'priorLowsMu':assumedMu * 0.66, 161 | 'priorHighsMu':assumedMu * 1.33, 162 | 'ChromosomeLength':maxLen, 163 | 'winMasks':win_masks, 164 | 'maskThresh':args.maskThresh, 165 | 'seed':args.seed 166 | } 167 | 168 | 169 | # Assign pars for each simulation 170 | dg_train = Simulator(**dg_params) 171 | dg_vali = Simulator(**dg_params) 172 | dg_test = Simulator(**dg_params) 173 | 174 | 175 | ## Dump simulation pars for use with parametric bootstrap 176 | simParsFILE=os.path.join(networkDir,"simPars.p") 177 | with open(simParsFILE, "wb") as fOUT: 178 | dg_params["bn"]=os.path.basename(args.pool).replace(".pool","") 179 | pickle.dump(dg_params,fOUT) 180 | 181 | 182 | ## Simulate data 183 | print("\nTraining set:") 184 | dg_train.simulateAndProduceTrees(numReps=args.nTrain,direc=trainDir,simulator="msprime",nProc=nProc) 185 | print("Validation set:") 186 | dg_vali.simulateAndProduceTrees(numReps=args.nVali,direc=valiDir,simulator="msprime",nProc=nProc) 187 | print("Test set:") 188 | 
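# A worked sketch of the Ne heuristic computed above (numbers are hypothetical):
# with nSamps = 20 chromosomes, a = sum_{i=1}^{19} 1/i ~= 3.548, so maxS = 1000
# segregating sites gives thetaW = 1000 / 3.548 ~= 281.9 per window. With
# assumedMu = 1e-8 and a fully accessible window of 1,000,000 bp,
# Ne = thetaW / (4 * mu * L) = 281.9 / (4 * 1e-8 * 1e6) ~= 7,047.
# Recombination rates are then drawn from [0, assumedMu * upperRhoThetaRatio]
# for the dg_train/dg_vali/dg_test Simulator objects.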
dg_test.simulateAndProduceTrees(numReps=args.nTest,direc=testDir,simulator="msprime",nProc=nProc) 189 | print("\nSIMULATIONS FINISHED!\n") 190 | 191 | 192 | ## Count number of segregating sites in simulation 193 | SS=[] 194 | maxSegSites = 0 195 | minSegSites = float("inf") 196 | for ds in [trainDir,valiDir,testDir]: 197 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 198 | SS.extend(DsInfoDir["segSites"]) 199 | segSitesInDs = max(DsInfoDir["segSites"]) 200 | segSitesInDsMin = min(DsInfoDir["segSites"]) 201 | maxSegSites = max(maxSegSites,segSitesInDs) 202 | minSegSites = min(minSegSites,segSitesInDsMin) 203 | 204 | 205 | ## Compare counts of segregating sites between simulations and input pool file 206 | print("SANITY CHECK") 207 | print("====================") 208 | print("numSegSites\t\t\tMin\tMean\tMax") 209 | print("Simulated:\t\t\t%s\t%s\t%s" %(minSegSites, int(sum(SS)/float(len(SS))), maxSegSites)) 210 | for i in range(len(wins)): 211 | print("Input %s:\t\t%s\t%s\t%s" %(wins[i][0],wins[i][3],wins[i][4],wins[i][5])) 212 | print("\n\n***ReLERNN_SIMULATE_POOL.py FINISHED!***\n") 213 | 214 | 215 | if __name__ == "__main__": 216 | main() 217 | -------------------------------------------------------------------------------- /ReLERNN/ReLERNN_SIMULATE: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Reads a VCF file, estimates some simulation parameters, and simulates via msprime. 4 | NOTE: This assumes that the user has previously QC'd and filtered the VCF. 5 | """ 6 | 7 | from ReLERNN.imports import * 8 | from ReLERNN.helpers import * 9 | from ReLERNN.manager import * 10 | from ReLERNN.simulator import * 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-v','--vcf',dest='vcf',help='Filtered and QC-checked VCF file. Important: Every row must correspond to a biallelic SNP') 16 | parser.add_argument('-g','--genome',dest='genome',help='BED-formatted (i.e. zero-based) file corresponding to chromosomes and positions to evaluate') 17 | parser.add_argument('-m','--mask',dest='mask',help='BED-formatted file corresponding to inaccessible bases', default=None) 18 | parser.add_argument('-d','--projectDir',dest='outDir',help='Directory for all project output. 
NOTE: the same projectDir must be used for all functions of ReLERNN',default=None) 19 | parser.add_argument('-n','--demographicHistory',dest='dem',help='Output file from either stairwayplot, SMC++, or MSMC',default=None) 20 | parser.add_argument('-u','--assumedMu',dest='mu',help='Assumed per-base mutation rate',type=float,default=1e-8) 21 | parser.add_argument('-l','--assumedGenTime',dest='genTime',help='Assumed generation time (in years)',type=float) 22 | parser.add_argument('-r','--upperRhoThetaRatio',dest='upRTR',help='Assumed upper bound for the ratio of rho to theta',type=float,default=1.0) 23 | parser.add_argument('-t','--nCPU',dest='nCPU',help='Number of CPUs to use',type=int,default=None) 24 | parser.add_argument('-s','--seed',dest='seed',help='Random seed',type=int,default=None) 25 | parser.add_argument('--phased',help='Treat genotypes as phased',default=False, action='store_true') 26 | parser.add_argument('--unphased',dest='phased',help='Treat genotypes as unphased',action='store_false') 27 | parser.add_argument('--forceDiploid',help='Treat all samples as diploids with missing data (bad idea; see README)',default=False, action='store_true') 28 | parser.add_argument('--phaseError',dest='phaseError',help='Fraction of bases simulated with incorrect phasing',type=float,default=0.0) 29 | parser.add_argument('--maxSites',dest='winSizeMx',help='Max number of sites per window to train on. Important: too many sites causes problems in training (see README)!',type=int,default=1750) 30 | parser.add_argument('--forceWinSize',dest='forceWinSize',help='USED ONLY FOR TESTING, LEAVE AS DEFAULT',type=int,default=0) 31 | parser.add_argument('--maskThresh',dest='maskThresh',help='Discard windows where >= maskThresh percent of sites are inaccessible',type=float,default=1.0) 32 | parser.add_argument('--nTrain',dest='nTrain',help='Number of training examples to simulate',type=int,default=100000) 33 | parser.add_argument('--nVali',dest='nVali',help='Number of validation examples to simulate',type=int,default=1000) 34 | parser.add_argument('--nTest',dest='nTest',help='Number of test examples to simulate',type=int,default=1000) 35 | args = parser.parse_args() 36 | 37 | 38 | ## Set seed 39 | if args.seed: 40 | os.environ['PYTHONHASHSEED']=str(args.seed) 41 | random.seed(args.seed) 42 | np.random.seed(args.seed) 43 | 44 | 45 | ## Set number of cores 46 | if args.nCPU: 47 | nProc = args.nCPU 48 | else: 49 | nProc = mp.cpu_count() 50 | 51 | 52 | ## Ensure all required arguments are provided 53 | if not args.vcf.endswith(".vcf"): 54 | print('Error: VCF file must end in extension ".vcf"') 55 | sys.exit(1) 56 | if not args.outDir: 57 | print("Warning: No project directory found, using current working directory.") 58 | projectDir = os.getcwd() 59 | else: 60 | projectDir = args.outDir 61 | if not args.mask: 62 | print("Warning: no accessibility mask found. All sites in the genome are assumed to be accessible.") 63 | if args.dem: 64 | demHist = check_demHist(args.dem) 65 | if demHist == -9: 66 | print("Error: demographicHistory file must be raw output from either stairwayplot, SMC++, or MSMC") 67 | print("If using SMC++, file must be in *.csv format (use option -c in SMC++)") 68 | sys.exit(1) 69 | if not args.genTime: 70 | print("Error: assumed generation time must be supplied when simulating under stairwayplot, SMC++, or MSMC") 71 | sys.exit(1) 72 | else: 73 | print("Warning: no demographic history file found. 
All training data will be simulated under demographic equilibrium.") 74 | demHist = 0 75 | if not args.phased and args.phaseError != 0.0: 76 | print("Error: non-zero 'phaseError' cannot be used in conjunction with '--unphased'") 77 | sys.exit(1) 78 | if args.forceDiploid: 79 | print("Warning: all haploid/hemizygous samples will be treated as diploid samples with missing data!\n", 80 | "If you want to treat haploid/hemizygous samples and haploids without missing data, quit now, ensure no diploid samples are found in this VCF, and rerun without the option `--forceDiploid`.") 81 | time.sleep(10) 82 | else: 83 | time.sleep(5) 84 | 85 | 86 | ## Set up the directory structure to store the simulations data. 87 | trainDir = os.path.join(projectDir,"train") 88 | valiDir = os.path.join(projectDir,"vali") 89 | testDir = os.path.join(projectDir,"test") 90 | networkDir = os.path.join(projectDir,"networks") 91 | vcfDir = os.path.join(projectDir,"splitVCFs") 92 | 93 | 94 | ## Make directories if they do not exist 95 | for p in [projectDir,trainDir,valiDir,testDir,networkDir,vcfDir]: 96 | if not os.path.exists(p): 97 | os.makedirs(p) 98 | 99 | 100 | ## Read the genome file 101 | chromosomes = [] 102 | with open(args.genome, "r") as fIN: 103 | for line in fIN: 104 | ar = line.split() 105 | if len(ar)!=3: 106 | print("Error: genome file must be formatted as a bed file (i.e.'chromosome start end')") 107 | sys.exit(1) 108 | chromosomes.append("{}:{}-{}".format(ar[0],ar[1],ar[2])) 109 | 110 | 111 | ## Pass params to the vcf manager 112 | manager_params = { 113 | 'vcf':args.vcf, 114 | 'mask':args.mask, 115 | 'winSizeMx':args.winSizeMx, 116 | 'forceWinSize':args.forceWinSize, 117 | 'forceDiploid':args.forceDiploid, 118 | 'chromosomes':chromosomes, 119 | 'vcfDir':vcfDir, 120 | 'projectDir':projectDir, 121 | 'networkDir':networkDir, 122 | 'seed':args.seed 123 | } 124 | vcf_manager = Manager(**manager_params) 125 | 126 | 127 | ## Split the VCF file 128 | vcf_manager.splitVCF(nProc=nProc) 129 | 130 | 131 | ## Calculate nSites per window 132 | wins, nSamps, maxS, maxLen = vcf_manager.countSites(nProc=nProc) 133 | 134 | 135 | ## Prepare the accessibility mask 136 | if args.mask: 137 | mask_fraction, win_masks = vcf_manager.maskWins(wins=wins, maxLen=maxLen, nProc=nProc) 138 | else: 139 | mask_fraction, win_masks = 0.0, None 140 | 141 | 142 | ## Prepare the missing data mask 143 | md_mask, mask_files = None, [] 144 | for FILE in glob.glob(os.path.join(vcfDir, "*_md_mask.hdf5")): 145 | mask_files.append(FILE) 146 | md_mask = [] 147 | for FILE in mask_files: 148 | print("Reading HDF5 mask: {}...".format(FILE)) 149 | with h5py.File(FILE, "r") as hf: 150 | md_mask.append(hf["mask"][:]) 151 | if md_mask: 152 | md_mask = np.concatenate(md_mask) 153 | 154 | 155 | ## Define parameters for msprime simulation 156 | print("Simulating with window size = {} bp.".format(maxLen)) 157 | a=0 158 | for i in range(nSamps-1): 159 | a+=1/(i+1) 160 | thetaW=maxS/a 161 | assumedMu = args.mu 162 | Ne=thetaW/(4.0 * assumedMu * ((1-mask_fraction) * maxLen)) 163 | rhoHi=assumedMu*args.upRTR 164 | if demHist: 165 | MspD = convert_demHist(args.dem, nSamps, args.genTime, demHist, assumedMu) 166 | dg_params = { 167 | 'priorLowsRho':0.0, 168 | 'priorHighsRho':rhoHi, 169 | 'priorLowsMu':assumedMu * 0.66, 170 | 'priorHighsMu':assumedMu * 1.33, 171 | 'ChromosomeLength':maxLen, 172 | 'winMasks':win_masks, 173 | 'mdMask':md_mask, 174 | 'maskThresh':args.maskThresh, 175 | 'phased':args.phased, 176 | 'phaseError':args.phaseError, 177 | 
'MspDemographics':MspD, 178 | 'seed':args.seed 179 | } 180 | 181 | else: 182 | dg_params = {'N':nSamps, 183 | 'Ne':Ne, 184 | 'priorLowsRho':0.0, 185 | 'priorHighsRho':rhoHi, 186 | 'priorLowsMu':assumedMu * 0.66, 187 | 'priorHighsMu':assumedMu * 1.33, 188 | 'ChromosomeLength':maxLen, 189 | 'winMasks':win_masks, 190 | 'mdMask':md_mask, 191 | 'maskThresh':args.maskThresh, 192 | 'phased':args.phased, 193 | 'phaseError':args.phaseError, 194 | 'seed':args.seed 195 | } 196 | 197 | 198 | # Assign pars for each simulation 199 | dg_train = Simulator(**dg_params) 200 | dg_vali = Simulator(**dg_params) 201 | dg_test = Simulator(**dg_params) 202 | 203 | 204 | ## Dump simulation pars for use with parametric bootstrap 205 | simParsFILE=os.path.join(networkDir,"simPars.p") 206 | with open(simParsFILE, "wb") as fOUT: 207 | dg_params["bn"]=os.path.basename(args.vcf).replace(".vcf","") 208 | pickle.dump(dg_params,fOUT) 209 | 210 | 211 | ## Simulate data 212 | print("Training set:") 213 | dg_train.simulateAndProduceTrees(numReps=args.nTrain,direc=trainDir,simulator="msprime",nProc=nProc) 214 | print("Validation set:") 215 | dg_vali.simulateAndProduceTrees(numReps=args.nVali,direc=valiDir,simulator="msprime",nProc=nProc) 216 | print("Test set:") 217 | dg_test.simulateAndProduceTrees(numReps=args.nTest,direc=testDir,simulator="msprime",nProc=nProc) 218 | print("\nSIMULATIONS FINISHED!\n") 219 | 220 | 221 | ## Count number of segregating sites in simulation 222 | SS=[] 223 | maxSegSites = 0 224 | minSegSites = float("inf") 225 | for ds in [trainDir,valiDir,testDir]: 226 | DsInfoDir = pickle.load(open(os.path.join(ds,"info.p"),"rb")) 227 | SS.extend(DsInfoDir["segSites"]) 228 | segSitesInDs = max(DsInfoDir["segSites"]) 229 | segSitesInDsMin = min(DsInfoDir["segSites"]) 230 | maxSegSites = max(maxSegSites,segSitesInDs) 231 | minSegSites = min(minSegSites,segSitesInDsMin) 232 | 233 | 234 | ## Compare counts of segregating sites between simulations and input VCF 235 | print("SANITY CHECK") 236 | print("====================") 237 | print("numSegSites\t\t\tMin\tMean\tMax") 238 | print("Simulated:\t\t\t%s\t%s\t%s" %(minSegSites, int(sum(SS)/float(len(SS))), maxSegSites)) 239 | for i in range(len(wins)): 240 | print("InputVCF %s:\t\t%s\t%s\t%s" %(wins[i][0],wins[i][3],wins[i][4],wins[i][5])) 241 | print("\n\n***ReLERNN_SIMULATE.py FINISHED!***\n") 242 | 243 | 244 | if __name__ == "__main__": 245 | main() 246 | -------------------------------------------------------------------------------- /ReLERNN/simulator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Jared Galloway, Jeff Adrion 3 | 4 | ''' 5 | 6 | from ReLERNN.imports import * 7 | from ReLERNN.helpers import * 8 | 9 | MAX_SEED = int(2 ** 32 - 1) # maximum allowed seed in msprime 10 | 11 | class Simulator(object): 12 | ''' 13 | 14 | The simulator class is a framework for running N simulations 15 | using Either msprime (coalescent) or SLiM (forward-moving) 16 | in parallel using python's multithreading package. 17 | 18 | With Specified parameters, the class Simulator() populates 19 | a directory with training, validation, and testing datasets. 20 | It stores the the treeSequences resulting from each simulation 21 | in a subdirectory respectfully labeled 'i.trees' where i is the 22 | i^th simulation. 23 | 24 | Included with each dataset this class produces an info.p 25 | in the subdirectory. 
This uses pickle to store a dictionary 26 | containing all the information for each simulation including the random 27 | target parameter which will be extracted for training. 28 | 29 | ''' 30 | 31 | def __init__(self, 32 | N = 2, 33 | Ne = 1e2, 34 | priorLowsRho = 0.0, 35 | priorLowsMu = 0.0, 36 | priorHighsRho = 1e-7, 37 | priorHighsMu = 1e-8, 38 | ChromosomeLength = 1e5, 39 | MspDemographics = None, 40 | winMasks = None, 41 | mdMask = None, 42 | maskThresh = 1.0, 43 | phased = None, 44 | phaseError = None, 45 | hotspots = False, 46 | nHotWins = 10, 47 | seed = None 48 | ): 49 | 50 | self.N = N 51 | self.Ne = Ne 52 | self.priorLowsRho = priorLowsRho 53 | self.priorHighsRho = priorHighsRho 54 | self.priorLowsMu = priorLowsMu 55 | self.priorHighsMu = priorHighsMu 56 | self.ChromosomeLength = ChromosomeLength 57 | self.MspDemographics = MspDemographics 58 | self.rho = None 59 | self.hotWin = None 60 | self.mu = None 61 | self.segSites = None 62 | self.winMasks = winMasks 63 | self.mdMask = mdMask 64 | self.maskThresh = maskThresh 65 | self.phased = phased 66 | self.phaseError = phaseError 67 | self.hotspots = hotspots 68 | self.nHotWins = nHotWins 69 | self.seed = seed 70 | 71 | 72 | if self.seed: 73 | os.environ['PYTHONHASHSEED']=str(self.seed) 74 | random.seed(self.seed) 75 | np.random.seed(self.seed) 76 | 77 | 78 | def runOneMsprimeSim(self,simNum,direc): 79 | ''' 80 | run one msprime simulation and put the corresponding treeSequence in treesOutputFilePath 81 | 82 | (str,float,float)->None 83 | ''' 84 | 85 | MR = self.mu[simNum] 86 | RR = self.rho[simNum] 87 | SEED = self.seed[simNum] 88 | os.environ['PYTHONHASHSEED']=str(SEED) 89 | random.seed(SEED) 90 | np.random.seed(SEED) 91 | 92 | if self.hotspots: 93 | hotspotMultiplier = self.hotWin[simNum] 94 | 95 | mapName = str(simNum) + "_map.txt" 96 | mapPath = os.path.join(direc,mapName) 97 | 98 | nWins = self.nHotWins 99 | hotSpotWin = np.random.randint(nWins) 100 | 101 | winRates = np.empty(nWins) 102 | 103 | breaks = np.linspace(0,self.ChromosomeLength, num = nWins + 1) 104 | with open(mapPath, "w") as fOUT: 105 | fOUT.write("Chromosome\tstartPos\tRate\n") 106 | for i in range(len(breaks)): 107 | if i == hotSpotWin: 108 | baseRate = RR * hotspotMultiplier * 10**8 109 | winRates[i] = baseRate 110 | elif i == nWins: 111 | baseRate = 0.0 112 | else: 113 | baseRate = RR * 10**8 114 | winRates[i] = baseRate 115 | fOUT.write("{}\t{}\t{}\n".format("chr",int(breaks[i]),baseRate)) 116 | 117 | recomb_map = msp.RecombinationMap.read_hapmap(mapPath) 118 | 119 | if self.MspDemographics: 120 | DE = self.MspDemographics["demographic_events"] 121 | PC = self.MspDemographics["population_configurations"] 122 | MM = self.MspDemographics["migration_matrix"] 123 | ts = msp.simulate( 124 | random_seed=SEED, 125 | mutation_rate=MR, 126 | population_configurations = PC, 127 | migration_matrix = MM, 128 | demographic_events = DE, 129 | recombination_map = recomb_map 130 | ) 131 | 132 | else: 133 | ts = msp.simulate( 134 | random_seed = SEED, 135 | sample_size = self.N, 136 | Ne = self.Ne, 137 | mutation_rate=MR, 138 | recombination_map = recomb_map 139 | ) 140 | 141 | else: 142 | if self.MspDemographics: 143 | DE = self.MspDemographics["demographic_events"] 144 | PC = self.MspDemographics["population_configurations"] 145 | MM = self.MspDemographics["migration_matrix"] 146 | ts = msp.simulate( 147 | random_seed=SEED, 148 | length=self.ChromosomeLength, 149 | mutation_rate=MR, 150 | recombination_rate=RR, 151 | population_configurations = PC, 152 | migration_matrix 
= MM, 153 | demographic_events = DE 154 | ) 155 | else: 156 | ts = msp.simulate( 157 | random_seed = SEED, 158 | sample_size = self.N, 159 | Ne = self.Ne, 160 | length=self.ChromosomeLength, 161 | mutation_rate=MR, 162 | recombination_rate=RR 163 | ) 164 | 165 | # Convert tree sequence to genotype matrix, and position matrix 166 | H = ts.genotype_matrix() 167 | P = np.array([s.position for s in ts.sites()],dtype='float32') 168 | 169 | # "Unphase" genotypes 170 | if not self.phased: 171 | np.random.shuffle(np.transpose(H)) 172 | 173 | # Simulate phasing error 174 | if self.phaseError: 175 | H = self.phaseErrorer(H,self.phaseError) 176 | 177 | # If there is a missing data mask, sample from the mask and apply to haps 178 | if not self.mdMask is None: 179 | mdMask = self.mdMask[np.random.choice(self.mdMask.shape[0], H.shape[0], replace=True)] 180 | H = np.ma.masked_array(H, mask=mdMask) 181 | H = np.ma.filled(H,2) 182 | 183 | # Sample from the genome-wide distribution of masks and mask both positions and genotypes 184 | if self.winMasks: 185 | while True: 186 | rand_mask = self.winMasks[random.randint(0,len(self.winMasks)-1)] 187 | if rand_mask[0] < self.maskThresh: 188 | break 189 | if rand_mask[0] > 0.0: 190 | H,P = self.maskGenotypes(H, P, rand_mask) 191 | 192 | # Dump 193 | Hname = str(simNum) + "_haps.npy" 194 | Hpath = os.path.join(direc,Hname) 195 | Pname = str(simNum) + "_pos.npy" 196 | Ppath = os.path.join(direc,Pname) 197 | np.save(Hpath,H) 198 | np.save(Ppath,P) 199 | 200 | # Return number of sites 201 | return H.shape[0] 202 | 203 | 204 | def maskGenotypes(self, H, P, rand_mask): 205 | """ 206 | Return the genotype and position matrices where masked sites have been removed 207 | """ 208 | mask_wins = np.array(rand_mask[1]) 209 | mask_wins = np.reshape(mask_wins, 2 * mask_wins.shape[0]) 210 | mask = np.digitize(P, mask_wins) % 2 == 0 211 | return H[mask], P[mask] 212 | 213 | 214 | def phaseErrorer(self, H, rate): 215 | """ 216 | Returns the genotype matrix where some fraction of sites have shuffled samples 217 | """ 218 | H_shuf = copy.deepcopy(H) 219 | np.random.shuffle(np.transpose(H_shuf)) 220 | H_mask = np.random.choice([True,False], H.shape[0], p = [1-rate,rate]) 221 | H_mask = np.repeat(H_mask, H.shape[1]) 222 | H_mask = H_mask.reshape(H.shape) 223 | return np.where(H_mask,H,H_shuf) 224 | 225 | 226 | def simulateAndProduceTrees(self,direc,numReps,simulator,nProc=1): 227 | ''' 228 | determine which simulator to use then populate 229 | 230 | (str,str) -> None 231 | ''' 232 | 233 | if self.hotspots: 234 | self.hotWin=np.zeros(numReps) 235 | for i in range(int(numReps/2.0)): 236 | randomTargetParameter = np.random.uniform(50,50) 237 | self.hotWin[i] = randomTargetParameter 238 | for i in range(int(numReps/2.0),numReps): 239 | randomTargetParameter = np.random.uniform(1,1) 240 | self.hotWin[i] = randomTargetParameter 241 | 242 | self.rho=np.empty(numReps) 243 | for i in range(numReps): 244 | randomTargetParameter = np.random.uniform(self.priorLowsRho,self.priorHighsRho) 245 | self.rho[i] = randomTargetParameter 246 | 247 | self.mu=np.empty(numReps) 248 | for i in range(numReps): 249 | randomTargetParameter = np.random.uniform(self.priorLowsMu,self.priorHighsMu) 250 | self.mu[i] = randomTargetParameter 251 | 252 | if self.seed is None: 253 | self.seed=np.repeat(self.seed, numReps) 254 | else: 255 | self.seed=np.random.randint(0, MAX_SEED, size=(numReps,)) 256 | 257 | try: 258 | assert((simulator=='msprime') | (simulator=='SLiM')) 259 | except: 260 | print("Sorry, only 'msprime' & 
'SLiM' are supported simulators") 261 | exit() 262 | 263 | #Pretty straitforward, create the directory passed if it doesn't exits 264 | if not os.path.exists(direc): 265 | print("directory '",direc,"' does not exist, creating it") 266 | os.makedirs(direc) 267 | 268 | # partition data for multiprocessing 269 | mpID = range(numReps) 270 | task_q = mp.JoinableQueue() 271 | result_q = mp.Queue() 272 | params=[simulator, direc] 273 | 274 | # do the work 275 | print("Simulate...") 276 | pids = create_procs(nProc, task_q, result_q, params, self.worker_simulate) 277 | assign_task(mpID, task_q, nProc) 278 | try: 279 | task_q.join() 280 | except KeyboardInterrupt: 281 | print("KeyboardInterrupt") 282 | sys.exit(0) 283 | 284 | self.segSites=np.empty(numReps,dtype="int64") 285 | for i in range(result_q.qsize()): 286 | item = result_q.get() 287 | self.segSites[item[0]]=item[1] 288 | 289 | self.__dict__["numReps"] = numReps 290 | infofile = open(os.path.join(direc,"info.p"),"wb") 291 | pickle.dump(self.__dict__,infofile) 292 | infofile.close() 293 | 294 | for p in pids: 295 | p.terminate() 296 | return None 297 | 298 | 299 | def worker_simulate(self, task_q, result_q, params): 300 | while True: 301 | try: 302 | mpID, nth_job = task_q.get() 303 | #unpack parameters 304 | simulator, direc = params 305 | for i in mpID: 306 | result_q.put([i,self.runOneMsprimeSim(i,direc)]) 307 | finally: 308 | task_q.task_done() 309 | -------------------------------------------------------------------------------- /ReLERNN/manager.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Jeff Adrion 3 | 4 | ''' 5 | 6 | from ReLERNN.imports import * 7 | from ReLERNN.helpers import * 8 | 9 | class Manager(object): 10 | ''' 11 | 12 | The manager class is a framework for handling both VCFs and masks 13 | and can multi-process many of the functions orginally found in ReLERNN_SIMULATE 14 | 15 | ''' 16 | 17 | 18 | def __init__(self, 19 | vcf = None, 20 | pool = None, 21 | chromosomes = None, 22 | mask = None, 23 | winSizeMx = None, 24 | forceWinSize = None, 25 | forceDiploid = None, 26 | vcfDir = None, 27 | poolDir = None, 28 | projectDir = None, 29 | networkDir = None, 30 | seed = None 31 | ): 32 | 33 | self.vcf = vcf 34 | self.pool = pool 35 | self.chromosomes = chromosomes 36 | self.mask = mask 37 | self.winSizeMx = winSizeMx 38 | self.forceWinSize = forceWinSize 39 | self.forceDiploid = forceDiploid 40 | self.vcfDir = vcfDir 41 | self.poolDir = poolDir 42 | self.projectDir = projectDir 43 | self.networkDir = networkDir 44 | self.seed = seed 45 | 46 | 47 | if self.seed: 48 | os.environ['PYTHONHASHSEED']=str(self.seed) 49 | random.seed(self.seed) 50 | np.random.seed(self.seed) 51 | 52 | 53 | def splitVCF(self,nProc=1): 54 | ''' 55 | split the vcf into seperate files by chromosome 56 | ''' 57 | # partition for multiprocessing 58 | mpID = range(len(self.chromosomes)) 59 | task_q = mp.JoinableQueue() 60 | result_q = mp.Queue() 61 | params=self.vcfDir, self.vcf, self.chromosomes 62 | 63 | # do the work 64 | pids = create_procs(nProc, task_q, result_q, params, self.worker_splitVCF) 65 | assign_task(mpID, task_q, nProc) 66 | try: 67 | task_q.join() 68 | except KeyboardInterrupt: 69 | print("KeyboardInterrupt") 70 | sys.exit(0) 71 | 72 | return None 73 | 74 | 75 | def worker_splitVCF(self, task_q, result_q, params): 76 | while True: 77 | try: 78 | mpID, nth_job = task_q.get() 79 | vcfDir, vcf, chroms = params 80 | for i in mpID: 81 | chrom = chroms[i].split(":")[0] 82 | start = 
int(chroms[i].split(":")[1].split("-")[0])+1 83 | end = int(chroms[i].split(":")[1].split("-")[1])+1 84 | splitVCF=os.path.join(vcfDir, os.path.basename(vcf).replace(".vcf","_%s.vcf" %(chroms[i]))) 85 | print("Split chromosome: %s..." %(chrom)) 86 | with open(vcf, "r") as fIN, open(splitVCF, "w") as fOUT: 87 | for line in fIN: 88 | if line.startswith("#"): 89 | fOUT.write(line) 90 | if line.startswith("%s\t" %(chrom)): 91 | pos = int(line.split()[1]) 92 | if start <= pos <= end: 93 | fOUT.write(line) 94 | print("Converting %s to HDF5..." %(splitVCF)) 95 | h5FILE=splitVCF.replace(".vcf",".hdf5") 96 | allel.vcf_to_hdf5(splitVCF,h5FILE,fields="*",overwrite=True) 97 | os.system("rm %s" %(splitVCF)) 98 | finally: 99 | task_q.task_done() 100 | 101 | 102 | def splitPOOL(self,nProc=1): 103 | ''' 104 | split the pool file into seperate files by chromosome 105 | ''' 106 | # partition for multiprocessing 107 | mpID = range(len(self.chromosomes)) 108 | task_q = mp.JoinableQueue() 109 | result_q = mp.Queue() 110 | params=self.poolDir, self.pool, self.chromosomes 111 | 112 | # do the work 113 | pids = create_procs(nProc, task_q, result_q, params, self.worker_splitPOOL) 114 | assign_task(mpID, task_q, nProc) 115 | try: 116 | task_q.join() 117 | except KeyboardInterrupt: 118 | print("KeyboardInterrupt") 119 | sys.exit(0) 120 | 121 | return None 122 | 123 | 124 | def worker_splitPOOL(self, task_q, result_q, params): 125 | while True: 126 | try: 127 | mpID, nth_job = task_q.get() 128 | poolDir, pool, chroms = params 129 | for i in mpID: 130 | chrom = chroms[i].split(":")[0] 131 | start = int(chroms[i].split(":")[1].split("-")[0])+1 132 | end = int(chroms[i].split(":")[1].split("-")[1])+1 133 | splitPOOL=os.path.join(poolDir, os.path.basename(pool).replace(".pool","_%s.pool" %(chroms[i]))) 134 | print("Split chromosome: %s..." 
%(chrom)) 135 | with open(pool, "r") as fIN, open(splitPOOL, "w") as fOUT: 136 | for line in fIN: 137 | if line.startswith("%s\t" %(chrom)): 138 | pos = int(line.split()[1]) 139 | if start <= pos <= end: 140 | fOUT.write(line) 141 | finally: 142 | task_q.task_done() 143 | 144 | 145 | def countSites(self, nProc=1): 146 | # partition for multiprocessing 147 | mpID = range(len(self.chromosomes)) 148 | task_q = mp.JoinableQueue() 149 | result_q = mp.Queue() 150 | params=self.chromosomes 151 | 152 | # do the work 153 | pids = create_procs(nProc, task_q, result_q, params, self.worker_countSites) 154 | assign_task(mpID, task_q, nProc) 155 | try: 156 | task_q.join() 157 | except KeyboardInterrupt: 158 | print("KeyboardInterrupt") 159 | sys.exit(0) 160 | 161 | wins = [] 162 | for i in range(result_q.qsize()): 163 | item = result_q.get() 164 | wins.append(item) 165 | 166 | nSamps,maxS,maxLen = [],0,0 167 | sorted_wins = [] 168 | winFILE=os.path.join(self.networkDir,"windowSizes.txt") 169 | with open(winFILE, "w") as fOUT: 170 | for chrom in self.chromosomes: 171 | for win in wins: 172 | if win[0] == chrom: 173 | maxS = max(maxS,win[4]) 174 | maxLen = max(maxLen,win[2]) 175 | nSamps.append(win[1]) 176 | sorted_wins.append(win) 177 | fOUT.write("\t".join([str(x) for x in win])+"\n") 178 | if len(set(nSamps)) != 1: 179 | print("\nError: chromosomes have different sample sizes!\n") 180 | print("chromosome\t\tnum_samples (-9 when n varies between samples)") 181 | for chrom in self.chromosomes: 182 | for win in wins: 183 | if win[0] == chrom: 184 | print("%s\t\t%s"%(chrom.split(":")[0],win[1])) 185 | print("\nAll samples can be treated as 'diploids with missing data' by rerunning with the option `--forceDiploid`, however this is probably a bad idea (see README.md).") 186 | sys.exit(1) 187 | 188 | return sorted_wins, nSamps[0], maxS, maxLen 189 | 190 | 191 | def worker_countSites(self, task_q, result_q, params): 192 | while True: 193 | try: 194 | mpID, nth_job = task_q.get() 195 | chromosomes = params 196 | for i in mpID: 197 | h5FILE=os.path.join(self.vcfDir, os.path.basename(self.vcf).replace(".vcf","_%s.hdf5" %(chromosomes[i]))) 198 | print("""Reading HDF5: "%s"...""" %(h5FILE)) 199 | callset=h5py.File(h5FILE, mode="r") 200 | var=allel.VariantChunkedTable(callset["variants"],names=["CHROM","POS"], index="POS") 201 | chroms=var["CHROM"] 202 | pos=var["POS"] 203 | genos=allel.GenotypeChunkedArray(callset["calldata"]["GT"]) 204 | GT=genos.to_haplotypes() 205 | diploid_check=[] 206 | for n in range(1,len(genos[0]),2): 207 | GTB=GT[:,n:n+1] 208 | if np.unique(GTB).shape[0] == 1 and np.unique(GTB)[0] == -1: 209 | diploid_check.append(0) 210 | else: 211 | diploid_check.append(1) 212 | if 1 in diploid_check or self.forceDiploid: 213 | GT=np.array(GT) 214 | nSamps=len(genos[0])*2 215 | else: 216 | nSamps=len(genos[0]) 217 | GT=GT[:,::2] #Select only the first of the genotypes 218 | if 0 in diploid_check and 1 in diploid_check and not self.forceDiploid: 219 | print("\nError: Both haploid and diploid samples present in %s!"%(chromosomes[i].split(":")[0])) 220 | nSamps=-9 221 | 222 | ## if there is any missing data write a missing data boolean mask to hdf5 223 | md_mask = GT < 0 224 | if md_mask.any(): 225 | md_maskFile=os.path.join(self.vcfDir, os.path.basename(self.vcf).replace(".vcf","_%s_md_mask.hdf5" %(chromosomes[i]))) 226 | with h5py.File(md_maskFile, "w") as hf: 227 | hf.create_dataset("mask", data=md_mask) 228 | 229 | ## Find best window size 230 | if self.forceWinSize != 0: 231 | ip = 
force_win_size(self.forceWinSize,pos) 232 | result_q.put([chromosomes[i],nSamps,ip[0],ip[1],ip[2],ip[3],ip[4]]) 233 | else: 234 | lo, hi = 0, round(int(chromosomes[i].split(":")[-1].split("-")[-1]),-3) 235 | D = hi - lo 236 | target = lo + int((hi - lo)/2.0) 237 | while D > 10: 238 | ip = find_win_size(target,pos,self.winSizeMx) 239 | if len(ip) != 5: 240 | if ip[0] < 0: 241 | hi = target 242 | if ip[0] > 0: 243 | lo = target 244 | target = lo + int((hi - lo)/2.0) 245 | else: 246 | break 247 | D = hi - lo 248 | ip = force_win_size(round(target, -3), pos) 249 | result_q.put([chromosomes[i],nSamps,ip[0],ip[1],ip[2],ip[3],ip[4]]) 250 | finally: 251 | task_q.task_done() 252 | 253 | 254 | def countSitesPOOL(self, samD=0, nProc=1): 255 | # partition for multiprocessing 256 | mpID = range(len(self.chromosomes)) 257 | task_q = mp.JoinableQueue() 258 | result_q = mp.Queue() 259 | params=self.chromosomes 260 | 261 | # do the work 262 | pids = create_procs(nProc, task_q, result_q, params, self.worker_countSitesPOOL) 263 | assign_task(mpID, task_q, nProc) 264 | try: 265 | task_q.join() 266 | except KeyboardInterrupt: 267 | print("KeyboardInterrupt") 268 | sys.exit(0) 269 | 270 | wins = [] 271 | for i in range(result_q.qsize()): 272 | item = result_q.get() 273 | wins.append(item) 274 | 275 | nSamps,maxS,maxLen = [],0,0 276 | sorted_wins = [] 277 | winFILE=os.path.join(self.networkDir,"windowSizes.txt") 278 | with open(winFILE, "w") as fOUT: 279 | for chrom in self.chromosomes: 280 | for win in wins: 281 | if win[0] == chrom: 282 | maxS = max(maxS,win[3]) 283 | maxLen = max(maxLen,win[1]) 284 | win.insert(1,samD) 285 | nSamps.append(samD) 286 | sorted_wins.append(win) 287 | fOUT.write("\t".join([str(x) for x in win])+"\n") 288 | return sorted_wins, nSamps[0], maxS, maxLen 289 | 290 | 291 | def worker_countSitesPOOL(self, task_q, result_q, params): 292 | while True: 293 | try: 294 | mpID, nth_job = task_q.get() 295 | chromosomes = params 296 | for i in mpID: 297 | pos = [] 298 | poolFILE=os.path.join(self.poolDir, os.path.basename(self.pool).replace(".pool","_%s.pool" %(chromosomes[i]))) 299 | print("poolFILE:",poolFILE) 300 | with open(poolFILE, "r") as fIN: 301 | for line in fIN: 302 | pos.append(int(line.split()[1])) 303 | pos=np.array(pos) 304 | 305 | ## Find best window size 306 | if self.forceWinSize != 0: 307 | ip = force_win_size(self.forceWinSize,pos) 308 | result_q.put([chromosomes[i],ip[0],ip[1],ip[2],ip[3],ip[4]]) 309 | else: 310 | lo, hi = 0, round(int(chromosomes[i].split(":")[-1].split("-")[-1]),-3) 311 | D = hi - lo 312 | target = lo + int((hi - lo)/2.0) 313 | while D > 10: 314 | ip = find_win_size(target,pos,self.winSizeMx) 315 | if len(ip) != 5: 316 | if ip[0] < 0: 317 | hi = target 318 | if ip[0] > 0: 319 | lo = target 320 | target = lo + int((hi - lo)/2.0) 321 | else: 322 | break 323 | D = hi - lo 324 | ip = force_win_size(round(target, -2), pos) 325 | result_q.put([chromosomes[i],ip[0],ip[1],ip[2],ip[3],ip[4]]) 326 | finally: 327 | task_q.task_done() 328 | 329 | 330 | def maskWins(self, wins=None, maxLen=None, nProc=1): 331 | ## Read accessability mask 332 | print("\nAccessibility mask found: calculating the proportion of the genome that is masked...") 333 | genome = [x[0].split(":")[0] for x in wins] 334 | mask={} 335 | with open(self.mask, "r") as fIN: 336 | for line in fIN: 337 | ar = line.split() 338 | try: 339 | if int(ar[1]) < mask[ar[0]][-1][1]: 340 | print("Error: positions in accessibility mask are required to be non-overlapping and ascending!") 341 | sys.exit(1) 342 | else: 
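# chromosome seen before and this interval starts at or after the previous one ended,
# so simply extend that chromosome's running list of [start, end] intervals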
343 | mask[ar[0]].append([int(pos) for pos in ar[1:]]) 344 | except KeyError: 345 | if ar[0] in genome: 346 | mask[ar[0]] = [[int(pos) for pos in ar[1:]]] 347 | 348 | ## Combine genomic windows 349 | genomic_wins = [] 350 | for win in wins: 351 | win_chrom = win[0] 352 | win_len = win[2] 353 | win_ct = win[6] 354 | start = 0 355 | for i in range(win_ct): 356 | genomic_wins.append([win_chrom, start, win_len]) 357 | start += win_len 358 | 359 | # partition for multiprocessing 360 | mpID = range(len(genomic_wins)) 361 | task_q = mp.JoinableQueue() 362 | result_q = mp.Queue() 363 | params=genomic_wins, mask, maxLen 364 | 365 | # do the work 366 | pids = create_procs(nProc, task_q, result_q, params, self.worker_maskWins) 367 | assign_task(mpID, task_q, nProc) 368 | try: 369 | task_q.join() 370 | except KeyboardInterrupt: 371 | print("KeyboardInterrupt") 372 | sys.exit(0) 373 | 374 | masks = [] 375 | for i in range(result_q.qsize()): 376 | item = result_q.get() 377 | masks.append(item) 378 | 379 | mask_fraction, win_masks = [], [] 380 | for mask in masks: 381 | mask_fraction.append(mask[0]) 382 | win_masks.append(mask) 383 | 384 | mean_mask_fraction = sum(mask_fraction)/float(len(mask_fraction)) 385 | print("{}% of genome inaccessible\n".format(round(mean_mask_fraction * 100,1))) 386 | return mean_mask_fraction, win_masks 387 | 388 | 389 | def worker_maskWins(self, task_q, result_q, params): 390 | while True: 391 | try: 392 | mpID, nth_job = task_q.get() 393 | genomic_wins, mask, maxLen = params 394 | last_win = 0 395 | last_chrom = genomic_wins[0][0].split(":")[0] 396 | for i in mpID: 397 | if genomic_wins[i][0].split(":")[0] != last_chrom: 398 | last_win = 0 399 | last_chrom = genomic_wins[i][0].split(":")[0] 400 | M = maskStats(genomic_wins[i], last_win, mask, maxLen) 401 | last_win = M[2] 402 | result_q.put(M) 403 | finally: 404 | task_q.task_done() 405 | 406 | -------------------------------------------------------------------------------- /ReLERNN/sequenceBatchGenerator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Authors: Jared Galloway, Jeff Adrion 3 | ''' 4 | 5 | from ReLERNN.imports import * 6 | 7 | class SequenceBatchGenerator(tf.keras.utils.Sequence): 8 | 9 | ''' 10 | This class, SequenceBatchGenerator, extends tf.keras.utils.Sequence. 11 | So as to multithread the batch preparation in tandum with network training 12 | for maximum effeciency on the hardware provided. 13 | 14 | It generated batches of genotype matrices from a given .trees directory 15 | (which is generated most effeciently from the Simulator class) 16 | which have been prepped according to the given parameters. 17 | 18 | It also offers a range of data prepping heuristics as well as normalizing 19 | the targets. 
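A minimal, illustrative instantiation (the values below are placeholders rather than
the settings ReLERNN_TRAIN actually passes in, and treesDirectory is assumed to have
been populated by Simulator so that info.p and the *_haps.npy / *_pos.npy files exist):

    gen = SequenceBatchGenerator(treesDirectory='./example_output/train',
                                 batchSize=64, maxLen=1750, frameWidth=5,
                                 shuffleInds=True)
    [haps, pos], targets = gen[0]  # padded genotype/position tensors, z-scored rho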
20 | 21 | def __getitem__(self, idx): 22 | 23 | def __data_generation(self, batchTreeIndices): 24 | 25 | ''' 26 | 27 | #Initialize the member variables which largely determine the data prepping heuristics 28 | #in addition to the .trees directory containing the data from which to generate the batches 29 | def __init__(self, 30 | treesDirectory, 31 | targetNormalization = 'zscore', 32 | batchSize=64, 33 | maxLen=None, 34 | frameWidth=0, 35 | center=False, 36 | shuffleInds=False, 37 | sortInds=False, 38 | ancVal = -1, 39 | padVal = -1, 40 | derVal = 1, 41 | realLinePos = True, 42 | posPadVal = 0, 43 | shuffleExamples = True, 44 | splitFLAG = False, 45 | seqD = None, 46 | maf = None, 47 | hotspots = False, 48 | seed = None 49 | ): 50 | 51 | self.treesDirectory = treesDirectory 52 | self.targetNormalization = targetNormalization 53 | infoFilename = os.path.join(self.treesDirectory,"info.p") 54 | self.infoDir = pickle.load(open(infoFilename,"rb")) 55 | self.batch_size = batchSize 56 | self.maxLen = maxLen 57 | self.frameWidth = frameWidth 58 | self.center = center 59 | self.shuffleInds = shuffleInds 60 | self.sortInds=sortInds 61 | self.ancVal = ancVal 62 | self.padVal = padVal 63 | self.derVal = derVal 64 | self.realLinePos = realLinePos 65 | self.posPadVal = posPadVal 66 | self.indices = np.arange(self.infoDir["numReps"]) 67 | self.shuffleExamples = shuffleExamples 68 | self.splitFLAG = splitFLAG 69 | self.seqD = seqD 70 | self.maf = maf 71 | self.hotspots = hotspots 72 | self.seed = seed 73 | 74 | if self.seed: 75 | os.environ['PYTHONHASHSEED']=str(self.seed) 76 | random.seed(self.seed) 77 | np.random.seed(self.seed) 78 | 79 | if(targetNormalization != None): 80 | if self.hotspots: 81 | self.normalizedTargets = self.normalizeTargetsBinaryClass() 82 | else: 83 | self.normalizedTargets = self.normalizeTargets() 84 | 85 | if(shuffleExamples): 86 | np.random.shuffle(self.indices) 87 | 88 | def sort_min_diff(self,amat): 89 | '''this function takes in a SNP matrix with indv on rows and returns the same matrix with indvs sorted by genetic similarity. 90 | this problem is NP, so here we use a nearest neighbors approx. it's not perfect, but it's fast and generally performs ok. 
91 | assumes your input matrix is a numpy array''' 92 | 93 | mb = NearestNeighbors(len(amat), metric='manhattan').fit(amat) 94 | v = mb.kneighbors(amat) 95 | smallest = np.argmin(v[0].sum(axis=1)) 96 | return amat[v[1][smallest]] 97 | 98 | def pad_HapsPos(self,haplotypes,positions,maxSNPs=None,frameWidth=0,center=False): 99 | ''' 100 | pads the haplotype and positions tensors 101 | to be uniform with the largest tensor 102 | ''' 103 | 104 | haps = haplotypes 105 | pos = positions 106 | 107 | #Normalize the shape of all haplotype vectors with padding 108 | for i in range(len(haps)): 109 | numSNPs = haps[i].shape[0] 110 | paddingLen = maxSNPs - numSNPs 111 | if(center): 112 | prior = paddingLen // 2 113 | post = paddingLen - prior 114 | haps[i] = np.pad(haps[i],((prior,post),(0,0)),"constant",constant_values=2.0) 115 | pos[i] = np.pad(pos[i],(prior,post),"constant",constant_values=-1.0) 116 | 117 | else: 118 | if(paddingLen < 0): 119 | haps[i] = np.pad(haps[i],((0,0),(0,0)),"constant",constant_values=2.0)[:paddingLen] 120 | pos[i] = np.pad(pos[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 121 | else: 122 | haps[i] = np.pad(haps[i],((0,paddingLen),(0,0)),"constant",constant_values=2.0) 123 | pos[i] = np.pad(pos[i],(0,paddingLen),"constant",constant_values=-1.0) 124 | 125 | haps = np.array(haps,dtype='float32') 126 | pos = np.array(pos,dtype='float32') 127 | 128 | if(frameWidth): 129 | fw = frameWidth 130 | haps = np.pad(haps,((0,0),(fw,fw),(fw,fw)),"constant",constant_values=2.0) 131 | pos = np.pad(pos,((0,0),(fw,fw)),"constant",constant_values=-1.0) 132 | 133 | return haps,pos 134 | 135 | def padAlleleFqs(self,haplotypes,positions,maxSNPs=None,frameWidth=0,center=False): 136 | ''' 137 | convert haps to allele frequencies, normalize, and 138 | pad the haplotype and positions tensors 139 | to be uniform with the largest tensor 140 | ''' 141 | 142 | haps = haplotypes 143 | positions = positions 144 | fqs, pos = [], [] 145 | 146 | # Resample to sequencing depth and convert to allele frequencies 147 | for i in range(len(haps)): 148 | tmp_freqs = [] 149 | tmp_pos = [] 150 | fqs_list = haps[i].tolist() 151 | for j in range(len(fqs_list)): 152 | 153 | if self.seqD != -9: 154 | ## Resample 155 | z = resample(fqs_list[j], n_samples=self.seqD, replace=True) 156 | raw_freq = round(np.count_nonzero(z)/float(len(z)),3) 157 | if self.maf <= raw_freq < 1.0: 158 | tmp_freqs.append(raw_freq) 159 | tmp_pos.append(positions[i][j]) 160 | else: 161 | ## Don't resample 162 | raw_freq = round(np.count_nonzero(fqs_list[j])/float(len(fqs_list[j])),3) 163 | tmp_freqs.append(raw_freq) 164 | tmp_pos.append(positions[i][j]) 165 | 166 | fqs.append(np.array(tmp_freqs)) 167 | pos.append(np.array(tmp_pos)) 168 | 169 | # Normalize 170 | fqs = self.normalizeAlleleFqs(fqs) 171 | 172 | # Pad 173 | for i in range(len(fqs)): 174 | numSNPs = fqs[i].shape[0] 175 | paddingLen = maxSNPs - numSNPs 176 | if(center): 177 | prior = paddingLen // 2 178 | post = paddingLen - prior 179 | fqs[i] = np.pad(fqs[i],(prior,post),"constant",constant_values=-1.0) 180 | pos[i] = np.pad(pos[i],(prior,post),"constant",constant_values=-1.0) 181 | 182 | else: 183 | if(paddingLen < 0): 184 | fqs[i] = np.pad(fqs[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 185 | pos[i] = np.pad(pos[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 186 | else: 187 | fqs[i] = np.pad(fqs[i],(0,paddingLen),"constant",constant_values=-1.0) 188 | pos[i] = np.pad(pos[i],(0,paddingLen),"constant",constant_values=-1.0) 189 | 190 | fqs = 
np.array(fqs,dtype='float32') 191 | pos = np.array(pos,dtype='float32') 192 | 193 | if(frameWidth): 194 | fw = frameWidth 195 | fqs = np.pad(fqs,((0,0),(fw,fw)),"constant",constant_values=-1.0) 196 | pos = np.pad(pos,((0,0),(fw,fw)),"constant",constant_values=-1.0) 197 | 198 | return fqs,pos 199 | 200 | def normalizeTargets(self): 201 | 202 | ''' 203 | We want to normalize all targets. 204 | ''' 205 | 206 | norm = self.targetNormalization 207 | nTargets = copy.deepcopy(self.infoDir['rho']) 208 | 209 | if(norm == 'zscore'): 210 | tar_mean = np.mean(nTargets,axis=0) 211 | tar_sd = np.std(nTargets,axis=0) 212 | nTargets -= tar_mean 213 | nTargets = np.divide(nTargets,tar_sd,out=np.zeros_like(nTargets),where=tar_sd!=0) 214 | 215 | elif(norm == 'divstd'): 216 | tar_sd = np.std(nTargets,axis=0) 217 | nTargets = np.divide(nTargets,tar_sd,out=np.zeros_like(nTargets),where=tar_sd!=0) 218 | 219 | return nTargets 220 | 221 | def normalizeTargetsBinaryClass(self): 222 | 223 | ''' 224 | We want to normalize all targets. 225 | ''' 226 | 227 | norm = self.targetNormalization 228 | nTargets = copy.deepcopy(self.infoDir['hotWin']) 229 | 230 | nTargets[nTargets<5] = 0 231 | nTargets[nTargets>=5] = 1 232 | 233 | return nTargets.astype(np.uint8) 234 | 235 | def normalizeAlleleFqs(self, fqs): 236 | 237 | ''' 238 | normalize the allele frequencies for the batch 239 | ''' 240 | 241 | norm = self.targetNormalization 242 | 243 | if(norm == 'zscore'): 244 | allVals = np.concatenate([a.flatten() for a in fqs]) 245 | fqs_mean = np.mean(allVals) 246 | fqs_sd = np.std(allVals) 247 | for i in range(len(fqs)): 248 | fqs[i] = np.subtract(fqs[i],fqs_mean) 249 | fqs[i] = np.divide(fqs[i],fqs_sd,out=np.zeros_like(fqs[i]),where=fqs_sd!=0) 250 | 251 | elif(norm == 'divstd'): 252 | allVals = np.concatenate([a.flatten() for a in fqs]) 253 | fqs_sd = np.std(allVals) 254 | for i in range(len(fqs)): 255 | fqs[i] = np.divide(fqs[i],fqs_sd,out=np.zeros_like(fqs[i]),where=fqs_sd!=0) 256 | 257 | return fqs 258 | 259 | def on_epoch_end(self): 260 | 261 | if(self.shuffleExamples): 262 | np.random.shuffle(self.indices) 263 | 264 | def __len__(self): 265 | 266 | return int(np.floor(self.infoDir["numReps"]/self.batch_size)) 267 | 268 | def __getitem__(self, idx): 269 | 270 | indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size] 271 | X, y = self.__data_generation(indices) 272 | return X,y 273 | 274 | def shuffleIndividuals(self,x): 275 | 276 | t = np.arange(x.shape[1]) 277 | np.random.shuffle(t) 278 | return x[:,t] 279 | 280 | def __data_generation(self, batchTreeIndices): 281 | 282 | haps = [] 283 | pos = [] 284 | for treeIndex in batchTreeIndices: 285 | Hfilepath = os.path.join(self.treesDirectory,str(treeIndex) + "_haps.npy") 286 | Pfilepath = os.path.join(self.treesDirectory,str(treeIndex) + "_pos.npy") 287 | H = np.load(Hfilepath) 288 | P = np.load(Pfilepath) 289 | haps.append(H) 290 | pos.append(P) 291 | respectiveNormalizedTargets = [[t] for t in self.normalizedTargets[batchTreeIndices]] 292 | targets = np.array(respectiveNormalizedTargets) 293 | 294 | if(self.realLinePos): 295 | for p in range(len(pos)): 296 | pos[p] = pos[p] / self.infoDir["ChromosomeLength"] 297 | 298 | if(self.sortInds): 299 | for i in range(len(haps)): 300 | haps[i] = np.transpose(self.sort_min_diff(np.transpose(haps[i]))) 301 | 302 | if(self.shuffleInds): 303 | for i in range(len(haps)): 304 | haps[i] = self.shuffleIndividuals(haps[i]) 305 | 306 | if self.seqD: 307 | # simulate pool-sequencing 308 | if(self.maxLen != None): 309 | # convert the 
haps to allele frequecies and then pad 310 | haps,pos = self.padAlleleFqs(haps,pos, 311 | maxSNPs=self.maxLen, 312 | frameWidth=self.frameWidth, 313 | center=self.center) 314 | 315 | haps=np.where(haps == -1.0, self.posPadVal,haps) 316 | pos=np.where(pos == -1.0, self.posPadVal,pos) 317 | z = np.stack((haps,pos), axis=-1) 318 | 319 | return z, targets 320 | else: 321 | if(self.maxLen != None): 322 | # pad 323 | haps,pos = self.pad_HapsPos(haps,pos, 324 | maxSNPs=self.maxLen, 325 | frameWidth=self.frameWidth, 326 | center=self.center) 327 | 328 | pos=np.where(pos == -1.0, self.posPadVal,pos) 329 | haps=np.where(haps < 1.0, self.ancVal, haps) 330 | haps=np.where(haps > 1.0, self.padVal, haps) 331 | haps=np.where(haps == 1.0, self.derVal, haps) 332 | 333 | return [haps,pos], targets 334 | 335 | 336 | class VCFBatchGenerator(tf.keras.utils.Sequence): 337 | """Basically same as SequenceBatchGenerator Class except for VCF files""" 338 | def __init__(self, 339 | INFO, 340 | CHROM, 341 | winLen, 342 | numWins, 343 | IDs, 344 | GT, 345 | POS, 346 | batchSize=64, 347 | maxLen=None, 348 | frameWidth=0, 349 | center=False, 350 | sortInds=False, 351 | ancVal = -1, 352 | padVal = -1, 353 | derVal = 1, 354 | realLinePos = True, 355 | posPadVal = 0, 356 | phase=None, 357 | seed=None 358 | ): 359 | 360 | self.INFO=INFO 361 | self.CHROM=CHROM 362 | self.winLen=winLen 363 | self.numWins=numWins 364 | self.indices=np.arange(self.numWins) 365 | self.IDs=IDs 366 | self.GT=GT 367 | self.POS=POS 368 | self.batch_size = batchSize 369 | self.maxLen = maxLen 370 | self.frameWidth = frameWidth 371 | self.center = center 372 | self.sortInds=sortInds 373 | self.ancVal = ancVal 374 | self.padVal = padVal 375 | self.derVal = derVal 376 | self.realLinePos = realLinePos 377 | self.posPadVal = posPadVal 378 | self.phase=phase 379 | self.seed=seed 380 | 381 | if self.seed: 382 | os.environ['PYTHONHASHSEED']=str(self.seed) 383 | random.seed(self.seed) 384 | np.random.seed(self.seed) 385 | 386 | 387 | def pad_HapsPosVCF(self,haplotypes,positions,maxSNPs=None,frameWidth=0,center=False): 388 | ''' 389 | pads the haplotype and positions tensors 390 | to be uniform with the largest tensor 391 | ''' 392 | 393 | haps = haplotypes 394 | pos = positions 395 | 396 | nSNPs=[] 397 | 398 | #Normalize the shape of all haplotype vectors with padding 399 | for i in range(len(haps)): 400 | numSNPs = haps[i].shape[0] 401 | nSNPs.append(numSNPs) 402 | paddingLen = maxSNPs - numSNPs 403 | if(center): 404 | prior = paddingLen // 2 405 | post = paddingLen - prior 406 | haps[i] = np.pad(haps[i],((prior,post),(0,0)),"constant",constant_values=2.0) 407 | pos[i] = np.pad(pos[i],(prior,post),"constant",constant_values=-1.0) 408 | 409 | else: 410 | haps[i] = np.pad(haps[i],((0,paddingLen),(0,0)),"constant",constant_values=2.0) 411 | pos[i] = np.pad(pos[i],(0,paddingLen),"constant",constant_values=-1.0) 412 | 413 | haps = np.array(haps,dtype='float32') 414 | pos = np.array(pos,dtype='float32') 415 | 416 | if(frameWidth): 417 | fw = frameWidth 418 | haps = np.pad(haps,((0,0),(fw,fw),(fw,fw)),"constant",constant_values=2.0) 419 | pos = np.pad(pos,((0,0),(fw,fw)),"constant",constant_values=-1.0) 420 | return haps,pos,nSNPs 421 | 422 | 423 | def __getitem__(self, idx): 424 | 425 | indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size] 426 | X, nSNPs = self.__data_generation(indices) 427 | 428 | return X, self.CHROM, self.winLen, self.INFO, nSNPs 429 | 430 | 431 | def __data_generation(self, indices): 432 | 433 | if self.seed: 434 | 
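# re-seeding on every call makes the genotype shuffling below (used to "unphase" the data)
# deterministic for a fixed seed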
os.environ['PYTHONHASHSEED']=str(self.seed) 435 | random.seed(self.seed) 436 | np.random.seed(self.seed) 437 | 438 | #def __getitem__(self, idx): 439 | genos=self.GT 440 | GT=self.GT.to_haplotypes() 441 | diploid_check=[] 442 | for n in range(1,len(genos[0]),2): 443 | GTB=GT[:,n:n+1] 444 | if np.unique(GTB).shape[0] == 1 and np.unique(GTB)[0] == -1: 445 | diploid_check.append(0) 446 | else: 447 | diploid_check.append(1) 448 | break 449 | if 1 in diploid_check: 450 | GT=np.array(GT) 451 | else: 452 | GT=GT[:,::2] #Select only the first of the genotypes 453 | GT = np.where(GT == -1, 2, GT) # Code missing data as 2, these will ultimately end up being transformed to the pad value 454 | 455 | if not self.phase: 456 | np.random.shuffle(np.transpose(GT)) 457 | 458 | haps,pos=[],[] 459 | for i in indices: 460 | haps.append(GT[self.IDs[i][0]:self.IDs[i][1]]) 461 | pos.append(self.POS[self.IDs[i][0]:self.IDs[i][1]]) 462 | 463 | if(self.realLinePos): 464 | for i in range(len(pos)): 465 | pos[i] = (pos[i]-(self.winLen*indices[i])) / self.winLen 466 | 467 | if(self.sortInds): 468 | for i in range(len(haps)): 469 | haps[i] = np.transpose(sort_min_diff(np.transpose(haps[i]))) 470 | 471 | if(self.maxLen != None): 472 | haps,pos,nSNPs = self.pad_HapsPosVCF(haps,pos, 473 | maxSNPs=self.maxLen, 474 | frameWidth=self.frameWidth, 475 | center=self.center) 476 | 477 | pos=np.where(pos == -1.0, self.posPadVal,pos) 478 | haps=np.where(haps < 1.0, self.ancVal, haps) 479 | haps=np.where(haps > 1.0, self.padVal, haps) 480 | haps=np.where(haps == 1.0, self.derVal, haps) 481 | 482 | return [haps,pos], nSNPs 483 | 484 | 485 | class POOLBatchGenerator(tf.keras.utils.Sequence): 486 | """Basically same as SequenceBatchGenerator Class except for POOL files""" 487 | def __init__(self, 488 | INFO, 489 | CHROM, 490 | winLen, 491 | numWins, 492 | IDs, 493 | GT, 494 | POS, 495 | batchSize=64, 496 | maxLen=None, 497 | frameWidth=0, 498 | center=False, 499 | sortInds=False, 500 | ancVal = -1, 501 | padVal = -1, 502 | derVal = 1, 503 | realLinePos = True, 504 | posPadVal = 0, 505 | normType = 'zscore', 506 | seed = None 507 | ): 508 | 509 | self.INFO=INFO 510 | self.normType = normType 511 | self.CHROM=CHROM 512 | self.winLen=winLen 513 | self.numWins=numWins 514 | self.indices=np.arange(self.numWins) 515 | self.IDs=IDs 516 | self.GT=GT 517 | self.POS=POS 518 | self.batch_size = batchSize 519 | self.maxLen = maxLen 520 | self.frameWidth = frameWidth 521 | self.center = center 522 | self.sortInds=sortInds 523 | self.ancVal = ancVal 524 | self.padVal = padVal 525 | self.derVal = derVal 526 | self.realLinePos = realLinePos 527 | self.posPadVal = posPadVal 528 | self.seed = seed 529 | 530 | if self.seed: 531 | os.environ['PYTHONHASHSEED']=str(self.seed) 532 | random.seed(self.seed) 533 | np.random.seed(self.seed) 534 | 535 | def padFqs(self,haplotypes,positions,maxSNPs=None,frameWidth=0,center=False): 536 | ''' 537 | normalize, and pad the haplotype and positions tensors 538 | to be uniform with the largest tensor 539 | ''' 540 | 541 | fqs = haplotypes 542 | pos = positions 543 | 544 | # Normalize 545 | fqs = self.normalizeAlleleFqs(fqs) 546 | 547 | nSNPs=[] 548 | # Pad 549 | for i in range(len(fqs)): 550 | numSNPs = fqs[i].shape[0] 551 | nSNPs.append(numSNPs) 552 | paddingLen = maxSNPs - numSNPs 553 | if(center): 554 | prior = paddingLen // 2 555 | post = paddingLen - prior 556 | fqs[i] = np.pad(fqs[i],(prior,post),"constant",constant_values=-1.0) 557 | pos[i] = np.pad(pos[i],(prior,post),"constant",constant_values=-1.0) 558 | 559 | 
else: 560 | if(paddingLen < 0): 561 | fqs[i] = np.pad(fqs[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 562 | pos[i] = np.pad(pos[i],(0,0),"constant",constant_values=-1.0)[:paddingLen] 563 | else: 564 | fqs[i] = np.pad(fqs[i],(0,paddingLen),"constant",constant_values=-1.0) 565 | pos[i] = np.pad(pos[i],(0,paddingLen),"constant",constant_values=-1.0) 566 | 567 | fqs = np.array(fqs,dtype='float32') 568 | pos = np.array(pos,dtype='float32') 569 | 570 | if(frameWidth): 571 | fw = frameWidth 572 | fqs = np.pad(fqs,((0,0),(fw,fw)),"constant",constant_values=-1.0) 573 | pos = np.pad(pos,((0,0),(fw,fw)),"constant",constant_values=-1.0) 574 | 575 | return fqs,pos,nSNPs 576 | 577 | 578 | def normalizeAlleleFqs(self, fqs): 579 | 580 | ''' 581 | normalize the allele frequencies for the batch 582 | ''' 583 | 584 | norm = self.normType 585 | 586 | if(norm == 'zscore'): 587 | allVals = np.concatenate([a.flatten() for a in fqs]) 588 | fqs_mean = np.mean(allVals) 589 | fqs_sd = np.std(allVals) 590 | for i in range(len(fqs)): 591 | fqs[i] = np.subtract(fqs[i],fqs_mean) 592 | fqs[i] = np.divide(fqs[i],fqs_sd,out=np.zeros_like(fqs[i]),where=fqs_sd!=0) 593 | 594 | elif(norm == 'divstd'): 595 | allVals = np.concatenate([a.flatten() for a in fqs]) 596 | fqs_sd = np.std(allVals) 597 | for i in range(len(fqs)): 598 | fqs[i] = np.divide(fqs[i],fqs_sd,out=np.zeros_like(fqs[i]),where=fqs_sd!=0) 599 | 600 | return fqs 601 | 602 | 603 | def __getitem__(self, idx): 604 | 605 | indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size] 606 | X, nSNPs = self.__data_generation(indices) 607 | 608 | return X, self.CHROM, self.winLen, self.INFO, nSNPs 609 | 610 | 611 | def __data_generation(self, indices): 612 | 613 | if self.seed: 614 | os.environ['PYTHONHASHSEED']=str(self.seed) 615 | random.seed(self.seed) 616 | np.random.seed(self.seed) 617 | 618 | GT=self.GT 619 | 620 | haps,pos=[],[] 621 | for i in indices: 622 | haps.append(GT[self.IDs[i][0]:self.IDs[i][1]]) 623 | pos.append(self.POS[self.IDs[i][0]:self.IDs[i][1]]) 624 | 625 | if(self.realLinePos): 626 | for i in range(len(pos)): 627 | pos[i] = (pos[i]-(self.winLen*indices[i])) / self.winLen 628 | 629 | if(self.sortInds): 630 | for i in range(len(haps)): 631 | haps[i] = np.transpose(sort_min_diff(np.transpose(haps[i]))) 632 | 633 | # pad the allele freqs and positions 634 | if(self.maxLen != None): 635 | haps,pos,nSNPs = self.padFqs(haps,pos, 636 | maxSNPs=self.maxLen, 637 | frameWidth=self.frameWidth, 638 | center=self.center) 639 | 640 | haps=np.where(haps == -1.0, self.posPadVal,haps) 641 | pos=np.where(pos == -1.0, self.posPadVal,pos) 642 | np.set_printoptions(threshold=sys.maxsize) 643 | z = np.stack((haps,pos), axis=-1) 644 | 645 | return z, nSNPs 646 | 647 | 648 | -------------------------------------------------------------------------------- /ReLERNN/helpers.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | Authors: Jared Galloway, Jeff Adrion 4 | ''' 5 | 6 | from ReLERNN.imports import * 7 | 8 | #------------------------------------------------------------------------------------------- 9 | 10 | def assign_task(mpID, task_q, nProcs): 11 | c,i,nth_job=0,0,1 12 | while (i+1)*nProcs <= len(mpID): 13 | i+=1 14 | nP1=nProcs-(len(mpID)%nProcs) 15 | for j in range(nP1): 16 | task_q.put((mpID[c:c+i], nth_job)) 17 | nth_job += 1 18 | c=c+i 19 | for j in range(nProcs-nP1): 20 | task_q.put((mpID[c:c+i+1], nth_job)) 21 | nth_job += 1 22 | c=c+i+1 23 | 24 | 
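#-------------------------------------------------------------------------------------------

# A minimal, self-contained sketch (not part of ReLERNN itself) of how the task-queue
# helpers in this module are wired together by Manager and Simulator: assign_task() chunks
# a range of job indices onto a JoinableQueue, create_procs() starts daemon workers that
# drain it, and the caller joins the queue before collecting results and terminating the
# workers. It relies on the `mp` alias imported at the top of this module; the names
# `_example_worker` and `_example_run` are hypothetical and appear nowhere else in the
# package. Call _example_run() from inside an `if __name__ == "__main__":` guard if your
# platform uses the "spawn" start method.

def _example_worker(task_q, result_q, params):
    offset = params
    while True:
        try:
            mpID, nth_job = task_q.get()
            for i in mpID:
                result_q.put([i, i + offset])  # any per-index computation goes here
        finally:
            task_q.task_done()

def _example_run(nJobs=8, nProc=2):
    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    pids = create_procs(nProc, task_q, result_q, 100, _example_worker)
    assign_task(range(nJobs), task_q, nProc)
    task_q.join()
    results = [result_q.get() for _ in range(result_q.qsize())]
    for p in pids:
        p.terminate()
    return results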
#------------------------------------------------------------------------------------------- 25 | 26 | def create_procs(nProcs, task_q, result_q, params, worker): 27 | pids = [] 28 | for _ in range(nProcs): 29 | p = mp.Process(target=worker, args=(task_q, result_q, params)) 30 | p.daemon = True 31 | p.start() 32 | pids.append(p) 33 | return pids 34 | 35 | #------------------------------------------------------------------------------------------- 36 | 37 | def get_corrected_index(L,N): 38 | idx,outN="","" 39 | dist=float("inf") 40 | for i in range(len(L)): 41 | D=abs(N-L[i]) 42 | if D < dist: 43 | idx=i 44 | outN=L[i] 45 | dist=D 46 | return [idx,outN] 47 | 48 | #------------------------------------------------------------------------------------------- 49 | 50 | def get_corrected(rate,bs): 51 | idx=get_corrected_index(bs["Q2"],rate) 52 | CI95LO=bs["CI95LO"][idx[0]] 53 | CI95HI=bs["CI95HI"][idx[0]] 54 | cRATE=relu(rate+(bs["rho"][idx[0]]-idx[1])) 55 | ciHI=relu(cRATE+(CI95HI-idx[1])) 56 | ciLO=relu(cRATE+(CI95LO-idx[1])) 57 | return [cRATE,ciLO,ciHI] 58 | 59 | #------------------------------------------------------------------------------------------- 60 | 61 | def get_index(pos, winSize): 62 | y=snps_per_win(pos,winSize) 63 | st=0 64 | indices=[] 65 | for i in range(len(y)): 66 | indices.append([st,st+y[i]]) 67 | st+=y[i] 68 | return indices 69 | 70 | #------------------------------------------------------------------------------------------- 71 | 72 | def snps_per_win(pos, window_size): 73 | bins = np.arange(1, pos.max()+window_size, window_size) #use 1-based coordinates, per VCF standard 74 | y,x = np.histogram(pos,bins=bins) 75 | return y 76 | 77 | #------------------------------------------------------------------------------------------- 78 | 79 | def find_win_size(winSize, pos, winSizeMx): 80 | snpsWin=snps_per_win(pos,winSize) 81 | mn,u,mx = snpsWin.min(), int(snpsWin.mean()), snpsWin.max() 82 | if mx > winSizeMx: 83 | return [-1] 84 | elif mx < winSizeMx: 85 | return [1] 86 | else: 87 | return [winSize,mn,u,mx,len(snpsWin)] 88 | 89 | #------------------------------------------------------------------------------------------- 90 | 91 | def force_win_size(winSize, pos): 92 | snpsWin=snps_per_win(pos,winSize) 93 | mn,u,mx = snpsWin.min(), int(snpsWin.mean()), snpsWin.max() 94 | return [winSize,mn,u,mx,len(snpsWin)] 95 | 96 | #------------------------------------------------------------------------------------------- 97 | 98 | def maskStats(wins, last_win, mask, maxLen): 99 | """ 100 | return a three-element list with the first element being the total proportion of the window that is masked, 101 | the second element being a list of masked positions that are relative to the windown start=0 and the window end = window length, 102 | and the third being the last window before breaking to expidite the next loop 103 | """ 104 | chrom = wins[0].split(":")[0] 105 | a = wins[1] 106 | L = wins[2] 107 | b = a + L 108 | prop = [0.0,[],0] 109 | try: 110 | for i in range(last_win, len(mask[chrom])): 111 | x, y = mask[chrom][i][0], mask[chrom][i][1] 112 | if y < a: 113 | continue 114 | if b < x: 115 | return prop 116 | else: # i.e. 
[a--b] and [x--y] overlap 117 | if a >= x and b <= y: 118 | return [1.0, [[0,maxLen]], i] 119 | elif a >= x and b > y: 120 | win_prop = (y-a)/float(b-a) 121 | prop[0] += win_prop 122 | prop[1].append([0,int(win_prop * maxLen)]) 123 | prop[2] = i 124 | elif b <= y and a < x: 125 | win_prop = (b-x)/float(b-a) 126 | prop[0] += win_prop 127 | prop[1].append([int((1-win_prop)*maxLen),maxLen]) 128 | prop[2] = i 129 | else: 130 | win_prop = (y-x)/float(b-a) 131 | prop[0] += win_prop 132 | prop[1].append([int(((x-a)/float(b-a))*maxLen), int(((y-a)/float(b-a))*maxLen)]) 133 | prop[2] = i 134 | return prop 135 | except KeyError: 136 | return prop 137 | 138 | #------------------------------------------------------------------------------------------- 139 | 140 | def check_demHist(path): 141 | fTypeFlag = -9 142 | with open(path, "r") as fIN: 143 | for line in fIN: 144 | if line.startswith("mutation_per_site"): 145 | fTypeFlag = 1 146 | break 147 | if line.startswith("label"): 148 | fTypeFlag = 2 149 | break 150 | if line.startswith("time_index"): 151 | fTypeFlag = 3 152 | break 153 | return fTypeFlag 154 | 155 | #------------------------------------------------------------------------------------------- 156 | 157 | def convert_msmc_output(results_file, mutation_rate, generation_time): 158 | """ 159 | This function converts the output from msmc into a csv the will be read in for 160 | plotting comparison. 161 | 162 | MSMC outputs times and rates scaled by the mutation rate per basepair per generation. 163 | First, scaled times are given in units of the per-generation mutation rate. 164 | This means that in order to convert scaled times to generations, 165 | divide them by the mutation rate. In humans, we used mu=1e-8 per basepair per generation. 166 | To convert generations into years, multiply by the generation time, for which we used 10 years. 167 | 168 | To get population sizes out of coalescence rates, first take the inverse of the coalescence rate, 169 | scaledPopSize = 1 / lambda00. 
Then divide this scaled population size by 2*mu 170 | """ 171 | outfile = results_file+".csv" 172 | out_fp = open(outfile, "w") 173 | in_fp = open(results_file, "r") 174 | header = in_fp.readline() 175 | out_fp.write("label,x,y\n") 176 | for line in in_fp: 177 | result = line.split() 178 | time = float(result[1]) 179 | time_generation = time / mutation_rate 180 | time_years = time_generation * generation_time 181 | lambda00 = float(result[3]) 182 | scaled_pop_size = 1 / lambda00 183 | size = scaled_pop_size / (2*mutation_rate) 184 | out_fp.write(f"pop0,{time_years},{size}\n") 185 | out_fp.close 186 | return None 187 | 188 | #------------------------------------------------------------------------------------------- 189 | 190 | def convert_demHist(path, nSamps, gen, fType, mu): 191 | swp, PC, DE = [],[],[] 192 | # Convert stairwayplot to msp demographic_events 193 | if fType == 1: 194 | with open(path, "r") as fIN: 195 | flag=0 196 | lCt=0 197 | for line in fIN: 198 | if flag == 1: 199 | if lCt % 2 == 0: 200 | swp.append(line.split()) 201 | lCt+=1 202 | if line.startswith("mutation_per_site"): 203 | flag=1 204 | N0 = int(float(swp[0][6])) 205 | for i in range(len(swp)): 206 | if i == 0: 207 | PC.append(msp.PopulationConfiguration(sample_size=nSamps, initial_size=N0)) 208 | else: 209 | DE.append(msp.PopulationParametersChange(time=int(float(swp[i][5])/float(gen)), initial_size=int(float(swp[i][6])), population=0)) 210 | ## Convert MSMC to similar format to smc++ 211 | if fType == 3: 212 | convert_msmc_output(path, mu, gen) 213 | path+=".csv" 214 | ## Convert smc++ or MSMC results to msp demographic_events 215 | if fType == 2 or fType == 3: 216 | with open(path, "r") as fIN: 217 | fIN.readline() 218 | for line in fIN: 219 | ar=line.split(",") 220 | swp.append([int(float(ar[1])/gen),int(float(ar[2]))]) 221 | N0 = swp[0][1] 222 | for i in range(len(swp)): 223 | if i == 0: 224 | PC.append(msp.PopulationConfiguration(sample_size=nSamps, initial_size=N0)) 225 | else: 226 | DE.append(msp.PopulationParametersChange(time=swp[i][0], initial_size=swp[i][1], population=0)) 227 | dd=msp.DemographyDebugger(population_configurations=PC, 228 | demographic_events=DE) 229 | print("Simulating under the following population size history:") 230 | dd.print_history() 231 | MspD = {"population_configurations" : PC, 232 | "migration_matrix" : None, 233 | "demographic_events" : DE} 234 | if MspD: 235 | return MspD 236 | else: 237 | print("Error in converting demographic history file.") 238 | sys.exit(1) 239 | 240 | #------------------------------------------------------------------------------------------- 241 | 242 | def relu(x): 243 | return max(0,x) 244 | 245 | #------------------------------------------------------------------------------------------- 246 | 247 | def zscoreTargets(self): 248 | norm = self.targetNormalization 249 | nTargets = copy.deepcopy(self.infoDir['y']) 250 | if(norm == 'zscore'): 251 | tar_mean = np.mean(nTargets,axis=0) 252 | tar_sd = np.std(nTargets,axis=0) 253 | nTargets -= tar_mean 254 | nTargets = np.divide(nTargets,tar_sd,out=np.zeros_like(nTargets),where=tar_sd!=0) 255 | 256 | #------------------------------------------------------------------------------------------- 257 | 258 | def load_and_predictVCF(VCFGenerator, 259 | resultsFile=None, 260 | network=None, 261 | chromStr=None, 262 | minS = 50, 263 | numWins = None, 264 | batchSize = None, 265 | gpuID = 0, 266 | hotspots = False): 267 | 268 | if hotspots: 269 | print("Error: hotspot detection under construction") 270 | sys.exit(1) 
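# The remainder of this function: pin TensorFlow to the requested GPU, allow GPU memory
# growth via the tf.compat.v1 Session workaround, rebuild the trained model from its JSON
# architecture plus the saved best weights, then write one BED-style row per genomic window.
# Each prediction is back-transformed from z-score space as relu(sd * pred + u), where u and
# sd are the mean and standard deviation of the simulated training rho values, and windows
# with fewer than minS segregating sites are skipped.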
271 | 272 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpuID) 273 | 274 | ## The following code block appears necessary for running with tf2 and cudnn 275 | from tensorflow.compat.v1 import ConfigProto 276 | from tensorflow.compat.v1 import Session 277 | config = ConfigProto() 278 | config.gpu_options.allow_growth = True 279 | Session(config=config) 280 | ### 281 | 282 | # load json and create model 283 | if(network != None): 284 | jsonFILE = open(network[0],"r") 285 | loadedModel = jsonFILE.read() 286 | jsonFILE.close() 287 | model=model_from_json(loadedModel) 288 | model.load_weights(network[1]) 289 | else: 290 | print("Error: no pretrained network found!") 291 | sys.exit(1) 292 | 293 | num_batches = int(np.ceil(numWins / batchSize)) 294 | 295 | with open(resultsFile, "w") as fOUT: 296 | ct=0 297 | last = int(chromStr.split(":")[-1].split("-")[-1]) 298 | fOUT.write("\t".join([str(head) for head in ["chrom","start","end","nSites","recombRate"]])+"\n") 299 | for i in range(num_batches): 300 | X,chrom,win,info,nSNPs = VCFGenerator.__getitem__(i) 301 | predictions = model.predict(X) 302 | u=np.mean(info["rho"]) 303 | sd=np.std(info["rho"]) 304 | for j in range(len(predictions)): 305 | if nSNPs[j] >= minS: 306 | fOUT.write("%s\t%s\t%s\t%s\t%s\n" %(chrom,ct,min(ct+win,last),nSNPs[j],relu(sd*predictions[j][0]+u))) 307 | ct+=win 308 | 309 | return None 310 | 311 | #------------------------------------------------------------------------------------------- 312 | 313 | def runModels(ModelFuncPointer, 314 | ModelName, 315 | TrainDir, 316 | TrainGenerator, 317 | ValidationGenerator, 318 | TestGenerator, 319 | resultsFile=None, 320 | numEpochs=10, 321 | epochSteps=100, 322 | validationSteps=1, 323 | network=None, 324 | nCPU = 1, 325 | gpuID = 0): 326 | 327 | 328 | os.environ["CUDA_VISIBLE_DEVICES"]=str(gpuID) 329 | 330 | ## The following code block appears necessary for running with tf2 and cudnn 331 | from tensorflow.compat.v1 import ConfigProto 332 | from tensorflow.compat.v1 import Session 333 | config = ConfigProto() 334 | config.gpu_options.allow_growth = True 335 | Session(config=config) 336 | ### 337 | 338 | if(resultsFile == None): 339 | 340 | resultsFilename = os.path.basename(trainFile)[:-4] + ".p" 341 | resultsFile = os.path.join("./results/",resultsFilename) 342 | 343 | x,y = TrainGenerator.__getitem__(0) 344 | model = ModelFuncPointer(x,y) 345 | 346 | # Early stopping and saving the best weights 347 | callbacks_list = [ 348 | EarlyStopping( 349 | monitor='val_loss', 350 | verbose=1, 351 | min_delta=0.01, 352 | patience=100), 353 | ModelCheckpoint( 354 | filepath=network[1], 355 | monitor='val_loss', 356 | save_best_only=True), 357 | TerminateOnNaN() 358 | ] 359 | 360 | if nCPU > 1: 361 | history = model.fit(TrainGenerator, 362 | steps_per_epoch=epochSteps, 363 | epochs=numEpochs, 364 | validation_data=ValidationGenerator, 365 | callbacks=callbacks_list, 366 | use_multiprocessing=True, 367 | max_queue_size=nCPU, 368 | workers=nCPU) 369 | else: 370 | history = model.fit(TrainGenerator, 371 | steps_per_epoch=epochSteps, 372 | epochs=numEpochs, 373 | validation_data=ValidationGenerator, 374 | callbacks=callbacks_list, 375 | use_multiprocessing=False) 376 | 377 | # Write the network 378 | if(network != None): 379 | ##serialize model to JSON 380 | model_json = model.to_json() 381 | with open(network[0], "w") as json_file: 382 | json_file.write(model_json) 383 | 384 | # Load json and create model 385 | if(network != None): 386 | jsonFILE = open(network[0],"r") 387 | loadedModel = jsonFILE.read() 
388 | jsonFILE.close() 389 | model=model_from_json(loadedModel) 390 | model.load_weights(network[1]) 391 | else: 392 | print("Error: model and weights not loaded") 393 | sys.exit(1) 394 | 395 | x,y = TestGenerator.__getitem__(0) 396 | predictions = model.predict(x) 397 | 398 | history.history['loss'] = np.array(history.history['loss']) 399 | history.history['val_loss'] = np.array(history.history['val_loss']) 400 | history.history['predictions'] = np.array(predictions) 401 | history.history['Y_test'] = np.array(y) 402 | history.history['name'] = ModelName 403 | 404 | print("results written to: ",resultsFile) 405 | pickle.dump(history.history, open( resultsFile, "wb" )) 406 | 407 | return None 408 | 409 | #------------------------------------------------------------------------------------------- 410 | 411 | #def indicesGenerator(batchSize,numReps): 412 | # ''' 413 | # Generate indices randomly from range (0,numReps) in batches of size batchSize 414 | # without replacement. 415 | # 416 | # This is for the batch generator to randomly choose trees from a directory 417 | # but make sure 418 | # ''' 419 | # availableIndices = np.arange(numReps) 420 | # np.random.shuffle(availableIndices) 421 | # ci = 0 422 | # while 1: 423 | # if((ci+batchSize) > numReps): 424 | # ci = 0 425 | # np.random.shuffle(availableIndices) 426 | # batchIndices = availableIndices[ci:ci+batchSize] 427 | # ci = ci+batchSize 428 | # 429 | # yield batchIndices 430 | 431 | #------------------------------------------------------------------------------------------- 432 | 433 | def getHapsPosLabels(direc,simulator,shuffle=False): 434 | ''' 435 | loops through a trees directory created by the data generator class 436 | and returns the repsective genotype matrices, positions, and labels 437 | ''' 438 | haps = [] 439 | positions = [] 440 | infoFilename = os.path.join(direc,"info.p") 441 | infoDict = pickle.load(open(infoFilename,"rb")) 442 | labels = infoDict["y"] 443 | 444 | #how many trees files are in this directory. 
445 |     li = os.listdir(direc)
446 |     numReps = len(li) - 1 #minus one for the 'info.p' file
447 | 
448 |     for i in range(numReps):
449 |         filename = str(i) + ".trees"
450 |         filepath = os.path.join(direc,filename)
451 |         treeSequence = msp.load(filepath)
452 |         haps.append(treeSequence.genotype_matrix())
453 |         positions.append(np.array([s.position for s in treeSequence.sites()]))
454 | 
455 | 
456 |     haps = np.array(haps)
457 |     positions = np.array(positions)
458 | 
459 |     return haps,positions,labels
460 | 
461 | #-------------------------------------------------------------------------------------------
462 | 
463 | def simplifyTreeSequenceOnSubSampleSet_stub(ts,numSamples):
464 |     '''
465 |     This function should take in a tree sequence, generate
466 |     a subset the size of numSamples, and return the tree sequence simplified on
467 |     that subset of individuals
468 |     '''
469 | 
470 |     ts = ts.simplify() #is this necessary?
471 |     inds = [ind.id for ind in ts.individuals()]
472 |     sample_subset = np.sort(np.random.choice(inds,numSamples,replace=False))
473 |     sample_nodes = []
474 |     for i in sample_subset:
475 |         ind = ts.individual(i)
476 |         sample_nodes.append(ind.nodes[0])
477 |         sample_nodes.append(ind.nodes[1])
478 | 
479 |     ts = ts.simplify(sample_nodes)
480 | 
481 |     return ts
482 | 
483 | #-------------------------------------------------------------------------------------------
484 | 
485 | def sort_min_diff(amat):
486 |     '''this function takes in a SNP matrix with indv on rows and returns the same matrix with indvs sorted by genetic similarity.
487 |     this problem is NP, so here we use a nearest neighbors approx. it's not perfect, but it's fast and generally performs ok.
488 |     assumes your input matrix is a numpy array'''
489 | 
490 |     mb = NearestNeighbors(n_neighbors=len(amat), metric='manhattan').fit(amat)
491 |     v = mb.kneighbors(amat)
492 |     smallest = np.argmin(v[0].sum(axis=1))
493 |     return amat[v[1][smallest]]
494 | 
495 | #-------------------------------------------------------------------------------------------
496 | 
497 | def mutateTrees(treesDirec,outputDirec,muLow,muHigh,numMutsPerTree=1,simulator="msprime"):
498 |     '''
499 |     read in .trees files from treesDirec, mutate each tree numMutsPerTree separate times
500 |     using a mutation rate pulled from a uniform distribution between muLow and muHigh
501 | 
502 |     also, re-write the info file to reflect the new labels and replicate count.
503 |     '''
504 |     if(numMutsPerTree > 1):
505 |         assert(treesDirec != outputDirec)
506 | 
507 |     if not os.path.exists(outputDirec):
508 |         print("directory '",outputDirec,"' does not exist, creating it")
509 |         os.makedirs(outputDirec)
510 | 
511 |     infoFilename = os.path.join(treesDirec,"info.p")
512 |     infoDict = pickle.load(open(infoFilename,"rb"))
513 |     labels = infoDict["y"]
514 | 
515 |     newLabels = []
516 |     newMaxSegSites = 0
517 | 
518 |     #how many trees files are in this directory.
519 |     li = os.listdir(treesDirec)
520 |     numReps = len(li) - 1 #minus one for the 'info.p' file
521 | 
522 |     for i in range(numReps):
523 |         filename = str(i) + ".trees"
524 |         filepath = os.path.join(treesDirec,filename)
525 |         treeSequence = msp.load(filepath)
526 |         blankTreeSequence = msp.mutate(treeSequence,0)
527 |         rho = labels[i]
528 |         for mut in range(numMutsPerTree):
529 |             simNum = (i*numMutsPerTree) + mut
530 |             simFileName = os.path.join(outputDirec,str(simNum)+".trees")
531 |             mutationRate = np.random.uniform(muLow,muHigh)
532 |             mutatedTreeSequence = msp.mutate(blankTreeSequence,mutationRate)
533 |             mutatedTreeSequence.dump(simFileName)
534 |             newMaxSegSites = max(newMaxSegSites,mutatedTreeSequence.num_sites)
535 |             newLabels.append(rho)
536 | 
537 |     infoCopy = copy.deepcopy(infoDict)
538 |     infoCopy["maxSegSites"] = newMaxSegSites
539 |     if(numMutsPerTree > 1):
540 |         infoCopy["y"] = np.array(newLabels,dtype="float32")
541 |         infoCopy["numReps"] = numReps * numMutsPerTree
542 |     outInfoFilename = os.path.join(outputDirec,"info.p")
543 |     pickle.dump(infoCopy,open(outInfoFilename,"wb"))
544 | 
545 |     return None
546 | 
547 | #-------------------------------------------------------------------------------------------
548 | 
549 | def segSitesStats(treesDirec):
550 |     '''
551 |     DEPRECATED
552 |     '''
553 | 
554 |     infoFilename = os.path.join(treesDirec,"info.p")
555 |     infoDict = pickle.load(open(infoFilename,"rb"))
556 | 
557 |     newLabels = []
558 |     newMaxSegSites = 0
559 | 
560 |     #how many trees files are in this directory.
561 |     li = os.listdir(treesDirec)
562 |     numReps = len(li) - 1 #minus one for the 'info.p' file
563 | 
564 |     segSites = []
565 | 
566 |     for i in range(numReps):
567 |         filename = str(i) + ".trees"
568 |         filepath = os.path.join(treesDirec,filename)
569 |         treeSequence = msp.load(filepath)
570 |         segSites.append(treeSequence.num_sites)
571 | 
572 |     return segSites
573 | 
574 | #-------------------------------------------------------------------------------------------
575 | 
576 | def mae(x,y):
577 |     '''
578 |     Compute mean absolute error between predictions and targets
579 | 
580 |     float[],float[] -> float
581 |     '''
582 |     assert(len(x) == len(y))
583 |     summ = 0.0
584 |     length = len(x)
585 |     for i in range(length):
586 |         summ += abs(x[i] - y[i])
587 |     return summ/length
588 | 
589 | #-------------------------------------------------------------------------------------------
590 | 
591 | def mse(x,y):
592 |     '''
593 |     Compute mean squared error between predictions and targets
594 | 
595 |     float[],float[] -> float
596 |     '''
597 | 
598 |     assert(len(x) == len(y))
599 |     summ = 0.0
600 |     length = len(x)
601 |     for i in range(length):
602 |         summ += (x[i] - y[i])**2
603 |     return summ/length
604 | 
605 | #-------------------------------------------------------------------------------------------
606 | 
607 | def plotResults(resultsFile,saveas):
608 | 
609 |     '''
610 |     plotting code for testing a model on simulation.
611 |     using the resulting pickle file on a training run (resultsFile).
612 |     This function plots the results of the final test set predictions,
613 |     as well as validation loss as a function of Epochs during training.
614 | 615 | ''' 616 | 617 | plt.rc('font', family='serif', serif='Times') 618 | plt.rc('xtick', labelsize=6) 619 | plt.rc('ytick', labelsize=6) 620 | plt.rc('axes', labelsize=6) 621 | 622 | results = pickle.load(open( resultsFile , "rb" )) 623 | 624 | fig,axes = plt.subplots(2,1) 625 | plt.subplots_adjust(hspace=0.5) 626 | 627 | predictions = np.array([float(Y) for Y in results["predictions"]]) 628 | realValues = np.array([float(X) for X in results["Y_test"]]) 629 | 630 | r_2 = round((np.corrcoef(predictions,realValues)[0,1])**2,5) 631 | 632 | mae_0 = round(mae(realValues,predictions),4) 633 | mse_0 = round(mse(realValues,predictions),4) 634 | labels = "$R^{2} = $"+str(r_2)+"\n"+"$mae = $" + str(mae_0)+" | "+"$mse = $" + str(mse_0) 635 | 636 | axes[0].scatter(realValues,predictions,marker = "o", color = 'tab:purple',s=5.0,alpha=0.6) 637 | 638 | lims = [ 639 | np.min([axes[0].get_xlim(), axes[0].get_ylim()]), # min of both axes 640 | np.max([axes[0].get_xlim(), axes[0].get_ylim()]), # max of both axes 641 | ] 642 | axes[0].set_xlim(lims) 643 | axes[0].set_ylim(lims) 644 | axes[0].plot(lims, lims, 'k-', alpha=0.75, zorder=0) 645 | axes[0].set_title(results["name"]+"\n"+labels,fontsize=6) 646 | 647 | lossRowIndex = 1 648 | axes[1].plot(results["loss"],label = "mae loss",color='tab:cyan') 649 | axes[1].plot(results["val_loss"], label= "mae validation loss",color='tab:pink') 650 | 651 | #axes[1].plot(results["mean_squared_error"],label = "mse loss",color='tab:green') 652 | #axes[1].plot(results["val_mean_squared_error"], label= "mse validation loss",color='tab:olive') 653 | 654 | axes[1].legend(frameon = False,fontsize = 6) 655 | axes[1].set_ylabel("mse") 656 | 657 | axes[0].set_ylabel(str(len(predictions))+" msprime predictions") 658 | axes[0].set_xlabel(str(len(realValues))+" msprime real values") 659 | fig.subplots_adjust(left=.15, bottom=.16, right=.85, top=.92,hspace = 0.5,wspace=0.4) 660 | height = 7.00 661 | width = 7.00 662 | 663 | axes[0].grid() 664 | fig.set_size_inches(height, width) 665 | fig.savefig(saveas) 666 | 667 | #------------------------------------------------------------------------------------------- 668 | 669 | def getMeanSDMax(trainDir): 670 | ''' 671 | get the mean and standard deviation of rho from training set 672 | 673 | str -> int,int,int 674 | 675 | ''' 676 | info = pickle.load(open(trainDir+"/info.p","rb")) 677 | rho = info["rho"] 678 | segSites = info["segSites"] 679 | tar_mean = np.mean(rho,axis=0) 680 | tar_sd = np.std(rho,axis=0) 681 | return tar_mean,tar_sd,max(segSites) 682 | 683 | #------------------------------------------------------------------------------------------- 684 | 685 | def unNormalize(mean,sd,data): 686 | ''' 687 | un-zcore-ify. 
do the inverse to get real value predictions 688 | 689 | float,float,float[] -> float[] 690 | ''' 691 | 692 | data *= sd 693 | data += mean ##comment this line out for GRU_TUNED84_RELU 694 | return data 695 | 696 | #------------------------------------------------------------------------------------------- 697 | 698 | def plotParametricBootstrap(results,saveas): 699 | 700 | ''' 701 | Use the location of "out" paramerter to parametric bootstrap 702 | as input to plot the results of said para-boot 703 | ''' 704 | 705 | stats = pickle.load(open(results,'rb')) 706 | x = stats["rho"] 707 | 708 | fig, ax = plt.subplots() 709 | 710 | 711 | for i,s in enumerate(stats): 712 | if(i == 0): 713 | continue 714 | 715 | ax.plot(x,stats[s]) 716 | 717 | lims = [ 718 | np.min([ax.get_xlim(), ax.get_ylim()]), # min of both axes 719 | np.max([ax.get_xlim(), ax.get_ylim()]), # max of both axes 720 | ] 721 | ax.set_xlim(lims) 722 | ax.set_ylim(lims) 723 | ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0) 724 | 725 | fig.savefig(saveas) 726 | 727 | return None 728 | 729 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # *ReLERNN* 2 | ## *Recombination Landscape Estimation using Recurrent Neural Networks* 3 | ==================================================================== 4 | 5 | ReLERNN uses deep learning to infer the genome-wide landscape of recombination from as few as four individually sequenced chromosomes, or from allele frequencies inferred by pooled sequencing. 6 | This repository contains the code and instructions required to run ReLERNN, and includes example files to ensure everything is working properly. The manuscript detailing ReLERNN can be found [here](https://academic.oup.com/mbe/advance-article/doi/10.1093/molbev/msaa038/5741419). 7 | 8 | ## Recommended installation on linux 9 | Install `tensorflow 2` on your system. Directions can be found [here](https://www.tensorflow.org/install). You will also need to install the CUDA toolkit and CuDNN. 10 | ReLERNN requires the use of a CUDA-Enabled NVIDIA GPU. The current version of ReLERNN has been successfully tested with tensorflow/2.2.0, cudatoolkit/10.1.243, and cudnn/7.6.5. 11 | 12 | Further dependencies for ReLERNN can be installed with pip. 13 | This is done with the following commands: 14 | 15 | ```bash 16 | $ git clone https://github.com/kr-colab/ReLERNN.git 17 | $ cd ReLERNN 18 | $ pip install . 19 | ``` 20 | 21 | It should be as simple as that. 22 | 23 | ### Installing `CUDA` 24 | 25 | We are asked often about installing `CUDA` and the NVIDIA requirements. This can be quite finicky depending on your hardware setup, but many users 26 | have had luck installing the `tensorflow`/`cuda` requirements using `mamba` with the following recipe 27 | 28 | ```bash 29 | $ mamba create -n relearnn-1.0.0 -c conda-forge -c nvidia python=3.10 tensorflow=2.15.0 cuda-toolkit h5py -y 30 | # then install ReLERNN as above 31 | $ git clone https://github.com/kr-colab/ReLERNN.git 32 | $ cd ReLERNN 33 | $ pip install . 34 | ``` 35 | 36 | ## Testing ReLERNN 37 | An example VCF file (5 contigs; 10 haploid chromosomes) and a shell script for running ReLERNN's four modules is located in `$/ReLERNN/examples`. 
38 | To test the functionality of ReLERNN simply use the following commands: 39 | 40 | ```bash 41 | $ cd examples 42 | $ ./example_pipeline.sh 43 | ``` 44 | 45 | Provided everything worked as planned, `$ReLERNN/examples/example_output/` should be populated with a few directories along with the files: `example.PREDICT.txt` and `example.PREDICT.BSCORRECT.txt`. 46 | The latter is the finalized output file with your recombination rate predictions and estimates of uncertainty. 47 | 48 | The above example took 57 seconds to complete on a Xeon machine using four CPUs and one NVIDIA 2070 GPU. 49 | Note that the parameters used for this example were designed only to test the success of the installation, not to make accurate predictions. 50 | Please use the guidelines below for the best results when analyzing real data. 51 | 52 | You can now test the functionality of ReLERNN for use with pool-seq data by using the following commands: 53 | 54 | ```bash 55 | $ cd examples 56 | $ ./example_pipeline_pool.sh 57 | ``` 58 | 59 | ## Estimating a recombination landscape from individually sequenced chromosomes 60 | 61 | The ReLERNN pipeline is executed using four commands: `ReLERNN_SIMULATE`, `ReLERNN_TRAIN`, `ReLERNN_PREDICT`, and the optional `ReLERNN_BSCORRECT` (see the [Method flow diagram](./methodFlow.png)). 62 | 63 | ### Before running ReLERNN 64 | ReLERNN takes as input a VCF file of biallelic variants. Users should use appropriate QC techniques (filtering low-quality variants, etc.) and remove non-biallelic variants before running ReLERNN. Small contigs (<< 250 SNPs) should not be included in the genome file `--genome`, though these do not need to be removed from the VCF. 65 | ReLERNN also requires that the number of sampled chromosomes is identical across all contigs, and VCFs should be filtered accordingly. Hemizygous chromosomes or haploid samples in an otherwise diploid dataset 66 | should ideally be run separately using a separate VCF. It is possible to treat hemizygous chromosomes as "diploids with missing data" using the `--forceDiploid` option, however this is not recommended. 67 | It is now possible to run ReLERNN on VCFs with missing genotypes (coded as a `.`). 68 | 69 | If you want to make predictions based on equilibrium simulations, you can skip ahead to executing `ReLERNN_SIMULATE`. 70 | While ReLERNN is generally robust to demographic model misspecification, prediction accuracy may potentially be improved by simulating the training set under a demographic history that accurately matches that of your sample. ReLERNN optionally takes the output files from three popular demographic history inference programs ([stairwayplot_v1](https://sites.google.com/site/jpopgen/stairway-plot), [SMC++](https://github.com/popgenmethods/smcpp), and [MSMC](https://github.com/stschiff/msmc)), and simulates a training set under these histories. Note: for SMC++ use the .csv output (option -c in SMC++). It is up to the user to perform the proper due diligence to ensure that the population size histories reported by these programs are sound. In our opinion, unless you know exactly how these programs work and you expect your data to represent a history dramatically different from equilibrium, you are better off skipping this step and training ReLERNN on equilibrium simulations. Once you have run one of the demographic history inference programs listed above, you simply provide the raw output file from that program to ReLERNN_SIMULATE using the `--demographicHistory` option. 
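As a rough illustration of the pre-filtering and history-aware simulation described above, the sketch below restricts a VCF to biallelic SNPs and then passes the result, together with an inferred demographic history, to `ReLERNN_SIMULATE`. It assumes `bcftools` is installed; `raw.vcf.gz`, `filtered.vcf`, `my_smcpp.csv`, and `./my_project/` are placeholder names, and the parameter values are illustrative only, not recommended settings.

```bash
# Keep only biallelic SNPs (adjust filters to match your own QC pipeline)
bcftools view -m2 -M2 -v snps raw.vcf.gz -O v -o filtered.vcf

# Simulate a training set under an inferred demographic history (illustrative values)
ReLERNN_SIMULATE \
    --vcf filtered.vcf \
    --genome genome.bed \
    --projectDir ./my_project/ \
    --demographicHistory my_smcpp.csv \
    --assumedMu 1e-8 \
    --assumedGenTime 1 \
    --upperRhoThetaRatio 1
```

Remember that for SMC++ the `.csv` output (option `-c`) is the expected input for `--demographicHistory`, as noted above.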
71 | 72 | 73 | ### Step 1) ReLERNN_SIMULATE 74 | `ReLERNN_SIMULATE` reads your VCF file and splits it by chromosome. The chromosomes to be evaluated must be specified by providing a BED file of said positions using the `--genome` argument. A BED-formatted accessibility mask (with non-overlapping ascending windows) may be optionally provided using the `--mask` option. Use the `--phased` or `--unphased` flag to train using phased or unphased genotypes (the default is unphased). It is required that the VCF file use the extension `.vcf`. The prefix of that file will serve as the prefix used for all output files (e.g. running ReLERNN on the file `population7.vcf` will generate the result file `population7.PREDICT.txt`). It is strongly recommended that you use the default setting for `--maxWinSize`, larger values can cause training to fail and smaller values can result in lower accuracy. Users are required to provide an estimate of the per-base mutation rate for your sample, along with an estimate for generation time (in years). If you previously ran one of the demographic history inference programs listed above, just use the same values that you used for them. This is also where you will point to the output from said program, using `--demographicHistory`. If you are not simulating under an inferred history, simply do not include this option. Importantly, you can also set a value for the maximum recombination rate to be simulated using `--upperRhoThetaRatio`. If you have an a priori estimate for an upper bound to the ratio of rho to theta go ahead and set this here. Keep in mind that higher values will dramatically slow the coalescent simulations. We recommend using the default number of train/test/validation simulation examples, but if you want to simulate more examples, go right ahead. `ReLERNN_SIMULATE` then uses msprime to simulate 100k training examples and 1k validation and test examples. All output files will be generated in subdirectories within the path provided to `--projectDir`. It is required that you use the same projectDir for all four ReLERNN commands. If you want to run ReLERNN of multiple populations/taxa, you can run them independently using a unique projectDir for each. This step is simulation heavy and runtimes will strongly depend on the inferred population size. 75 | 76 | The complete list of arguments used in `ReLERNN_SIMULATE` is found below: 77 | ``` 78 | ReLERNN_SIMULATE -h 79 | 80 | usage: ReLERNN_SIMULATE [-h] [-v VCF] [-g GENOME] [-m MASK] [-d OUTDIR] 81 | [-n DEM] [-u MU] [-l GENTIME] [-r UPRTR] [-t NCPU] [-s SEED] 82 | [--phased] [--unphased] [--forceDiploid] [--phaseError PHASEERROR] 83 | [--maxWinSize WINSIZEMX] [--maskThresh MASKTHRESH] 84 | [--nTrain NTRAIN] [--nVali NVALI] [--nTest NTEST] 85 | 86 | optional arguments: 87 | -h, --help show this help message and exit 88 | -v VCF, --vcf VCF Filtered and QC-checked VCF file. Important: Every row 89 | must correspond to a biallelic SNP with no missing 90 | data! 91 | -g GENOME, --genome GENOME 92 | BED-formatted (i.e. zero-based) file corresponding to 93 | chromosomes and positions to consider 94 | -m MASK, --mask MASK BED-formatted file corresponding to inaccessible bases 95 | -d OUTDIR, --projectDir OUTDIR 96 | Directory for all project output. 
NOTE: the same
97 |                         projectDir must be used for all functions of ReLERNN
98 |   -n DEM, --demographicHistory DEM
99 |                         Output file from either stairwayplot, SMC++, or MSMC
100 |   -u MU, --assumedMu MU
101 |                         Assumed per-base mutation rate
102 |   -l GENTIME, --assumedGenTime GENTIME
103 |                         Assumed generation time (in years)
104 |   -r UPRTR, --upperRhoThetaRatio UPRTR
105 |                         Assumed upper bound for the ratio of rho to theta
106 |   -t NCPU, --nCPU NCPU  Number of CPUs to use (defaults to total available cores)
107 |   -s SEED, --seed SEED  Random seed
108 |   --phased              VCF file is phased
109 |   --unphased            VCF file is unphased
110 |   --forceDiploid        Treats all samples as diploids
111 |                         with missing data (bad idea; see README)
112 |   --phaseError PHASEERROR
113 |                         Fraction of bases simulated with incorrect phasing
114 |   --maxWinSize WINSIZEMX
115 |                         Max number of sites per window to train on. Important:
116 |                         too many sites causes problems in training
117 |   --maskThresh MASKTHRESH
118 |                         Discard windows where >= maskThresh percent of sites
119 |                         are inaccessible
120 |   --nTrain NTRAIN       Number of training examples to simulate
121 |   --nVali NVALI         Number of validation examples to simulate
122 |   --nTest NTEST         Number of test examples to simulate
123 | ```
124 | 
125 | 
126 | ### Step 2) ReLERNN_TRAIN
127 | `ReLERNN_TRAIN` takes the simulations created by `ReLERNN_SIMULATE` and uses them to train a recurrent neural network. Again, we recommend using the defaults for `--nEpochs` and `--nValSteps`, but if you would like to do more training, feel free. To set the GPU to be used for machines with multiple dedicated GPUs use `--gpuID` (e.g. if running an analysis on two populations simultaneously, set `--gpuID 0` for the first population and `--gpuID 1` for the second). `ReLERNN_TRAIN` outputs some basic metrics of the training results for you, generating the figure `$/projectDir/networks/vcfprefix.pdf`. The default value of `--nCPU` is 1 for this step, as this often produces the shortest training times per epoch (depending on missing data and the mask). Feel free to test training times using multiple cores, and set `--nCPU` to whatever works best for your data/machine.
128 | 
129 | The complete list of arguments used in `ReLERNN_TRAIN` is found below:
130 | ```
131 | ReLERNN_TRAIN -h
132 | 
133 | usage: ReLERNN_TRAIN [-h] [-d OUTDIR] [--nEpochs NEPOCHS]
134 |                      [-t NCPU] [-s SEED]
135 |                      [--nValSteps NVALSTEPS] [--gpuID GPUID]
136 | 
137 | optional arguments:
138 |   -h, --help            show this help message and exit
139 |   -d OUTDIR, --projectDir OUTDIR
140 |                         Directory for all project output. NOTE: the same
141 |                         projectDir must be used for all functions of ReLERNN
142 |   -t NCPU, --nCPU NCPU  Number of CPUs to use (defaults to 1)
143 |   -s SEED, --seed SEED  Random seed
144 |   --nEpochs NEPOCHS     Number of epochs to train over
145 |   --nValSteps NVALSTEPS
146 |                         Number of validation steps
147 |   --gpuID GPUID         Identifier specifying which GPU to use
148 | ```
149 | 
150 | 
151 | 
152 | ### Step 3) ReLERNN_PREDICT
153 | `ReLERNN_PREDICT` now takes the same VCF file you used in `ReLERNN_SIMULATE` and predicts per-base recombination rates in non-overlapping windows across the genome. The output file of predictions will be created as `$/projectDir/vcfprefix.PREDICT.txt`. It is important to note that the window size used for predictions might be different for different chromosomes. A complete list of the window sizes used for each chromosome can be found in the third column of `$/projectDir/networks/windowSizes.txt`.
Use the optional `--minSites` argument to exclude windows with fewer than the desired number of SNPs. If you are not interested in estimating confidence intervals around the predictions, your ReLERNN analysis is now finished. If you are getting OOM errors at this step you can try setting `--batchSizeOverride` to a value significantly less than the total number of windows along a chromosome (found in the last column of `$/projectDir/networks/windowSizes.txt`).
154 | 
155 | 
156 | The complete list of arguments used in `ReLERNN_PREDICT` is found below:
157 | ```
158 | ReLERNN_PREDICT -h
159 | 
160 | usage: ReLERNN_PREDICT [-h] [-v VCF] [-d OUTDIR] [--minSites MINS]
161 |                        [--gpuID GPUID] [--batchSizeOverride BSO] [-s SEED]
162 | 
163 | optional arguments:
164 |   -h, --help            show this help message and exit
165 |   -v VCF, --vcf VCF     Filtered and QC-checked VCF file. Important: Every row
166 |                         must correspond to a biallelic SNP with no missing
167 |                         data!
168 |   -d OUTDIR, --projectDir OUTDIR
169 |                         Directory for all project output. NOTE: the same
170 |                         projectDir must be used for all functions of ReLERNN
171 |   --phased              VCF file is phased
172 |   --unphased            VCF file is unphased
173 |   --minSites MINS       Minimum number of SNPs in a genomic window required to
174 |                         return a prediction
175 |   --gpuID GPUID         Identifier specifying which GPU to use
176 |   --batchSizeOverride BSO
177 |                         Batch size to use for low memory applications
178 |   -s SEED, --seed SEED  Random seed
179 | 
180 | ```
181 | 
182 | ### Optional Step 4) ReLERNN_BSCORRECT
183 | However, you might want to have an idea of the uncertainty around your predictions. This is where `ReLERNN_BSCORRECT` comes in. `ReLERNN_BSCORRECT` generates 95% confidence intervals around each prediction, and additionally attempts to correct for systematic bias ([see Materials and Methods](https://www.biorxiv.org/content/biorxiv/early/2019/08/16/662247.full.pdf)). It does this by simulating a set of `--nReps` examples at each of `--nSlice` recombination rate bins. It then uses the network that was trained in `ReLERNN_TRAIN` and estimates the distribution of predictions around each known recombination rate. The result is both an estimate of uncertainty, and a prediction that has been slightly corrected to account for biases in how the network predicts in this area of parameter space. The resulting file is created as `$/projectDir/vcfprefix.PREDICT.BSCORRECT.txt`, and is formatted similarly to `$/projectDir/vcfprefix.PREDICT.txt`, with the addition of columns for the low and high 95CI bounds. Note that this step is simulation heavy and runtimes can be slow.
184 | 
185 | The complete list of arguments used in `ReLERNN_BSCORRECT` is found below:
186 | ```
187 | ReLERNN_BSCORRECT -h
188 | 
189 | usage: ReLERNN_BSCORRECT [-h] [-d OUTDIR] [-t NCPU] [-s SEED] [--gpuID GPUID]
190 |                          [--nSlice NSLICE] [--nReps NREPS]
191 | 
192 | optional arguments:
193 |   -h, --help            show this help message and exit
194 |   -d OUTDIR, --projectDir OUTDIR
195 |                         Directory for all project output.
NOTE: the same 196 | projectDir must be used for all functions of ReLERNN 197 | -t NCPU, --nCPU NCPU Number of CPUs to use (defaults to total available cores) 198 | -s SEED, --seed SEED Random seed 199 | --gpuID GPUID Identifier specifying which GPU to use 200 | --nSlice NSLICE Number of recombination rate bins to simulate over 201 | --nReps NREPS Number of simulations per step 202 | ``` 203 | 204 | ## Estimating a recombination landscape from pool-seq data 205 | 206 | Similar to the directions above, the ReLERNN pipeline for pool-seq data is executed using four commands: `ReLERNN_SIMULATE_POOL`, `ReLERNN_TRAIN_POOL`, `ReLERNN_PREDICT_POOL`, and the optional `ReLERNN_BSCORRECT`. 207 | 208 | ### Before running ReLERNN 209 | ReLERNN for pool-seq analyses takes as input a file of genomic positions and allele frequencies (herein a 'POOLFILE'; see example file). 210 | 211 | Similar to ReLERNN for individually sequenced chromosomes, if you want to make predictions based on equilibrium simulations, you can skip ahead to executing `ReLERNN_SIMULATE_POOL`. 212 | While ReLERNN is generally robust to demographic model misspecification, prediction accuracy may potentially be improved by simulating the training set under a demographic history that accurately matches that of your sample. ReLERNN optionally takes the raw output files from three popular demographic history inference programs ([stairwayplot_v1](https://sites.google.com/site/jpopgen/stairway-plot), [SMC++](https://github.com/popgenmethods/smcpp), and [MSMC](https://github.com/stschiff/msmc)), and simulates a training set under these histories. It is up to the user to perform the proper due diligence to ensure that the population size histories reported by these programs are sound. In our opinion, unless you know exactly how these programs work and you expect your data to represent a history dramatically different from equilibrium, you are better off skipping this step and training ReLERNN on equilibrium simulations. Once you have run one of the demographic history inference programs listed above, you simply provide the raw output file from that program to ReLERNN_SIMULATE_POOL using the `--demographicHistory` option. 213 | 214 | 215 | ### Step 1) ReLERNN_SIMULATE_POOL 216 | `ReLERNN_SIMULATE_POOL` reads your POOLFILE and splits it by chromosome. The number of chromosomes in the pool must be specified using the `--sampleDepth` argument. The genomic chromosomes to be evaluated must be specified by providing a BED file of said positions using the `--genome` argument. A BED-formatted accessibility mask (with non-overlapping ascending windows) may be optionally provided using the `--mask` option. It is required that the POOLFILE use the extension `.pool`. The prefix of that file will serve as the prefix used for all output files (e.g. running ReLERNN on the file `population7.pool` will generate the result file `population7.PREDICT.txt`). It is strongly recommended that you use the default setting for `--maxSites`, larger values can cause training to fail and smaller values can result in lower accuracy. Users are required to provide an estimate of the per-base mutation rate for your sample, along with an estimate for generation time (in years). If you previously ran one of the demographic history inference programs listed above, just use the same values that you used for them. This is also where you will point to the output from said program, using `--demographicHistory`. 
If you are not simulating under an inferred history, simply do not include this option. Importantly, you can also set a value for the maximum recombination rate to be simulated using `--upperRhoThetaRatio`. If you have an a priori estimate for an upper bound to the ratio of rho to theta go ahead and set this here. Keep in mind that higher values will dramatically slow the coalescent simulations. We recommend using the default number of train/test/validation simulation examples, but if you want to simulate more examples, go right ahead. `ReLERNN_SIMULATE_POOL` then uses msprime to simulate 100k training examples and 1k validation and test examples. All output files will be generated in subdirectories within the path provided to `--projectDir`. It is required that you use the same projectDir for all four ReLERNN commands. If you want to run ReLERNN of multiple populations/taxa, you can run them independently using a unique projectDir for each. This step is simulation heavy and runtimes will strongly depend on the inferred population size. 217 | 218 | The complete list of arguments used in `ReLERNN_SIMULATE_POOL` is found below: 219 | ``` 220 | ReLERNN_SIMULATE_POOL -h 221 | 222 | usage: ReLERNN_SIMULATE_POOL [-h] [-p POOL] [--sampleDepth SAMD] [-g GENOME] [-m MASK] [-d OUTDIR] 223 | [-n DEM] [-u MU] [-l GENTIME] [-r UPRTR] [-t NCPU] [-s SEED] 224 | [--maxSites WINSIZEMX] [--maskThresh MASKTHRESH] 225 | [--nTrain NTRAIN] [--nVali NVALI] [--nTest NTEST] 226 | 227 | optional arguments: 228 | -h, --help show this help message and exit 229 | -p POOL, --pool POOL Filtered and QC-checked POOL file. 230 | --sampleDepth SAMD Number of chromosomes in pool 231 | -g GENOME, --genome GENOME 232 | BED-formatted (i.e. zero-based) file corresponding to 233 | chromosomes and positions to consider 234 | -m MASK, --mask MASK BED-formatted file corresponding to inaccessible bases 235 | -d OUTDIR, --projectDir OUTDIR 236 | Directory for all project output. NOTE: the same 237 | projectDir must be used for all functions of ReLERNN 238 | -n DEM, --demographicHistory DEM 239 | Output file from either stairwayplot, SMC++, or MSMC 240 | -u MU, --assumedMu MU 241 | Assumed per-base mutation rate 242 | -l GENTIME, --assumedGenTime GENTIME 243 | Assumed generation time (in years) 244 | -r UPRTR, --upperRhoThetaRatio UPRTR 245 | Assumed upper bound for the ratio of rho to theta 246 | -t NCPU, --nCPU NCPU Number of CPUs to use (defaults to total available cores) 247 | -s SEED, --seed SEED Random seed 248 | --maxSites WINSIZEMX 249 | Max number of sites per window to train on. Important: 250 | too many sites causes problems in training 251 | --maskThresh MASKTHRESH 252 | Discard windows where >= maskThresh percent of sites 253 | are inaccessible 254 | --nTrain NTRAIN Number of training examples to simulate 255 | --nVali NVALI Number of validation examples to simulate 256 | --nTest NTEST Number of test examples to simulate 257 | ``` 258 | 259 | 260 | ### Step 2) ReLERNN_TRAIN_POOL 261 | `ReLERNN_TRAIN_POOL` takes the simulations created by `ReLERNN_SIMULATE_POOL` and uses them to train a recurrent neural network. The only difference here is that the mean read depth of the pool must be specified using the `--readDepth` argument. You can also specify a minor allele frequency threshold (`--maf`), if a similar threshold was used to generate your POOLFILE. Again, we recommend using the defaults for `--nEpochs` and `--nValSteps`, but if you would like to do more training, feel free. 
To set the GPU to be used for machines with multiple dedicated GPUs use `--gpuID` (e.g. if running an analysis on two populations simultaneously, set `--gpuID 0` for the first population and `--gpuID 1` for the second). `ReLERNN_TRAIN_POOL` outputs some basic metrics of the training results for you, generating the figure `$/projectDir/networks/poolprefix.pdf`. The default value of `-nCPU` for this step is the max number of available cores, as training on pooled data with a single core can be very slow. 262 | 263 | The complete list of arguments used in `ReLERNN_TRAIN_POOL` is found below: 264 | ``` 265 | ReLERNN_TRAIN_POOL -h 266 | 267 | usage: ReLERNN_TRAIN_POOL [-h] [-d OUTDIR] [--readDepth SEQD] [--maf MAF] [--nEpochs NEPOCHS] 268 | [--nValSteps NVALSTEPS] [-t NCPU] [-s SEED] [--gpuID GPUID] 269 | 270 | optional arguments: 271 | -h, --help show this help message and exit 272 | -d OUTDIR, --projectDir OUTDIR 273 | Directory for all project output. NOTE: the same 274 | projectDir must be used for all functions of ReLERNN 275 | --readDepth SEQD Mean read depth of the pool 276 | --maf MAF discard simulated sites with allele frequencies < maf 277 | --nEpochs NEPOCHS Number of epochs to train over 278 | --nValSteps NVALSTEPS 279 | Number of validation steps 280 | -t NCPU, --nCPU NCPU Number of CPUs to use (defaults to total available cores) 281 | -s SEED, --seed SEED Random seed 282 | --gpuID GPUID Identifier specifying which GPU to use 283 | ``` 284 | 285 | 286 | 287 | ### Step 3) ReLERNN_PREDICT_POOL 288 | `ReLERNN_PREDICT_POOL` now takes the same POOL file you used in `ReLERNN_SIMULATE_POOL` and predicts per-base recombination rates in non-overlapping windows across the genome. The output file of predictions will be created as `$/projectDir/poolprefix.PREDICT.txt`. It is important to note that the window size used for predictions might be different for different chromosomes. A complete list of the window sizes used for each chromosome can be found in third column of `$/projectDir/networks/windowSizes.txt`. Use the optional `--minSites` argument to exclude windows with fewer than the desired number of SNPs. If you are not interested in estimating confidence intervals around the predictions, your ReLERNN analysis is now finished. If you are getting OOM errors at this step you can try setting `--batchSizeOverride` to a value significantly less than the total number of windows along a chromosome (found in the last column of `$/projectDir/networks/windowSizes.txt`). 289 | 290 | 291 | The complete list of arguments used in `ReLERNN_PREDICT_POOL` is found below: 292 | ``` 293 | ReLERNN_PREDICT_POOL -h 294 | 295 | usage: ReLERNN_PREDICT [-h] [-p POOL] [-d OUTDIR] [--minSites MINS] 296 | [--batchSizeOverride BSO] [--gpuID GPUID] [-s SEED] 297 | 298 | optional arguments: 299 | -h, --help show this help message and exit 300 | -p POOL, --pool POOL Filtered and QC-checked POOL file. 301 | -d OUTDIR, --projectDir OUTDIR 302 | Directory for all project output. NOTE: the same 303 | projectDir must be used for all functions of ReLERNN 304 | --minSites MINS Minimum number of SNPs in a genomic window required to 305 | return a prediction 306 | --batchSizeOverride BSO 307 | Batch size to use for low memory applications 308 | --gpuID GPUID Identifier specifying which GPU to use 309 | -s SEED, --seed SEED Random seed 310 | ``` 311 | 312 | ### Optional Step 4) ReLERNN_BSCORRECT 313 | This step is exactly the same as in ReLERNN for individually sequenced chromosomes (above). 
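For reference, a minimal sketch of this step for a pool-seq project is shown below; the project directory is a placeholder, and the bin and replicate counts are illustrative values rather than recommended defaults.

```bash
# Correct predictions and add 95% CIs for a pool-seq project (illustrative values)
ReLERNN_BSCORRECT \
    --projectDir ./my_pool_project/ \
    --nSlice 100 \
    --nReps 100 \
    --seed 42
```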
314 | 315 | The complete list of arguments used in `ReLERNN_BSCORRECT` is found below: 316 | ``` 317 | ReLERNN_BSCORRECT -h 318 | 319 | usage: ReLERNN_BSCORRECT [-h] [-d OUTDIR] [-t NCPU] [-s SEED] [--gpuID GPUID] 320 | [--nSlice NSLICE] [--nReps NREPS] 321 | 322 | optional arguments: 323 | -h, --help show this help message and exit 324 | -d OUTDIR, --projectDir OUTDIR 325 | Directory for all project output. NOTE: the same 326 | projectDir must be used for all functions of ReLERNN 327 | -t NCPU, --nCPU NCPU Number of CPUs to use (defaults to total available cores) 328 | -s SEED, --seed SEED Random seed 329 | --gpuID GPUID Identifier specifying which GPU to use 330 | --nSlice NSLICE Number of recombination rate bins to simulate over 331 | --nReps NREPS Number of simulations per step 332 | ``` 333 | --------------------------------------------------------------------------------