├── KERAS ├── keras_CNN_BOOT.py ├── keras_CNN_apply.py ├── keras_CNN_TOPO_trainmore.py └── keras_CNN_TOPO.py ├── INDELible ├── indelible_controlgen_INDEL001_ANTI_WARNOW.R ├── indelible_controlgen_INDEL001_WARNOW.R ├── indelible_controlgen_FE_ULTRASHORT_NOINDEL.R ├── indelible_controlgen_FE_ULTRASHORT_INDEL001.R ├── indelible_controlgen_NOINDEL.R ├── indelible_controlgen_INDEL001.R ├── .ipynb_checkpoints │ └── indelible_controlgen_INDEL001-checkpoint.R ├── indelible_controlgen_REGIONS_NOINDEL0.R ├── indelible_controlgen_REGIONS_INDEL001.R └── figure_prep.R ├── Utils └── fasta2numeric.py └── README.md /KERAS/keras_CNN_BOOT.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import tensorflow as tf 3 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) 4 | import pandas 5 | from itertools import product 6 | import sys, argparse, os 7 | import numpy as np 8 | from math import log, ceil 9 | from scipy.stats import multinomial, chi2 10 | from math import factorial 11 | from keras.utils import plot_model, to_categorical 12 | from keras.models import Model, Sequential, load_model 13 | from keras.layers import Input, Dense, Flatten, Dropout, BatchNormalization, ZeroPadding2D 14 | from keras.layers.convolutional import Conv2D 15 | from keras.layers.pooling import MaxPooling2D, AveragePooling2D 16 | from keras.layers.merge import concatenate 17 | from keras.callbacks import EarlyStopping, ModelCheckpoint 18 | from keras.optimizers import Adam 19 | 20 | #N unrooted trees given N taxa 21 | def n_unroot(Ntaxa): 22 | N=factorial(2*Ntaxa-5)/(factorial(Ntaxa-3)*2**(Ntaxa-3)) 23 | return(int(N)) 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser(description='Keras BOOT run') 28 | parser.add_argument( '--test', help = "Test dataset in FASTA",dest='TEST') 29 | parser.add_argument( '--lab', help = "Labels of TEST dataset",dest='LAB') 30 | parser.add_argument( '-w', help = "Weights file",dest='WEIGHTS') 31 | parser.add_argument( '-k', help = "Keras model",dest='JASON') 32 | parser.add_argument( '-b', help = "N bootstrap replicates", type=int, dest='NBoot') 33 | parser.add_argument( '-N', help = "N taxa", type=int, dest='Ntaxa') 34 | args = parser.parse_args() 35 | 36 | #Read inputs 37 | print("Reading input") 38 | test_data1=np.load(args.TEST) 39 | print("Done") 40 | 41 | #Model and Weight load 42 | loaded_model = load_model(args.JASON) 43 | loaded_model.summary() 44 | loaded_model.load_weights(args.WEIGHTS) 45 | loaded_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy']) 46 | 47 | #Generate labels 48 | class_lab=np.loadtxt(args.LAB) 49 | Nlabels=n_unroot(args.Ntaxa) 50 | #Bootstrapping 51 | cnn_boot=[] 52 | for i in range(0,len(test_data1)): 53 | aln=test_data1[i] 54 | booty=[] 55 | for b in range(0,args.NBoot): 56 | site=np.argwhere(aln[0,]>=0)[:,0] 57 | pad=np.argwhere(aln[0,]<0)[:,0] 58 | boot_site=np.random.choice(site,len(site),replace=True) 59 | boot_site=np.concatenate((boot_site,pad),axis=None) 60 | boot_aln=aln[:,boot_site] 61 | booty.append(boot_aln) 62 | booty=np.array(booty) 63 | boot_lab=to_categorical(np.repeat(class_lab[i], args.NBoot),num_classes=Nlabels) 64 | boot_eval = loaded_model.evaluate(booty,boot_lab,batch_size=300, verbose=1, steps=None) 65 | cnn_boot.append(boot_eval) 66 | np.savetxt("test.boots.txt",np.array(cnn_boot)[:,1],fmt='%f') 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /INDELible/indelible_controlgen_INDEL001_ANTI_WARNOW.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | library('phangorn') 7 | library('MCMCpack') 8 | library('dplyr') 9 | library('scales') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 20 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 21 | model=model 22 | model_id=model_id+1 23 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',0,' ',0,' 0','\n [indelmodel] POW 1.5 50\n [indelrate] 0.01'),file,append=T) 24 | 25 | } 26 | return(models_selected) 27 | } 28 | #TREE generating function 29 | tree_gen=function(tr,n_sim) 30 | { 31 | nbranch=length(tr$edge[,1]) 32 | tr$edge.lengths=rep(0,nbranch) 33 | tr_newick=unlist(strsplit(write.tree(tr),"")) 34 | boot=matrix(runif(nbranch*n_sim,0,0.5),ncol=nbranch) 35 | pos=which(tr_newick==0) 36 | trees=data.frame(t(tr_newick)) 37 | trees=trees[rep(1,n_sim),] 38 | trees[,pos]=boot[,1:5] 39 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 40 | return(tree_list) 41 | } 42 | 43 | 44 | indelib_gen=function(n_taxa,n_sim,aln_length,parameter,region) # n_sim = number of simulations per topology 45 | { 46 | dir.create(region, showWarnings = FALSE) 47 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 48 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 49 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 50 | iter=0 51 | for (tr in all_topo) 52 | { 53 | iter=iter+1 54 | dir.create(paste(region,"/topo",iter,sep=""), showWarnings = FALSE) 55 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(region,"/topo",iter,'/control.txt',sep="")) 56 | n_datasets=n_sim 57 | #Set MODEL block 58 | modelset=rep('JC',n_datasets) 59 | MODEL=model_gen(modelset,paste(region,"/topo",iter,'/control.txt',sep="")) 60 | #Set TREE block 61 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 62 | print(iter) 63 | print("Newick") 64 | NEWICK=tree_gen(all_topo[[iter]],n_sim) 65 | print("Done newick") 66 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 67 | #Set PARTITIONS block 68 | PNAME=paste("p",1:n_datasets,sep="") 69 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 70 | #Set EVOLVE block 71 | write('[EVOLVE]',paste(region,"/topo",iter,'/control.txt',sep=""),append=T) 72 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 73 | } 74 | } 75 | for (r in c("WARNOW")) 76 | { 77 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3]),as.numeric(args[4]),r) 78 | } 79 | -------------------------------------------------------------------------------- /INDELible/indelible_controlgen_INDEL001_WARNOW.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | library('phangorn') 7 | library('MCMCpack') 8 | library('dplyr') 9 | library('scales') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 20 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 21 | model=model 22 | model_id=model_id+1 23 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',1,' ',0,' 0','\n [indelmodel] POW 1.5 50\n [indelrate] 0.01'),file,append=T) 24 | 25 | } 26 | return(models_selected) 27 | } 28 | #TREE generating function 29 | tree_gen=function(tr,n_sim) 30 | { 31 | nbranch=length(tr$edge[,1]) 32 | tr$edge.lengths=rep(0,nbranch) 33 | tr_newick=unlist(strsplit(write.tree(tr),"")) 34 | boot=matrix(runif(nbranch*n_sim,0,0.5),ncol=nbranch) 35 | pos=which(tr_newick==0) 36 | trees=data.frame(t(tr_newick)) 37 | trees=trees[rep(1,n_sim),] 38 | trees[,pos]=boot[,1:5] 39 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 40 | return(tree_list) 41 | } 42 | 43 | 44 | indelib_gen=function(n_taxa,n_sim,aln_length,parameter,region) # n_sim = number of simulations per topology 45 | { 46 | dir.create(region, showWarnings = FALSE) 47 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 48 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 49 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 50 | iter=0 51 | for (tr in all_topo) 52 | { 53 | iter=iter+1 54 | dir.create(paste(region,"/topo",iter,sep=""), showWarnings = FALSE) 55 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(region,"/topo",iter,'/control.txt',sep="")) 56 | n_datasets=n_sim 57 | #Set MODEL block 58 | modelset=rep('JC',n_datasets) 59 | MODEL=model_gen(modelset,paste(region,"/topo",iter,'/control.txt',sep="")) 60 | #Set TREE block 61 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 62 | print(iter) 63 | print("Newick") 64 | NEWICK=tree_gen(all_topo[[iter]],n_sim) 65 | print("Done newick") 66 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 67 | #Set PARTITIONS block 68 | PNAME=paste("p",1:n_datasets,sep="") 69 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 70 | #Set EVOLVE block 71 | write('[EVOLVE]',paste(region,"/topo",iter,'/control.txt',sep=""),append=T) 72 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 73 | } 74 | } 75 | for (r in c("WARNOW")) 76 | { 77 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3]),as.numeric(args[4]),r) 78 | } 79 | 80 | 81 | -------------------------------------------------------------------------------- /Utils/fasta2numeric.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas 3 | from itertools import product 4 | import sys, argparse, os 5 | import numpy as np 6 | from math import log, ceil 7 | from scipy.stats import multinomial, chi2 8 | from math import factorial 9 | 10 | ###Convert TRAIN VALID TEST to numeric and save as nupy array 11 | 12 | #N unrooted trees given N taxa 13 | def n_unroot(Ntaxa): 14 | N=factorial(2*Ntaxa-5)/(factorial(Ntaxa-3)*2**(Ntaxa-3)) 15 | return(int(N)) 16 | 17 | #Read FASTA convert to numeric 18 | def fasta_pars(aln_file,seq_number,Lmax): 19 | aln=open(aln_file) 20 | dic={'A':'0','T':'1','C':'2','G':'3','-':'4'} 21 | matrix_out=[] 22 | fasta_dic={} 23 | for line in aln: 24 | if line[0]==">": 25 | header=line[1:].rstrip('\n').strip() 26 | fasta_dic[header]=[] 27 | elif line[0].isalpha() or line[0]=='-': 28 | for base, num in dic.items(): 29 | line=line[:].rstrip('\n').strip().replace(base,num) 30 | line=list(line) 31 | line=[int(n) for n in line] 32 | #Mkae all matrices of equal length for CNN +[-15]*(Lmax-len(line)) 33 | fasta_dic[header]+=line+[-15]*(Lmax-len(line)) 34 | if len(fasta_dic)==seq_number: 35 | taxa_block=[] 36 | for taxa in sorted(list(fasta_dic.keys())): 37 | taxa_block.append(fasta_dic[taxa.strip()]) 38 | fasta_dic={} 39 | matrix_out.append(taxa_block) 40 | return(np.array(matrix_out)) 41 | 42 | #Read training, validation and test datasets to equalize sizes 43 | def tv_parse(train,valid,test,seq_number): 44 | tr=open(train) 45 | va=open(valid) 46 | te=open(test) 47 | LT=max([len(r.strip()) for r in tr]) 48 | print("Training largest alignment: "+str(LT)) 49 | LV=max([len(r.strip()) for r in va]) 50 | print("Validation largest alignment: "+str(LV)) 51 | LTE=max([len(r.strip()) for r in te]) 52 | print("Testing largest alignment: "+str(LTE)) 53 | Lmax=max([LT]+[LV]+[LTE]) 54 | tr=fasta_pars(train,seq_number,Lmax) 55 | va=fasta_pars(valid,seq_number,Lmax) 56 | te=fasta_pars(test,seq_number,Lmax) 57 | print("Training N: "+str(len(tr))) 58 | print("Validation N: "+str(len(va))) 59 | print("Testing N: "+str(len(te))) 60 | return tr, va, te 61 | 62 | def main(): 63 | parser = argparse.ArgumentParser(description='fasta2numeric conversion Ready for Keras') 64 | parser.add_argument( '-t', help = "Training dataset in FASTA",dest='TRAIN') 65 | parser.add_argument( '-v', help = "Validation dataset in FASTA",dest='VALID') 66 | parser.add_argument( '--test', help = "Test dataset in FASTA",dest='TEST') 67 | parser.add_argument( '-N', help = "N taxa", type=int, dest='Ntaxa') 68 | args = parser.parse_args() 69 | 70 | print("Reading input") 71 | train_data1, valid_data1, test_data1 = tv_parse(args.TRAIN,args.VALID,args.TEST,args.Ntaxa) 72 | #Reshape for Keras 73 | train_data1=train_data1.reshape(train_data1.shape[0],train_data1.shape[1],train_data1.shape[2],1) 74 | valid_data1=valid_data1.reshape(valid_data1.shape[0],valid_data1.shape[1],valid_data1.shape[2],1) 75 | test_data1=test_data1.reshape(test_data1.shape[0],test_data1.shape[1],test_data1.shape[2],1) 76 | np.save("TRAIN", train_data1) 77 | np.save("VALID", valid_data1) 78 | np.save("TEST", test_data1) 79 | 80 | if __name__ == "__main__": 81 | main() 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /KERAS/keras_CNN_apply.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import tensorflow as tf 3 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) 4 | from itertools import product 5 | import sys, argparse, os 6 | import numpy as np 7 | from math import log 8 | from scipy.stats import multinomial, chi2 9 | from math import factorial 10 | from keras.utils import plot_model, to_categorical 11 | from keras.models import Model, Sequential 12 | from keras.layers import Input, Dense, Flatten, Dropout, BatchNormalization 13 | from keras.layers.convolutional import Conv2D 14 | from keras.layers.pooling import MaxPooling2D 15 | from keras.layers.merge import concatenate 16 | from keras.callbacks import EarlyStopping, ModelCheckpoint 17 | from keras.models import model_from_json 18 | from keras.models import load_model 19 | #N unrooted trees given N taxa 20 | def n_unroot(Ntaxa): 21 | N=factorial(2*Ntaxa-5)/(factorial(Ntaxa-3)*2**(Ntaxa-3)) 22 | return(int(N)) 23 | 24 | #Generate Numpy array of labels for Keras 25 | def label_gen(classes,n_labels): 26 | vec=[[0]*classes]*int(n_labels) 27 | vec=np.array(vec) 28 | section=int(int(n_labels)/int(classes)) 29 | for cl in range(0,classes): 30 | vec[int(cl*section):int(section+section*cl),cl]=1 31 | return(vec) 32 | 33 | #Read FASTA convert to numeric 34 | def fasta_pars(aln_file,seq_number,Lmax): 35 | aln=open(aln_file) 36 | #Lmax=max([len(r) for r in aln]) 37 | aln=open(aln_file) 38 | dic={'A':'0','T':'1','C':'2','G':'3','-':'4'} 39 | matrix_out=[] 40 | fasta_dic={} 41 | for line in aln: 42 | if line[0]==">": 43 | header=line[1:].rstrip('\n').strip() 44 | fasta_dic[header]=[] 45 | elif line[0].isalpha() or line[0]=='-': 46 | for base, num in dic.items(): 47 | line=line[:].rstrip('\n').strip().replace(base,num) 48 | line=list(line) 49 | line=[int(n) for n in line] 50 | #Mkae all matrices of equal length for CNN +[-15]*(Lmax-len(line)) 51 | fasta_dic[header]+=line+[-15]*(Lmax-len(line)) 52 | if len(fasta_dic)==seq_number: 53 | taxa_block=[] 54 | for taxa in sorted(list(fasta_dic.keys())): 55 | taxa_block.append(fasta_dic[taxa.strip()]) 56 | fasta_dic={} 57 | matrix_out.append(taxa_block) 58 | return(np.array(matrix_out)) 59 | 60 | 61 | def main(): 62 | parser = argparse.ArgumentParser(description='Keras run') 63 | parser.add_argument( '-t', help = "Evaluation dataset in FASTA",dest='EVAL') 64 | parser.add_argument( '-w', help = "Weights file",dest='WEIGHTS') 65 | parser.add_argument( '-k', help = "Keras model",dest='JASON') 66 | parser.add_argument( '-N', help = "N taxa", type=int, dest='Ntaxa') 67 | args = parser.parse_args() 68 | 69 | loaded_model = load_model(args.JASON) 70 | loaded_model.summary() 71 | Lmax=loaded_model.layers[0].get_output_at(0).get_shape().as_list()[2] 72 | #Read eval 73 | print("Reading input") 74 | eval_data=fasta_pars(args.EVAL,args.Ntaxa,Lmax) 75 | eval_data=eval_data.reshape(eval_data.shape[0],eval_data.shape[1],eval_data.shape[2],1) 76 | print("Done") 77 | 78 | #Generate labels 79 | Nlabels=n_unroot(args.Ntaxa) 80 | eval_label=to_categorical(np.repeat(range(0,Nlabels),len(eval_data)/Nlabels), num_classes=None) 81 | 82 | # load weights into new model 83 | print("Loading weights from file") 84 | loaded_model.load_weights(args.WEIGHTS) 85 | loaded_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy']) 86 | 87 | evals = loaded_model.evaluate(eval_data,eval_label,verbose=1, steps=None) 88 | classes = loaded_model.predict(eval_data, verbose=1, steps=None) 89 | np.savetxt("test.evals.txt",evals,fmt='%f') 90 | np.savetxt("test.classes.txt",classes,fmt='%f') 91 | class_lab = classes.argmax(axis=-1) 92 | np.savetxt("test.classeslab.txt",class_lab,fmt='%f') 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /INDELible/indelible_controlgen_FE_ULTRASHORT_NOINDEL.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | library('phangorn') 7 | library('MCMCpack') 8 | library('dplyr') 9 | library('scales') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | #Invariant sites Unif 20 | I=runif(1,0,1) 21 | A=runif(1,0,5) 22 | #Nucl proportions DIRICHLET 23 | options(digits=5) 24 | Pi=format(rdirichlet(1, alpha=c(5,5,5,5))) 25 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 26 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 27 | options(digits=2) 28 | if (model %in% c('HKY','K80')){ 29 | model=paste(c(model,' ',format(runif(1,0,3))),sep = '') 30 | } else if (model == 'TrN'){ 31 | model=paste(c(model,' ',format(runif(2,0,3))),sep = '') 32 | } else if (model %in% c('TIM' ,'TIMef')){ 33 | model=paste(c(model,' ',format(runif(3,0,3))),sep = '') 34 | } else if (model == 'TVM'){ 35 | model=paste(c(model,' ',format(runif(4,0,3))),sep = '') 36 | } else if (model %in% c('SYM','GTR')){ 37 | model=paste(c(model,' ',format(runif(5,0,3))),sep = '') 38 | } else if (model == 'UNREST'){ 39 | model=paste(c(model,' ',format(runif(11,0,3))),sep = '') 40 | } else { 41 | model=model 42 | } 43 | model_id=model_id+1 44 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',I,' ',A,' 0'),file,append=T) 45 | if (model_orig %in% c('F81','HKY','TrN','TIM','TVM','GTR')) 46 | { 47 | write(paste(' [statefreq]',paste(Pi,collapse=' ')),file,append=T) 48 | } 49 | } 50 | return(models_selected) 51 | } 52 | 53 | 54 | tree_genFE_SHORTULTRA=function(tr,n_sim) 55 | { 56 | nbranch=length(tr$edge[,1]) 57 | tr$edge.lengths=rep(0,nbranch) 58 | tr_newick=unlist(strsplit(write.tree(tr),"")) 59 | boot1=matrix(runif(nbranch*n_sim,c(0.1,0,0,0.1,0,0.1,0,0,0,0.1,0,0.1,0,0.1,0,0,0.1,0,0,0.1),c(0.5,0.05,0.05,0.5,0.05,0.5,0.05,0.05,0.05,0.5,0.05,0.5,0.05,0.5,0.05,0.05,0.5,0.05,0.05,0.5)),ncol=5,byrow=T) 60 | boot2=matrix(runif(nbranch*n_sim,c(0,0,0,0,0),c(0.01,0.01,0.01,0.01,0.01)),ncol=5,byrow=T) 61 | boot=rbind(boot1,boot2) 62 | boot=boot[sample(1:nrow(boot),n_sim),] 63 | pos=which(tr_newick==0) 64 | trees=data.frame(t(tr_newick)) 65 | trees=trees[rep(1,n_sim),] 66 | trees[,pos]=boot[,1:5] 67 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 68 | return(tree_list) 69 | } 70 | 71 | 72 | 73 | 74 | indelib_gen=function(n_taxa,n_sim,aln_length) # n_sim = number of simulations per topology 75 | { 76 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 77 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 78 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 79 | iter=0 80 | for (tr in all_topo) 81 | { 82 | iter=iter+1 83 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(iter,'control.txt',sep="")) 84 | n_datasets=n_sim 85 | #Set MODEL block 86 | modelset=sample(c('JC','TIM','TIMef','GTR','UNREST'),n_datasets,replace=T) 87 | MODEL=model_gen(modelset,paste(iter,'control.txt',sep="")) 88 | #Set TREE block 89 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 90 | print(iter) 91 | print("Newick") 92 | NEWICK=tree_genFE_SHORTULTRA(all_topo[[iter]],n_sim) 93 | print("Done newick") 94 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 95 | #Set PARTITIONS block 96 | PNAME=paste("p",1:n_datasets,sep="") 97 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 98 | #Set EVOLVE block 99 | write('[EVOLVE]',paste(iter,'control.txt',sep=""),append=T) 100 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 101 | } 102 | } 103 | 104 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3])) -------------------------------------------------------------------------------- /INDELible/indelible_controlgen_FE_ULTRASHORT_INDEL001.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | library('phangorn') 7 | library('MCMCpack') 8 | library('dplyr') 9 | library('scales') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | #Invariant sites Unif 20 | I=runif(1,0,1) 21 | A=runif(1,0,5) 22 | #Nucl proportions DIRICHLET 23 | options(digits=5) 24 | Pi=format(rdirichlet(1, alpha=c(5,5,5,5))) 25 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 26 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 27 | options(digits=2) 28 | if (model %in% c('HKY','K80')){ 29 | model=paste(c(model,' ',format(runif(1,0,3))),sep = '') 30 | } else if (model == 'TrN'){ 31 | model=paste(c(model,' ',format(runif(2,0,3))),sep = '') 32 | } else if (model %in% c('TIM' ,'TIMef')){ 33 | model=paste(c(model,' ',format(runif(3,0,3))),sep = '') 34 | } else if (model == 'TVM'){ 35 | model=paste(c(model,' ',format(runif(4,0,3))),sep = '') 36 | } else if (model %in% c('SYM','GTR')){ 37 | model=paste(c(model,' ',format(runif(5,0,3))),sep = '') 38 | } else if (model == 'UNREST'){ 39 | model=paste(c(model,' ',format(runif(11,0,3))),sep = '') 40 | } else { 41 | model=model 42 | } 43 | model_id=model_id+1 44 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',I,' ',A,' 0','\n [indelmodel] POW 1.5 50\n [indelrate] 0.01'),file,append=T) 45 | if (model_orig %in% c('F81','HKY','TrN','TIM','TVM','GTR')) 46 | { 47 | write(paste(' [statefreq]',paste(Pi,collapse=' ')),file,append=T) 48 | } 49 | } 50 | return(models_selected) 51 | } 52 | 53 | 54 | tree_genFE_SHORTULTRA=function(tr,n_sim) 55 | { 56 | nbranch=length(tr$edge[,1]) 57 | tr$edge.lengths=rep(0,nbranch) 58 | tr_newick=unlist(strsplit(write.tree(tr),"")) 59 | boot1=matrix(runif(nbranch*n_sim,c(0.1,0,0,0.1,0,0.1,0,0,0,0.1,0,0.1,0,0.1,0,0,0.1,0,0,0.1),c(0.5,0.05,0.05,0.5,0.05,0.5,0.05,0.05,0.05,0.5,0.05,0.5,0.05,0.5,0.05,0.05,0.5,0.05,0.05,0.5)),ncol=5,byrow=T) 60 | boot2=matrix(runif(nbranch*n_sim,c(0,0,0,0,0),c(0.01,0.01,0.01,0.01,0.01)),ncol=5,byrow=T) 61 | boot=rbind(boot1,boot2) 62 | boot=boot[sample(1:nrow(boot),n_sim),] 63 | pos=which(tr_newick==0) 64 | trees=data.frame(t(tr_newick)) 65 | trees=trees[rep(1,n_sim),] 66 | trees[,pos]=boot[,1:5] 67 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 68 | return(tree_list) 69 | } 70 | 71 | 72 | 73 | 74 | indelib_gen=function(n_taxa,n_sim,aln_length) # n_sim = number of simulations per topology 75 | { 76 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 77 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 78 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 79 | iter=0 80 | for (tr in all_topo) 81 | { 82 | iter=iter+1 83 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(iter,'control.txt',sep="")) 84 | n_datasets=n_sim 85 | #Set MODEL block 86 | modelset=sample(c('JC','TIM','TIMef','GTR','UNREST'),n_datasets,replace=T) 87 | MODEL=model_gen(modelset,paste(iter,'control.txt',sep="")) 88 | #Set TREE block 89 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 90 | print(iter) 91 | print("Newick") 92 | NEWICK=tree_genFE_SHORTULTRA(all_topo[[iter]],n_sim) 93 | print("Done newick") 94 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 95 | #Set PARTITIONS block 96 | PNAME=paste("p",1:n_datasets,sep="") 97 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 98 | #Set EVOLVE block 99 | write('[EVOLVE]',paste(iter,'control.txt',sep=""),append=T) 100 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 101 | } 102 | } 103 | 104 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3])) -------------------------------------------------------------------------------- /INDELible/indelible_controlgen_NOINDEL.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | library('phangorn') 7 | library('MCMCpack') 8 | library('dplyr') 9 | library('scales') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | #Invariant sites Unif 20 | I=runif(1,0,1) 21 | A=runif(1,0,5) 22 | #Nucl proportions DIRICHLET 23 | options(digits=5) 24 | Pi=format(rdirichlet(1, alpha=c(5,5,5,5))) 25 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 26 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 27 | options(digits=2) 28 | if (model %in% c('HKY','K80')){ 29 | model=paste(c(model,' ',format(runif(1,0,3))),sep = '') 30 | } else if (model == 'TrN'){ 31 | model=paste(c(model,' ',format(runif(2,0,3))),sep = '') 32 | } else if (model %in% c('TIM' ,'TIMef')){ 33 | model=paste(c(model,' ',format(runif(3,0,3))),sep = '') 34 | } else if (model == 'TVM'){ 35 | model=paste(c(model,' ',format(runif(4,0,3))),sep = '') 36 | } else if (model %in% c('SYM','GTR')){ 37 | model=paste(c(model,' ',format(runif(5,0,3))),sep = '') 38 | } else if (model == 'UNREST'){ 39 | model=paste(c(model,' ',format(runif(11,0,3))),sep = '') 40 | } else { 41 | model=model 42 | } 43 | model_id=model_id+1 44 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',I,' ',A,' 0'),file,append=T) 45 | if (model_orig %in% c('F81','HKY','TrN','TIM','TVM','GTR')) 46 | { 47 | write(paste(' [statefreq]',paste(Pi,collapse=' ')),file,append=T) 48 | } 49 | } 50 | return(models_selected) 51 | } 52 | #TREE generating function 53 | tree_gen=function(tr,n_sim) 54 | { 55 | nbranch=length(tr$edge[,1]) 56 | tr$edge.lengths=rep(0,nbranch) 57 | tr_newick=unlist(strsplit(write.tree(tr),"")) 58 | v_select=combn(1:5,2) 59 | boot=c() 60 | 61 | space_t=matrix(sample(rbeta(100000000,c(0.1,0.5,1),c(0.1,0.5,1))),ncol=5) 62 | #AS assymetry score NB neigbour sum + L tree length 63 | AS=apply(space_t[,v_select[1,]]-space_t[,v_select[2,]],1,sum) 64 | LB=2*apply(space_t,1,sum)+apply(space_t[,2:4],1,sum) 65 | x=seq(-6,6,0.1) 66 | y=seq(0,13,0.1) 67 | m=matrix(1:(length(x)*length(y)),nrow=length(x),ncol=length(y)) 68 | xint=findInterval(AS,x) 69 | yint=findInterval(LB,y) 70 | all_t=data.frame(space_t,AS=AS,LB=LB,XI=xint,YI=yint,Fact=m[cbind(xint,yint)]) 71 | #Uniform sampling from tree space 72 | for (i in 1:50) 73 | { 74 | sampletree=data.frame(all_t %>% group_by(Fact) %>% sample_n(size = 1,replace=F)) 75 | boot=rbind(boot,sampletree) 76 | } 77 | boot=boot[sample(nrow(boot),n_sim),] 78 | #plot(boot$AS,boot$LB,col=alpha("black",0.05),pch=16) 79 | pos=which(tr_newick==0) 80 | trees=data.frame(t(tr_newick)) 81 | trees=trees[rep(1,n_sim),] 82 | trees[,pos]=boot[,1:5] 83 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 84 | return(tree_list) 85 | } 86 | 87 | 88 | 89 | 90 | indelib_gen=function(n_taxa,n_sim,aln_length) # n_sim = number of simulations per topology 91 | { 92 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 93 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 94 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 95 | iter=0 96 | for (tr in all_topo) 97 | { 98 | iter=iter+1 99 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(iter,'control.txt',sep="")) 100 | n_datasets=n_sim 101 | #Set MODEL block 102 | modelset=sample(c('JC','TIM','TIMef','GTR','UNREST'),n_datasets,replace=T) 103 | MODEL=model_gen(modelset,paste(iter,'control.txt',sep="")) 104 | #Set TREE block 105 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 106 | print(iter) 107 | print("Newick") 108 | NEWICK=tree_gen(all_topo[[iter]],n_sim) 109 | print("Done newick") 110 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 111 | #Set PARTITIONS block 112 | PNAME=paste("p",1:n_datasets,sep="") 113 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 114 | #Set EVOLVE block 115 | write('[EVOLVE]',paste(iter,'control.txt',sep=""),append=T) 116 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 117 | } 118 | } 119 | 120 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3])) 121 | -------------------------------------------------------------------------------- /INDELible/indelible_controlgen_INDEL001.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | library('phangorn') 7 | library('MCMCpack') 8 | library('dplyr') 9 | library('scales') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | #Invariant sites Unif 20 | I=runif(1,0,1) 21 | A=runif(1,0,5) 22 | #Nucl proportions DIRICHLET 23 | options(digits=5) 24 | Pi=format(rdirichlet(1, alpha=c(5,5,5,5))) 25 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 26 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 27 | options(digits=2) 28 | if (model %in% c('HKY','K80')){ 29 | model=paste(c(model,' ',format(runif(1,0,3))),sep = '') 30 | } else if (model == 'TrN'){ 31 | model=paste(c(model,' ',format(runif(2,0,3))),sep = '') 32 | } else if (model %in% c('TIM' ,'TIMef')){ 33 | model=paste(c(model,' ',format(runif(3,0,3))),sep = '') 34 | } else if (model == 'TVM'){ 35 | model=paste(c(model,' ',format(runif(4,0,3))),sep = '') 36 | } else if (model %in% c('SYM','GTR')){ 37 | model=paste(c(model,' ',format(runif(5,0,3))),sep = '') 38 | } else if (model == 'UNREST'){ 39 | model=paste(c(model,' ',format(runif(11,0,3))),sep = '') 40 | } else { 41 | model=model 42 | } 43 | model_id=model_id+1 44 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',I,' ',A,' 0','\n [indelmodel] POW 1.5 50\n [indelrate] 0.01'),file,append=T) 45 | if (model_orig %in% c('F81','HKY','TrN','TIM','TVM','GTR')) 46 | { 47 | write(paste(' [statefreq]',paste(Pi,collapse=' ')),file,append=T) 48 | } 49 | } 50 | return(models_selected) 51 | } 52 | #TREE generating function 53 | tree_gen=function(tr,n_sim) 54 | { 55 | nbranch=length(tr$edge[,1]) 56 | tr$edge.lengths=rep(0,nbranch) 57 | tr_newick=unlist(strsplit(write.tree(tr),"")) 58 | v_select=combn(1:5,2) 59 | boot=c() 60 | 61 | space_t=matrix(sample(rbeta(100000000,c(0.1,0.5,1),c(0.1,0.5,1))),ncol=5) 62 | #AS assymetry score NB neigbour sum + L tree length 63 | AS=apply(space_t[,v_select[1,]]-space_t[,v_select[2,]],1,sum) 64 | LB=2*apply(space_t,1,sum)+apply(space_t[,2:4],1,sum) 65 | x=seq(-6,6,0.1) 66 | y=seq(0,13,0.1) 67 | m=matrix(1:(length(x)*length(y)),nrow=length(x),ncol=length(y)) 68 | xint=findInterval(AS,x) 69 | yint=findInterval(LB,y) 70 | all_t=data.frame(space_t,AS=AS,LB=LB,XI=xint,YI=yint,Fact=m[cbind(xint,yint)]) 71 | #Uniform sampling from tree space 72 | for (i in 1:10) 73 | { 74 | print(i) 75 | sampletree=data.frame(all_t %>% group_by(Fact) %>% sample_n(size = 1,replace=F)) 76 | boot=rbind(boot,sampletree) 77 | } 78 | boot=boot[sample(nrow(boot),n_sim),] 79 | #plot(boot$AS,boot$LB,col=alpha("black",0.05),pch=16) 80 | pos=which(tr_newick==0) 81 | trees=data.frame(t(tr_newick)) 82 | trees=trees[rep(1,n_sim),] 83 | trees[,pos]=boot[,1:5] 84 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 85 | return(tree_list) 86 | } 87 | 88 | indelib_gen=function(n_taxa,n_sim,aln_length) # n_sim = number of simulations per topology 89 | { 90 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 91 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 92 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 93 | iter=0 94 | for (tr in all_topo) 95 | { 96 | iter=iter+1 97 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(iter,'control.txt',sep="")) 98 | n_datasets=n_sim 99 | #Set MODEL block 100 | modelset=sample(c('JC','TIM','TIMef','GTR','UNREST'),n_datasets,replace=T) 101 | MODEL=model_gen(modelset,paste(iter,'control.txt',sep="")) 102 | #Set TREE block 103 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 104 | print(iter) 105 | print("Newick") 106 | NEWICK=tree_gen(all_topo[[iter]],n_sim) 107 | print("Done newick") 108 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 109 | #Set PARTITIONS block 110 | PNAME=paste("p",1:n_datasets,sep="") 111 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 112 | #Set EVOLVE block 113 | write('[EVOLVE]',paste(iter,'control.txt',sep=""),append=T) 114 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 115 | } 116 | } 117 | 118 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3])) 119 | -------------------------------------------------------------------------------- /INDELible/.ipynb_checkpoints/indelible_controlgen_INDEL001-checkpoint.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | library('phangorn') 7 | library('MCMCpack') 8 | library('dplyr') 9 | library('scales') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | #Invariant sites Unif 20 | I=runif(1,0,1) 21 | A=runif(1,0,5) 22 | #Nucl proportions DIRICHLET 23 | options(digits=5) 24 | Pi=format(rdirichlet(1, alpha=c(5,5,5,5))) 25 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 26 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 27 | options(digits=2) 28 | if (model %in% c('HKY','K80')){ 29 | model=paste(c(model,' ',format(runif(1,0,3))),sep = '') 30 | } else if (model == 'TrN'){ 31 | model=paste(c(model,' ',format(runif(2,0,3))),sep = '') 32 | } else if (model %in% c('TIM' ,'TIMef')){ 33 | model=paste(c(model,' ',format(runif(3,0,3))),sep = '') 34 | } else if (model == 'TVM'){ 35 | model=paste(c(model,' ',format(runif(4,0,3))),sep = '') 36 | } else if (model %in% c('SYM','GTR')){ 37 | model=paste(c(model,' ',format(runif(5,0,3))),sep = '') 38 | } else if (model == 'UNREST'){ 39 | model=paste(c(model,' ',format(runif(11,0,3))),sep = '') 40 | } else { 41 | model=model 42 | } 43 | model_id=model_id+1 44 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',I,' ',A,' 0','\n [indelmodel] POW 1.5 50\n [indelrate] 0.01'),file,append=T) 45 | if (model_orig %in% c('F81','HKY','TrN','TIM','TVM','GTR')) 46 | { 47 | write(paste(' [statefreq]',paste(Pi,collapse=' ')),file,append=T) 48 | } 49 | } 50 | return(models_selected) 51 | } 52 | #TREE generating function 53 | tree_gen=function(tr,n_sim) 54 | { 55 | nbranch=length(tr$edge[,1]) 56 | tr$edge.lengths=rep(0,nbranch) 57 | tr_newick=unlist(strsplit(write.tree(tr),"")) 58 | v_select=combn(1:5,2) 59 | boot=c() 60 | 61 | space_t=matrix(sample(rbeta(100000000,c(0.1,0.5,1),c(0.1,0.5,1))),ncol=5) 62 | #AS assymetry score NB neigbour sum + L tree length 63 | AS=apply(space_t[,v_select[1,]]-space_t[,v_select[2,]],1,sum) 64 | LB=2*apply(space_t,1,sum)+apply(space_t[,2:4],1,sum) 65 | x=seq(-6,6,0.1) 66 | y=seq(0,13,0.1) 67 | m=matrix(1:(length(x)*length(y)),nrow=length(x),ncol=length(y)) 68 | xint=findInterval(AS,x) 69 | yint=findInterval(LB,y) 70 | all_t=data.frame(space_t,AS=AS,LB=LB,XI=xint,YI=yint,Fact=m[cbind(xint,yint)]) 71 | #Uniform sampling from tree space 72 | for (i in 1:10) 73 | { 74 | print(i) 75 | sampletree=data.frame(all_t %>% group_by(Fact) %>% sample_n(size = 1,replace=F)) 76 | boot=rbind(boot,sampletree) 77 | } 78 | boot=boot[sample(nrow(boot),n_sim),] 79 | #plot(boot$AS,boot$LB,col=alpha("black",0.05),pch=16) 80 | pos=which(tr_newick==0) 81 | trees=data.frame(t(tr_newick)) 82 | trees=trees[rep(1,n_sim),] 83 | trees[,pos]=boot[,1:5] 84 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 85 | return(tree_list) 86 | } 87 | 88 | indelib_gen=function(n_taxa,n_sim,aln_length) # n_sim = number of simulations per topology 89 | { 90 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 91 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 92 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 93 | iter=0 94 | for (tr in all_topo) 95 | { 96 | iter=iter+1 97 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(iter,'control.txt',sep="")) 98 | n_datasets=n_sim 99 | #Set MODEL block 100 | modelset=sample(c('JC','TIM','TIMef','GTR','UNREST'),n_datasets,replace=T) 101 | MODEL=model_gen(modelset,paste(iter,'control.txt',sep="")) 102 | #Set TREE block 103 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 104 | print(iter) 105 | print("Newick") 106 | NEWICK=tree_gen(all_topo[[iter]],n_sim) 107 | print("Done newick") 108 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 109 | #Set PARTITIONS block 110 | PNAME=paste("p",1:n_datasets,sep="") 111 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 112 | #Set EVOLVE block 113 | write('[EVOLVE]',paste(iter,'control.txt',sep=""),append=T) 114 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(iter,'control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 115 | } 116 | } 117 | 118 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3])) 119 | -------------------------------------------------------------------------------- /KERAS/keras_CNN_TOPO_trainmore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import tensorflow as tf 3 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) 4 | from itertools import product 5 | import sys, argparse, os 6 | import time 7 | import numpy as np 8 | from math import log 9 | from scipy.stats import multinomial, chi2 10 | from math import factorial 11 | from keras.utils import plot_model, to_categorical 12 | from keras.models import Model, Sequential 13 | from keras.layers import Input, Dense, Flatten, Dropout, BatchNormalization 14 | from keras.layers.convolutional import Conv2D 15 | from keras.layers.pooling import MaxPooling2D 16 | from keras.layers.merge import concatenate 17 | from keras.callbacks import EarlyStopping, ModelCheckpoint 18 | from keras.models import model_from_json 19 | from keras.models import load_model 20 | #N unrooted trees given N taxa 21 | def n_unroot(Ntaxa): 22 | N=factorial(2*Ntaxa-5)/(factorial(Ntaxa-3)*2**(Ntaxa-3)) 23 | return(int(N)) 24 | 25 | #Generate Numpy array of labels for Keras 26 | def label_gen(classes,n_labels): 27 | vec=[[0]*classes]*int(n_labels) 28 | vec=np.array(vec) 29 | section=int(int(n_labels)/int(classes)) 30 | for cl in range(0,classes): 31 | vec[int(cl*section):int(section+section*cl),cl]=1 32 | return(vec) 33 | 34 | #Read FASTA convert to numeric 35 | def fasta_pars(aln_file,seq_number,Lmax): 36 | aln=open(aln_file) 37 | #Lmax=max([len(r) for r in aln]) 38 | aln=open(aln_file) 39 | dic={'A':'0','T':'1','C':'2','G':'3','-':'4'} 40 | matrix_out=[] 41 | fasta_dic={} 42 | for line in aln: 43 | if line[0]==">": 44 | header=line[1:].rstrip('\n').strip() 45 | fasta_dic[header]=[] 46 | elif line[0].isalpha() or line[0]=='-': 47 | for base, num in dic.items(): 48 | line=line[:].rstrip('\n').strip().replace(base,num) 49 | line=list(line) 50 | line=[int(n) for n in line] 51 | #Mkae all matrices of equal length for CNN +[-15]*(Lmax-len(line)) 52 | fasta_dic[header]+=line+[-15]*(Lmax-len(line)) 53 | if len(fasta_dic)==seq_number: 54 | taxa_block=[] 55 | for taxa in sorted(list(fasta_dic.keys())): 56 | taxa_block.append(fasta_dic[taxa.strip()]) 57 | fasta_dic={} 58 | matrix_out.append(taxa_block) 59 | return(np.array(matrix_out)) 60 | 61 | #Read training, validation and test datasets to equalize sizes 62 | def tv_parse(train,valid,test,seq_number,Lmax): 63 | tr=fasta_pars(train,seq_number,Lmax) 64 | va=fasta_pars(valid,seq_number,Lmax) 65 | te=fasta_pars(test,seq_number,Lmax) 66 | print("Training N: "+str(len(tr))) 67 | print("Validation N: "+str(len(va))) 68 | print("Testing N: "+str(len(te))) 69 | return tr, va, te 70 | 71 | 72 | #Classification: tree topology 73 | def build_standartCNN(X_train, Y_train, X_valid, Y_valid, Ntaxa,batch_sizes,model,w): 74 | 75 | model_cnn=load_model(model) 76 | model_cnn.load_weights(w) 77 | #Model stopping criteria 78 | callback1=EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto') 79 | callback2=ModelCheckpoint('best_weights_clas', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1) 80 | #Run 81 | model_cnn.fit(x=X_train,y=Y_train,validation_data=(X_valid,Y_valid),batch_size=batch_sizes,callbacks=[callback1,callback2],epochs=200,verbose=1,shuffle=True) 82 | return(model_cnn) 83 | 84 | 85 | 86 | def main(): 87 | parser = argparse.ArgumentParser(description='Keras run') 88 | parser.add_argument( '-w', help = "Weights file",dest='WEIGHTS') 89 | parser.add_argument( '-k', help = "Keras model",dest='JASON') 90 | parser.add_argument( '-t', help = "Training dataset in .fasta",dest='TRAIN') 91 | parser.add_argument( '-v', help = "Validation dataset in .fasta",dest='VALID') 92 | parser.add_argument( '--test', help = "Test dataset in .fasta",dest='TEST') 93 | parser.add_argument( '-N', help = "N taxa", type=int, dest='Ntaxa') 94 | args = parser.parse_args() 95 | 96 | #Read inputs 97 | 98 | loaded_model = load_model(args.JASON) 99 | loaded_model.summary() 100 | Lmax=loaded_model.layers[0].get_output_at(0).get_shape().as_list()[2] 101 | 102 | print("Reading input") 103 | train_data1, valid_data1, test_data1 = tv_parse(args.TRAIN,args.VALID,args.TEST,args.Ntaxa,Lmax) 104 | #Reshape for Keras 105 | train_data1=train_data1.reshape(train_data1.shape[0],train_data1.shape[1],train_data1.shape[2],1) 106 | valid_data1=valid_data1.reshape(valid_data1.shape[0],valid_data1.shape[1],valid_data1.shape[2],1) 107 | test_data1=test_data1.reshape(test_data1.shape[0],test_data1.shape[1],test_data1.shape[2],1) 108 | print("Done") 109 | 110 | #Generate labels 111 | Nlabels=n_unroot(args.Ntaxa) 112 | train_label=to_categorical(np.repeat(range(0,Nlabels),len(train_data1)/Nlabels), num_classes=None) 113 | valid_label=to_categorical(np.repeat(range(0,Nlabels),len(valid_data1)/Nlabels), num_classes=None) 114 | test_label=to_categorical(np.repeat(range(0,Nlabels),len(test_data1)/Nlabels), num_classes=None) 115 | 116 | #load weights into new model 117 | model_cnn=build_standartCNN(X_train=train_data1, Y_train=train_label, X_valid=valid_data1, Y_valid=valid_label,Ntaxa=args.Ntaxa,batch_sizes=100,model=args.JASON,w=args.WEIGHTS) 118 | model_cnn.load_weights('best_weights_clas') 119 | model_cnn.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy']) 120 | 121 | evals = model_cnn.evaluate(test_data1,test_label,batch_size=100, verbose=1, steps=None) 122 | classes = model_cnn.predict(test_data1, batch_size=100, verbose=1, steps=None) 123 | np.savetxt("test.evals_class.txt",evals,fmt='%f') 124 | np.savetxt("test.classprobs_class.txt",classes,fmt='%f') 125 | class_lab = classes.argmax(axis=-1) 126 | np.savetxt("test.classeslab_class.txt",class_lab,fmt='%f') 127 | 128 | if __name__ == "__main__": 129 | main() -------------------------------------------------------------------------------- /KERAS/keras_CNN_TOPO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import tensorflow as tf 3 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) 4 | import pandas 5 | import time 6 | from itertools import product 7 | import sys, argparse, os 8 | import numpy as np 9 | from math import log, ceil 10 | from scipy.stats import multinomial, chi2 11 | from math import factorial 12 | from keras.utils import plot_model, to_categorical 13 | from keras.models import Model, Sequential 14 | from keras.layers import Input, Dense, Flatten, Dropout, BatchNormalization, ZeroPadding2D 15 | from keras.layers.convolutional import Conv2D 16 | from keras.layers.pooling import MaxPooling2D, AveragePooling2D 17 | from keras.layers.merge import concatenate 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.optimizers import Adam 20 | 21 | #N unrooted trees given N taxa 22 | def n_unroot(Ntaxa): 23 | N=factorial(2*Ntaxa-5)/(factorial(Ntaxa-3)*2**(Ntaxa-3)) 24 | return(int(N)) 25 | 26 | #Read FASTA convert to numeric 27 | def fasta_pars(aln_file,seq_number,Lmax): 28 | aln=open(aln_file) 29 | dic={'A':'0','T':'1','C':'2','G':'3','-':'4'} 30 | matrix_out=[] 31 | fasta_dic={} 32 | for line in aln: 33 | if line[0]==">": 34 | header=line[1:].rstrip('\n').strip() 35 | fasta_dic[header]=[] 36 | elif line[0].isalpha() or line[0]=='-': 37 | for base, num in dic.items(): 38 | line=line[:].rstrip('\n').strip().replace(base,num) 39 | line=list(line) 40 | line=[int(n) for n in line] 41 | #Mkae all matrices of equal length for CNN +[-15]*(Lmax-len(line)) 42 | fasta_dic[header]+=line+[-15]*(Lmax-len(line)) 43 | if len(fasta_dic)==seq_number: 44 | taxa_block=[] 45 | for taxa in sorted(list(fasta_dic.keys())): 46 | taxa_block.append(fasta_dic[taxa.strip()]) 47 | fasta_dic={} 48 | matrix_out.append(taxa_block) 49 | return(np.array(matrix_out)) 50 | 51 | #Read training, validation and test datasets to equalize sizes 52 | def tv_parse(train,valid,test,seq_number): 53 | tr=open(train) 54 | va=open(valid) 55 | te=open(test) 56 | LT=max([len(r.strip()) for r in tr]) 57 | print("Training largest alignment: "+str(LT)) 58 | LV=max([len(r.strip()) for r in va]) 59 | print("Validation largest alignment: "+str(LV)) 60 | LTE=max([len(r.strip()) for r in te]) 61 | print("Testing largest alignment: "+str(LTE)) 62 | Lmax=max([LT]+[LV]+[LTE]) 63 | tr=fasta_pars(train,seq_number,Lmax) 64 | va=fasta_pars(valid,seq_number,Lmax) 65 | te=fasta_pars(test,seq_number,Lmax) 66 | return tr, va, te 67 | 68 | 69 | #Classification: tree topology 70 | def build_standartCNN(X_train, Y_train, X_valid, Y_valid, Ntaxa,conv_pool_n,filter_n,droput_rates,batch_sizes): 71 | Aln_length=X_train.shape[2] 72 | Nlabels=n_unroot(Ntaxa) 73 | 74 | #Hyperparameters 75 | #Hight (horizontal) 76 | conv_x=[4,1,1,1,1,1,1,1] 77 | #Width (vertical) 78 | conv_y=[1,2,2,2,2,2,2,2] 79 | pool=[1,4,4,4,2,2,2,1] 80 | filter_s=[1024,1024,128,128,128,128,128,128] 81 | 82 | print(conv_pool_n,filter_n,droput_rates,batch_sizes) 83 | 84 | #Arhitecture 85 | visible = Input(shape=(Ntaxa,Aln_length,1)) 86 | x = visible 87 | for l in list(range(0,conv_pool_n)): 88 | x = ZeroPadding2D(padding=((0, 0), (0,conv_y[l]-1)))(x) 89 | x = Conv2D(filters=filter_s[l], kernel_size=(conv_x[l], conv_y[l]), strides=1, activation='relu')(x) 90 | x = BatchNormalization()(x) 91 | x = Dropout(rate=0.2)(x) 92 | x = AveragePooling2D(pool_size=(1,pool[l]))(x) 93 | flat = Flatten()(x) 94 | hidden1 = Dense(1024,activation='relu')(flat) 95 | drop1=Dropout(rate=0.2)(hidden1) 96 | output = Dense(Nlabels, activation='softmax')(drop1) 97 | model_cnn = Model(inputs=visible, outputs=output) 98 | model_cnn.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy']) 99 | 100 | #Print model 101 | print(model_cnn.summary()) 102 | 103 | #Model stopping criteria 104 | callback1=EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto') 105 | callback2=ModelCheckpoint('best_weights_clas', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1) 106 | #Run 107 | 108 | model_cnn.fit(x=X_train,y=Y_train,validation_data=(X_valid,Y_valid),batch_size=batch_sizes,callbacks=[callback1,callback2],epochs=200,verbose=1,shuffle=True) 109 | return(model_cnn) 110 | 111 | 112 | def main(): 113 | parser = argparse.ArgumentParser(description='Keras run') 114 | parser.add_argument( '-t', help = "Training dataset in .npy",dest='TRAIN') 115 | parser.add_argument( '-v', help = "Validation dataset in .npy",dest='VALID') 116 | parser.add_argument( '--test', help = "Test dataset in .npy",dest='TEST') 117 | parser.add_argument( '-N', help = "N taxa", type=int, dest='Ntaxa') 118 | args = parser.parse_args() 119 | 120 | #Read inputs 121 | print("Reading input") 122 | 123 | train_data1=np.load(args.TRAIN) 124 | valid_data1=np.load(args.VALID) 125 | test_data1=np.load(args.TEST) 126 | print("Done") 127 | 128 | #Generate labels 129 | Nlabels=n_unroot(args.Ntaxa) 130 | train_label=to_categorical(np.repeat(range(0,Nlabels),len(train_data1)/Nlabels), num_classes=None) 131 | valid_label=to_categorical(np.repeat(range(0,Nlabels),len(valid_data1)/Nlabels), num_classes=None) 132 | test_label=to_categorical(np.repeat(range(0,Nlabels),len(test_data1)/Nlabels), num_classes=None) 133 | 134 | 135 | #Model Run 136 | #Classification TOPO 137 | starttime=time.time() 138 | model_cnn=build_standartCNN(X_train=train_data1, Y_train=train_label, X_valid=valid_data1, Y_valid=valid_label,Ntaxa=args.Ntaxa,conv_pool_n=8,filter_n=500,droput_rates=0.20,batch_sizes=100) 139 | #Load best model 140 | model_cnn.load_weights('best_weights_clas') 141 | model_cnn.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy']) 142 | model_cnn.save("keras_model.h5") 143 | endtime=time.time() 144 | print(endtime-starttime,"sec for training") 145 | print('Evaluate with best class weights') 146 | 147 | 148 | starttime=time.time() 149 | evals = model_cnn.evaluate(test_data1,test_label,batch_size=100, verbose=1, steps=None) 150 | classes = model_cnn.predict(test_data1, batch_size=100, verbose=1, steps=None) 151 | endtime=time.time() 152 | print(endtime-starttime,"sec for testing") 153 | np.savetxt("test.evals_class.txt",evals,fmt='%f') 154 | np.savetxt("test.classprobs_class.txt",classes,fmt='%f') 155 | class_lab = classes.argmax(axis=-1) 156 | np.savetxt("test.classeslab_class.txt",class_lab,fmt='%f') 157 | 158 | if __name__ == "__main__": 159 | main() 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/174181866.svg)](https://zenodo.org/badge/latestdoi/174181866) 2 | # Tree topology inference from multiple sequence alignments using Deep Learning 3 | 4 | This repository contains R (>=3.5.0) and Python (>=3.6) scripts that were used in the project **"Accurate inference of tree topologies from multiple sequence alignments using deep learning"** 5 | 6 | **Citation**: 7 | Anton Suvorov, Joshua Hochuli, Daniel R. Schrider (2019). Accurate inference of tree topologies from multiple sequence alignments using deep learning. Systematic Biology, [DOI](https://academic.oup.com/sysbio/advance-article-abstract/doi/10.1093/sysbio/syz060/5559282) 8 | 9 | **R scripts** can be found in [INDELible directory](https://github.com/SchriderLab/Tree_learning/tree/master/INDELible). They will generate various control files for [INDElible](http://abacus.gene.ucl.ac.uk/software/indelible/) program that simulates MSAs under given tree topology, branch lengths and different substitution as well as indel model parameters. 10 | 11 | Required CRAN R packages: 12 | [phangorn](https://cran.r-project.org/web/packages/phangorn/index.html) 13 | [MCMCpack](https://cran.r-project.org/web/packages/MCMCpack/index.html) 14 | [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) 15 | [scales](https://cran.r-project.org/web/packages/scales/index.html) 16 | 17 | 1) [indelible_controlgen_INDEL001.R](https://github.com/SchriderLab/Tree_learning/blob/master/INDELible/indelible_controlgen_INDEL001.R) and [indelible_controlgen_NOINDEL.R](https://github.com/SchriderLab/Tree_learning/blob/master/INDELible/indelible_controlgen_NOINDEL.R) 18 | These scripts generate control files for MSA simulation with (INDEL001) and without (NOINDEL) indels/gaps. The control files will be stored in three directories (topo1, topo2 and topo3) that correspond to three topologies. These scripts are used to generate MSAs for generating TRAINING, VALIDATION and TEST data sets. 19 | Example: ```Rscript indelible_controlgen_NOINDEL.R 4 1000 500```(generates 1000 MSAs of length 500 per topology) 20 | 21 | 2) [indelible_controlgen_REGIONS_INDEL001.R](https://github.com/SchriderLab/Tree_learning/blob/master/INDELible/indelible_controlgen_REGIONS_INDEL001.R) and [indelible_controlgen_REGIONS_NOINDEL0.R](https://github.com/SchriderLab/Tree_learning/blob/master/INDELible/indelible_controlgen_REGIONS_NOINDEL0.R) 22 | These scripts generate control files for MSA simulation with (INDEL001) and without (NOINDEL) indels/gaps. The scripts will generate EXP, FA, FAE, FAT, FE, FEE, LONG, LONGOUT, LONGULTRA, SHORT, SHORTINT, SHORTOUT and SHORTULTRA directories each with topo1, topo2 and topo3 subdirectories. These correspond to heterogeneous branch length regions, namely Truncated exponential (EXP), Farris zone (FA), Extended Farris zone (FAE), ["Twisted" Farris zone](https://www.sciencedirect.com/science/article/pii/S1055790315002316?via%3Dihub) (FAT), Felsenstein zone (FE), Extended Felsenstein zone (FEE), Long branches (LONG), Single long branch (LONGOUT), Extra-long branches (LONGULTRA), Short branches (SHORT), Short internal branch (SHORTINT), Single short branch (SHORTOUT) and Extra-short branches (SHORTULTRA). These MSAs were used to test performance of different tree inference methods. 23 | Example: ```Rscript indelible_controlgen_REGIONS_INDEL001.R 4 1000 500```(generates 1000 MSAs of length 500 per topology for each region) 24 | 25 | 3) [indelible_controlgen_INDEL001_WARNOW.R](https://github.com/SchriderLab/Tree_learning/blob/master/INDELible/indelible_controlgen_INDEL001_WARNOW.R) 26 | This script generates control files for MSA simulation with no substitutions, only indels (i.e. p_inv=1). This is the scenario under which maximum likelihood (ML) tree inference has been shown to be statistically inconsistent ([Warnow, 2012](http://currents.plos.org/treeoflife/index.html%3Fp=1609.html)). These MSAs were used to test performance of different tree inference methods. 27 | Example: ```Rscript indelible_controlgen_INDEL001_WARNOW.R 4 1000 500``` (generates 1000 MSAs of length 500 per topology) 28 | 4) [indelible_controlgen_INDEL001_ANTI_WARNOW.R](https://github.com/SchriderLab/Tree_learning/blob/master/INDELible/indelible_controlgen_INDEL001_ANTI_WARNOW.R) 29 | This script generates control files for MSA simulation with indels and allowing all MSA sites to vary (i.e. p_inv=0). These MSAs were used to test performance of different tree inference methods. 30 | Example: ```Rscript indelible_controlgen_INDEL001_ANTI_WARNOW.R 4 1000 500``` (generates 1000 MSAs of length 500 per topology) 31 | 32 | **Python scripts** can be found in [KERAS directory](https://github.com/SchriderLab/Tree_learning/tree/master/KERAS). They are used for building, training, validating and testing Convolutional Neuronal Networks (CNNs). These scripts are optimized to run on GPUs. 33 | 34 | Required Python dependencies: 35 | [Tensorflow](https://www.tensorflow.org/install) 36 | [Keras API](https://keras.io/) 37 | [SciPy](https://www.scipy.org/) 38 | [pandas](https://pandas.pydata.org/) 39 | 40 | 1) [keras_CNN_TOPO.py](https://github.com/SchriderLab/Tree_learning/blob/master/KERAS/keras_CNN_TOPO.py) 41 | This script builds, trains, validates and tests CNN. As an input it takes TRAINING, VALIDATION and TESTING MSAs generated by INDELible and saved in .npy array using [fasta2numeric.py](https://github.com/SchriderLab/Tree_learning/tree/master/Utils) script. As an input this utility script takes TRAINING, VALIDATION and TESTING datasets produced by concatinating MSAs. E.g. ```cat topo1/* topo2/* topo3/* > TRAINING``` The keras model (keras.h5) and optimal CNN weights (best_weights_clas) will be outputted by the script after testing is completed. 42 | Example: ```keras_CNN_TOPO.py -t TRAIN.npy -v VALID.npy --test TEST.npy -N 4``` (tested only on 4-taxon MSA cases i.e. ```-N 4```) 43 | 44 | ``` 45 | Options: 46 | -h, --help 47 | -t Training dataset in .npy 48 | -v Validation dataset in .npy 49 | --test Test dataset in .npy 50 | -N N taxa 51 | ``` 52 | 53 | 2) [keras_CNN_apply.py](https://github.com/SchriderLab/Tree_learning/blob/master/KERAS/keras_CNN_apply.py) 54 | This script infers a tree from an MSA. It requires keras model and weights files produced by [keras_CNN_TOPO.py](https://github.com/SchriderLab/Tree_learning/blob/master/KERAS/keras_CNN_TOPO.py), a data set in FASTA format. 55 | Example: ```keras_CNN_apply.py -t TEST.fasta -w best_weights_clas -k keras_model.h5 -N 4``` 56 | ``` 57 | Options: 58 | -h, --help 59 | -t Evaluation dataset in FASTA 60 | -w Weights file 61 | -k Keras model 62 | -N N taxa 63 | ``` 64 | 3) [keras_CNN_BOOT.py](https://github.com/SchriderLab/Tree_learning/blob/master/KERAS/keras_CNN_BOOT.py) 65 | This script performs MSA nonparametric bootstrapping. It requires keras model and weights files produced by [keras_CNN_TOPO.py](https://github.com/SchriderLab/Tree_learning/blob/master/KERAS/keras_CNN_TOPO.py), a data set in FASTA format and labeles for data set. 66 | Example: ```keras_CNN_BOOT.py --test TEST.fasta --lab labels.txt -w best_weights_clas -k keras_model.h5 -b 100 -N 4``` 67 | ``` 68 | Options: 69 | -h, --help 70 | --test Test dataset in FASTA 71 | --lab Labels of TEST dataset 72 | -w Weights file 73 | -k Keras model 74 | -b N bootstrap replicates 75 | -N N taxa 76 | ``` 77 | Python scripts that were used to reconstruct error surface are avalible [here](https://github.com/SchriderLab/error_surface). 78 | -------------------------------------------------------------------------------- /INDELible/indelible_controlgen_REGIONS_NOINDEL0.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | args = commandArgs(trailingOnly=TRUE) 5 | library('phangorn') 6 | library('MCMCpack') 7 | library('dplyr') 8 | library('scales') 9 | library('RGeode') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | #Invariant sites Unif 20 | I=runif(1,0,1) 21 | A=runif(1,0,5) 22 | #Nucl proportions DIRICHLET 23 | options(digits=5) 24 | Pi=format(rdirichlet(1, alpha=c(5,5,5,5))) 25 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 26 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 27 | options(digits=2) 28 | if (model %in% c('HKY','K80')){ 29 | model=paste(c(model,' ',format(runif(1,0,3))),sep = '') 30 | } else if (model == 'TrN'){ 31 | model=paste(c(model,' ',format(runif(2,0,3))),sep = '') 32 | } else if (model %in% c('TIM' ,'TIMef')){ 33 | model=paste(c(model,' ',format(runif(3,0,3))),sep = '') 34 | } else if (model == 'TVM'){ 35 | model=paste(c(model,' ',format(runif(4,0,3))),sep = '') 36 | } else if (model %in% c('SYM','GTR')){ 37 | model=paste(c(model,' ',format(runif(5,0,3))),sep = '') 38 | } else if (model == 'UNREST'){ 39 | model=paste(c(model,' ',format(runif(11,0,3))),sep = '') 40 | } else { 41 | model=model 42 | } 43 | model_id=model_id+1 44 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',I,' ',A,' 0'),file,append=T) 45 | if (model_orig %in% c('F81','HKY','TrN','TIM','TVM','GTR')) 46 | { 47 | write(paste(' [statefreq]',paste(Pi,collapse=' ')),file,append=T) 48 | } 49 | } 50 | return(models_selected) 51 | } 52 | #TREE generating function 53 | tree_genEXP=function(tr,n_sim) 54 | { 55 | nbranch=length(tr$edge[,1]) 56 | tr$edge.lengths=rep(0,nbranch) 57 | tr_newick=unlist(strsplit(write.tree(tr),"")) 58 | boot=matrix(rexptr(nbranch*n_sim,10,c(0,0.5)),ncol=nbranch) 59 | pos=which(tr_newick==0) 60 | trees=data.frame(t(tr_newick)) 61 | trees=trees[rep(1,n_sim),] 62 | trees[,pos]=boot[,1:5] 63 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 64 | return(tree_list) 65 | } 66 | 67 | tree_genFA=function(tr,n_sim) 68 | { 69 | nbranch=length(tr$edge[,1]) 70 | tr$edge.lengths=rep(0,nbranch) 71 | tr_newick=unlist(strsplit(write.tree(tr),"")) 72 | boot=matrix(runif(nbranch*n_sim,c(0.1,0.1,0,0,0,0,0,0,0.1,0.1),c(0.5,0.5,0.05,0.05,0.05,0.05,0.05,0.05,0.5,0.5)),ncol=5,byrow=T) 73 | pos=which(tr_newick==0) 74 | trees=data.frame(t(tr_newick)) 75 | trees=trees[rep(1,n_sim),] 76 | trees[,pos]=boot[,1:5] 77 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 78 | return(tree_list) 79 | } 80 | 81 | tree_genFAT=function(tr,n_sim) 82 | { 83 | nbranch=length(tr$edge[,1]) 84 | tr$edge.lengths=rep(0,nbranch) 85 | tr_newick=unlist(strsplit(write.tree(tr),"")) 86 | B1=runif(n_sim,0.1,0.5) 87 | B3=runif(n_sim,0,0.05) 88 | B2=B1+B3 89 | B4=2*B3 90 | B5=B3 91 | boot=cbind(B1,B2,B3,B4,B5) 92 | pos=which(tr_newick==0) 93 | trees=data.frame(t(tr_newick)) 94 | trees=trees[rep(1,n_sim),] 95 | trees[,pos]=boot[,1:5] 96 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 97 | return(tree_list) 98 | } 99 | 100 | tree_genFAE=function(tr,n_sim) 101 | { 102 | nbranch=length(tr$edge[,1]) 103 | tr$edge.lengths=rep(0,nbranch) 104 | tr_newick=unlist(strsplit(write.tree(tr),"")) 105 | boot=matrix(runif(nbranch*n_sim,c(0.1,0.1,0,0,0,0,0,0,0.1,0.1),c(0.5,0.5,0.5,0.05,0.05,0.05,0.05,0.5,0.5,0.5)),ncol=5,byrow=T) 106 | pos=which(tr_newick==0) 107 | trees=data.frame(t(tr_newick)) 108 | trees=trees[rep(1,n_sim),] 109 | trees[,pos]=boot[,1:5] 110 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 111 | return(tree_list) 112 | } 113 | 114 | tree_genFE=function(tr,n_sim) 115 | { 116 | nbranch=length(tr$edge[,1]) 117 | tr$edge.lengths=rep(0,nbranch) 118 | tr_newick=unlist(strsplit(write.tree(tr),"")) 119 | boot=matrix(runif(nbranch*n_sim,c(0.1,0,0,0.1,0,0.1,0,0,0,0.1,0,0.1,0,0.1,0,0,0.1,0,0,0.1),c(0.5,0.05,0.05,0.5,0.05,0.5,0.05,0.05,0.05,0.5,0.05,0.5,0.05,0.5,0.05,0.05,0.5,0.05,0.05,0.5)),ncol=5,byrow=T) 120 | pos=which(tr_newick==0) 121 | trees=data.frame(t(tr_newick)) 122 | trees=trees[rep(1,n_sim),] 123 | trees[,pos]=boot[,1:5] 124 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 125 | return(tree_list) 126 | } 127 | 128 | 129 | tree_genFEE=function(tr,n_sim) 130 | { 131 | nbranch=length(tr$edge[,1]) 132 | tr$edge.lengths=rep(0,nbranch) 133 | tr_newick=unlist(strsplit(write.tree(tr),"")) 134 | boot=matrix(runif(nbranch*n_sim,c(0.1,0,0,0.1,0,0.1,0,0,0,0.1,0,0.1,0,0.1,0,0,0.1,0,0,0.1),c(0.5,0.05,0.5,0.5,0.05,0.5,0.05,0.5,0.05,0.5,0.05,0.5,0.5,0.5,0.05,0.05,0.5,0.5,0.05,0.5)),ncol=5,byrow=T) 135 | pos=which(tr_newick==0) 136 | trees=data.frame(t(tr_newick)) 137 | trees=trees[rep(1,n_sim),] 138 | trees[,pos]=boot[,1:5] 139 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 140 | return(tree_list) 141 | } 142 | 143 | 144 | tree_genLONG=function(tr,n_sim) 145 | { 146 | nbranch=length(tr$edge[,1]) 147 | tr$edge.lengths=rep(0,nbranch) 148 | tr_newick=unlist(strsplit(write.tree(tr),"")) 149 | boot=matrix(runif(nbranch*n_sim,c(0.1,0.1,0,0.1,0.1),c(0.5,0.5,0.5,0.5,0.5)),ncol=5,byrow=T) 150 | pos=which(tr_newick==0) 151 | trees=data.frame(t(tr_newick)) 152 | trees=trees[rep(1,n_sim),] 153 | trees[,pos]=boot[,1:5] 154 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 155 | return(tree_list) 156 | } 157 | 158 | tree_genLONGOUT=function(tr,n_sim) 159 | { 160 | nbranch=length(tr$edge[,1]) 161 | tr$edge.lengths=rep(0,nbranch) 162 | tr_newick=unlist(strsplit(write.tree(tr),"")) 163 | boot=matrix(runif(nbranch*n_sim,c(0.1,0,0,0,0,0,0.1,0,0,0,0,0,0,0.1,0,0,0,0,0,0.1),c(0.5,0.05,0.5,0.05,0.05,0.05,0.5,0.5,0.05,0.05,0.05,0.05,0.5,0.5,0.05,0.05,0.05,0.5,0.05,0.5)),ncol=5,byrow=T) 164 | pos=which(tr_newick==0) 165 | trees=data.frame(t(tr_newick)) 166 | trees=trees[rep(1,n_sim),] 167 | trees[,pos]=boot[,1:5] 168 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 169 | return(tree_list) 170 | } 171 | 172 | tree_genLONGULTRA=function(tr,n_sim) 173 | { 174 | nbranch=length(tr$edge[,1]) 175 | tr$edge.lengths=rep(0,nbranch) 176 | tr_newick=unlist(strsplit(write.tree(tr),"")) 177 | boot=matrix(runif(nbranch*n_sim,c(0.5,0.5,0,0.5,0.5),c(1,1,1,1,1)),ncol=5,byrow=T) 178 | pos=which(tr_newick==0) 179 | trees=data.frame(t(tr_newick)) 180 | trees=trees[rep(1,n_sim),] 181 | trees[,pos]=boot[,1:5] 182 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 183 | return(tree_list) 184 | } 185 | 186 | 187 | tree_genSHORT=function(tr,n_sim) 188 | { 189 | nbranch=length(tr$edge[,1]) 190 | tr$edge.lengths=rep(0,nbranch) 191 | tr_newick=unlist(strsplit(write.tree(tr),"")) 192 | boot=matrix(runif(nbranch*n_sim,c(0,0,0,0,0),c(0.05,0.05,0.5,0.05,0.05)),ncol=5,byrow=T) 193 | pos=which(tr_newick==0) 194 | trees=data.frame(t(tr_newick)) 195 | trees=trees[rep(1,n_sim),] 196 | trees[,pos]=boot[,1:5] 197 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 198 | return(tree_list) 199 | } 200 | 201 | 202 | 203 | tree_genSHORTOUT=function(tr,n_sim) 204 | { 205 | nbranch=length(tr$edge[,1]) 206 | tr$edge.lengths=rep(0,nbranch) 207 | tr_newick=unlist(strsplit(write.tree(tr),"")) 208 | boot=matrix(runif(nbranch*n_sim,c(0,0.1,0,0.1,0.1,0.1,0,0,0.1,0.1,0.1,0.1,0,0,0.1,0.1,0.1,0,0.1,0),c(0.05,0.5,0.5,0.5,0.5,0.5,0.05,0.5,0.5,0.5,0.5,0.5,0.5,0.05,0.5,0.5,0.5,0.5,0.5,0.05)),ncol=5,byrow=T) 209 | pos=which(tr_newick==0) 210 | trees=data.frame(t(tr_newick)) 211 | trees=trees[rep(1,n_sim),] 212 | trees[,pos]=boot[,1:5] 213 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 214 | return(tree_list) 215 | } 216 | 217 | tree_genSHORTINT=function(tr,n_sim) 218 | { 219 | nbranch=length(tr$edge[,1]) 220 | tr$edge.lengths=rep(0,nbranch) 221 | tr_newick=unlist(strsplit(write.tree(tr),"")) 222 | boot=matrix(runif(nbranch*n_sim,c(0,0,0,0,0),c(0.5,0.5,0.05,0.5,0.5)),ncol=5,byrow=T) 223 | pos=which(tr_newick==0) 224 | trees=data.frame(t(tr_newick)) 225 | trees=trees[rep(1,n_sim),] 226 | trees[,pos]=boot[,1:5] 227 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 228 | return(tree_list) 229 | } 230 | 231 | tree_genSHORTULTRA=function(tr,n_sim) 232 | { 233 | nbranch=length(tr$edge[,1]) 234 | tr$edge.lengths=rep(0,nbranch) 235 | tr_newick=unlist(strsplit(write.tree(tr),"")) 236 | boot=matrix(runif(nbranch*n_sim,c(0,0,0,0,0),c(0.01,0.01,0.01,0.01,0.01)),ncol=5,byrow=T) 237 | pos=which(tr_newick==0) 238 | trees=data.frame(t(tr_newick)) 239 | trees=trees[rep(1,n_sim),] 240 | trees[,pos]=boot[,1:5] 241 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 242 | return(tree_list) 243 | } 244 | 245 | 246 | 247 | 248 | 249 | indelib_gen=function(n_taxa,n_sim,aln_length,parameter,region) # n_sim = number of simulations per topology 250 | { 251 | dir.create(region, showWarnings = FALSE) 252 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 253 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 254 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 255 | iter=0 256 | for (tr in all_topo) 257 | { 258 | iter=iter+1 259 | dir.create(paste(region,"/topo",iter,sep=""), showWarnings = FALSE) 260 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(region,"/topo",iter,'/control.txt',sep="")) 261 | n_datasets=n_sim 262 | #Set MODEL block 263 | modelset=sample(c('JC','TIM','TIMef','GTR','UNREST'),n_datasets,replace=T) 264 | MODEL=model_gen(modelset,paste(region,"/topo",iter,'/control.txt',sep="")) 265 | #Set TREE block 266 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 267 | print(iter) 268 | print("Newick") 269 | if (region=="EXP") 270 | { 271 | NEWICK=tree_genEXP(all_topo[[iter]],n_sim) 272 | } else if (region == "FA"){ 273 | NEWICK=tree_genFA(all_topo[[iter]],n_sim) 274 | } else if (region == "FAT"){ 275 | NEWICK=tree_genFAT(all_topo[[iter]],n_sim) 276 | } else if (region == "FAE"){ 277 | NEWICK=tree_genFAE(all_topo[[iter]],n_sim) 278 | } else if (region == "FE"){ 279 | NEWICK=tree_genFE(all_topo[[iter]],n_sim) 280 | } else if (region == "FEE"){ 281 | NEWICK=tree_genFEE(all_topo[[iter]],n_sim) 282 | } else if (region == "LONG"){ 283 | NEWICK=tree_genLONG(all_topo[[iter]],n_sim) 284 | } else if (region == "LONGOUT"){ 285 | NEWICK=tree_genLONGOUT(all_topo[[iter]],n_sim) 286 | } else if (region == "LONGULTRA"){ 287 | NEWICK=tree_genLONGULTRA(all_topo[[iter]],n_sim) 288 | } else if (region == "SHORT"){ 289 | NEWICK=tree_genSHORT(all_topo[[iter]],n_sim) 290 | } else if (region == "SHORTOUT"){ 291 | NEWICK=tree_genSHORTOUT(all_topo[[iter]],n_sim) 292 | } else if (region == "SHORTINT"){ 293 | NEWICK=tree_genSHORTINT(all_topo[[iter]],n_sim) 294 | } else { 295 | NEWICK=tree_genSHORTULTRA(all_topo[[iter]],n_sim) 296 | } 297 | print("Done newick") 298 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 299 | #Set PARTITIONS block 300 | PNAME=paste("p",1:n_datasets,sep="") 301 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 302 | #Set EVOLVE block 303 | write('[EVOLVE]',paste(region,"/topo",iter,'/control.txt',sep=""),append=T) 304 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 305 | } 306 | } 307 | for (r in c("EXP","FA","FAT","FAE","FE","FEE","LONG","LONGOUT","LONGULTRA","SHORT","SHORTOUT","SHORTINT","SHORTULTRA")) 308 | { 309 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3]),as.numeric(args[4]),r) 310 | } 311 | -------------------------------------------------------------------------------- /INDELible/indelible_controlgen_REGIONS_INDEL001.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ### ARGUMENTS: N taxa N_sims Aln_length 4 | args = commandArgs(trailingOnly=TRUE) 5 | library('phangorn') 6 | library('MCMCpack') 7 | library('dplyr') 8 | library('scales') 9 | library('RGeode') 10 | options(scipen=999) 11 | #Model block generating function 12 | model_gen=function(modelset,file) 13 | { 14 | model_id=1 15 | models_selected=c() 16 | for (model in modelset) 17 | { 18 | model_orig=model 19 | #Invariant sites Unif 20 | I=runif(1,0,1) 21 | A=runif(1,0,5) 22 | #Nucl proportions DIRICHLET 23 | options(digits=5) 24 | Pi=format(rdirichlet(1, alpha=c(5,5,5,5))) 25 | models_selected=c(models_selected,paste(model,'Model',model_id,sep = '')) 26 | write(paste('\n[MODEL] ',model,'Model',model_id,sep = ''),file,append=T) 27 | options(digits=2) 28 | if (model %in% c('HKY','K80')){ 29 | model=paste(c(model,' ',format(runif(1,0,3))),sep = '') 30 | } else if (model == 'TrN'){ 31 | model=paste(c(model,' ',format(runif(2,0,3))),sep = '') 32 | } else if (model %in% c('TIM' ,'TIMef')){ 33 | model=paste(c(model,' ',format(runif(3,0,3))),sep = '') 34 | } else if (model == 'TVM'){ 35 | model=paste(c(model,' ',format(runif(4,0,3))),sep = '') 36 | } else if (model %in% c('SYM','GTR')){ 37 | model=paste(c(model,' ',format(runif(5,0,3))),sep = '') 38 | } else if (model == 'UNREST'){ 39 | model=paste(c(model,' ',format(runif(11,0,3))),sep = '') 40 | } else { 41 | model=model 42 | } 43 | model_id=model_id+1 44 | write(paste(' [submodel] ',paste(model,collapse=' '),'\n [rates] ',I,' ',A,' 0','\n [indelmodel] POW 1.5 50\n [indelrate] 0.01'),file,append=T) 45 | if (model_orig %in% c('F81','HKY','TrN','TIM','TVM','GTR')) 46 | { 47 | write(paste(' [statefreq]',paste(Pi,collapse=' ')),file,append=T) 48 | } 49 | } 50 | return(models_selected) 51 | } 52 | #TREE generating function 53 | tree_genEXP=function(tr,n_sim) 54 | { 55 | nbranch=length(tr$edge[,1]) 56 | tr$edge.lengths=rep(0,nbranch) 57 | tr_newick=unlist(strsplit(write.tree(tr),"")) 58 | boot=matrix(rexptr(nbranch*n_sim,10,c(0,0.5)),ncol=nbranch) 59 | pos=which(tr_newick==0) 60 | trees=data.frame(t(tr_newick)) 61 | trees=trees[rep(1,n_sim),] 62 | trees[,pos]=boot[,1:5] 63 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 64 | return(tree_list) 65 | } 66 | 67 | tree_genFA=function(tr,n_sim) 68 | { 69 | nbranch=length(tr$edge[,1]) 70 | tr$edge.lengths=rep(0,nbranch) 71 | tr_newick=unlist(strsplit(write.tree(tr),"")) 72 | boot=matrix(runif(nbranch*n_sim,c(0.1,0.1,0,0,0,0,0,0,0.1,0.1),c(0.5,0.5,0.05,0.05,0.05,0.05,0.05,0.05,0.5,0.5)),ncol=5,byrow=T) 73 | pos=which(tr_newick==0) 74 | trees=data.frame(t(tr_newick)) 75 | trees=trees[rep(1,n_sim),] 76 | trees[,pos]=boot[,1:5] 77 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 78 | return(tree_list) 79 | } 80 | 81 | tree_genFAT=function(tr,n_sim) 82 | { 83 | nbranch=length(tr$edge[,1]) 84 | tr$edge.lengths=rep(0,nbranch) 85 | tr_newick=unlist(strsplit(write.tree(tr),"")) 86 | B1=runif(n_sim,0.1,0.5) 87 | B3=runif(n_sim,0,0.05) 88 | B2=B1+B3 89 | B4=2*B3 90 | B5=B3 91 | boot=cbind(B1,B2,B3,B4,B5) 92 | pos=which(tr_newick==0) 93 | trees=data.frame(t(tr_newick)) 94 | trees=trees[rep(1,n_sim),] 95 | trees[,pos]=boot[,1:5] 96 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 97 | return(tree_list) 98 | } 99 | 100 | tree_genFAE=function(tr,n_sim) 101 | { 102 | nbranch=length(tr$edge[,1]) 103 | tr$edge.lengths=rep(0,nbranch) 104 | tr_newick=unlist(strsplit(write.tree(tr),"")) 105 | boot=matrix(runif(nbranch*n_sim,c(0.1,0.1,0,0,0,0,0,0,0.1,0.1),c(0.5,0.5,0.5,0.05,0.05,0.05,0.05,0.5,0.5,0.5)),ncol=5,byrow=T) 106 | pos=which(tr_newick==0) 107 | trees=data.frame(t(tr_newick)) 108 | trees=trees[rep(1,n_sim),] 109 | trees[,pos]=boot[,1:5] 110 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 111 | return(tree_list) 112 | } 113 | 114 | tree_genFE=function(tr,n_sim) 115 | { 116 | nbranch=length(tr$edge[,1]) 117 | tr$edge.lengths=rep(0,nbranch) 118 | tr_newick=unlist(strsplit(write.tree(tr),"")) 119 | boot=matrix(runif(nbranch*n_sim,c(0.1,0,0,0.1,0,0.1,0,0,0,0.1,0,0.1,0,0.1,0,0,0.1,0,0,0.1),c(0.5,0.05,0.05,0.5,0.05,0.5,0.05,0.05,0.05,0.5,0.05,0.5,0.05,0.5,0.05,0.05,0.5,0.05,0.05,0.5)),ncol=5,byrow=T) 120 | pos=which(tr_newick==0) 121 | trees=data.frame(t(tr_newick)) 122 | trees=trees[rep(1,n_sim),] 123 | trees[,pos]=boot[,1:5] 124 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 125 | return(tree_list) 126 | } 127 | 128 | 129 | tree_genFEE=function(tr,n_sim) 130 | { 131 | nbranch=length(tr$edge[,1]) 132 | tr$edge.lengths=rep(0,nbranch) 133 | tr_newick=unlist(strsplit(write.tree(tr),"")) 134 | boot=matrix(runif(nbranch*n_sim,c(0.1,0,0,0.1,0,0.1,0,0,0,0.1,0,0.1,0,0.1,0,0,0.1,0,0,0.1),c(0.5,0.05,0.5,0.5,0.05,0.5,0.05,0.5,0.05,0.5,0.05,0.5,0.5,0.5,0.05,0.05,0.5,0.5,0.05,0.5)),ncol=5,byrow=T) 135 | pos=which(tr_newick==0) 136 | trees=data.frame(t(tr_newick)) 137 | trees=trees[rep(1,n_sim),] 138 | trees[,pos]=boot[,1:5] 139 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 140 | return(tree_list) 141 | } 142 | 143 | 144 | tree_genLONG=function(tr,n_sim) 145 | { 146 | nbranch=length(tr$edge[,1]) 147 | tr$edge.lengths=rep(0,nbranch) 148 | tr_newick=unlist(strsplit(write.tree(tr),"")) 149 | boot=matrix(runif(nbranch*n_sim,c(0.1,0.1,0,0.1,0.1),c(0.5,0.5,0.5,0.5,0.5)),ncol=5,byrow=T) 150 | pos=which(tr_newick==0) 151 | trees=data.frame(t(tr_newick)) 152 | trees=trees[rep(1,n_sim),] 153 | trees[,pos]=boot[,1:5] 154 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 155 | return(tree_list) 156 | } 157 | 158 | tree_genLONGOUT=function(tr,n_sim) 159 | { 160 | nbranch=length(tr$edge[,1]) 161 | tr$edge.lengths=rep(0,nbranch) 162 | tr_newick=unlist(strsplit(write.tree(tr),"")) 163 | boot=matrix(runif(nbranch*n_sim,c(0.1,0,0,0,0,0,0.1,0,0,0,0,0,0,0.1,0,0,0,0,0,0.1),c(0.5,0.05,0.5,0.05,0.05,0.05,0.5,0.5,0.05,0.05,0.05,0.05,0.5,0.5,0.05,0.05,0.05,0.5,0.05,0.5)),ncol=5,byrow=T) 164 | pos=which(tr_newick==0) 165 | trees=data.frame(t(tr_newick)) 166 | trees=trees[rep(1,n_sim),] 167 | trees[,pos]=boot[,1:5] 168 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 169 | return(tree_list) 170 | } 171 | 172 | tree_genLONGULTRA=function(tr,n_sim) 173 | { 174 | nbranch=length(tr$edge[,1]) 175 | tr$edge.lengths=rep(0,nbranch) 176 | tr_newick=unlist(strsplit(write.tree(tr),"")) 177 | boot=matrix(runif(nbranch*n_sim,c(0.5,0.5,0,0.5,0.5),c(1,1,1,1,1)),ncol=5,byrow=T) 178 | pos=which(tr_newick==0) 179 | trees=data.frame(t(tr_newick)) 180 | trees=trees[rep(1,n_sim),] 181 | trees[,pos]=boot[,1:5] 182 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 183 | return(tree_list) 184 | } 185 | 186 | 187 | tree_genSHORT=function(tr,n_sim) 188 | { 189 | nbranch=length(tr$edge[,1]) 190 | tr$edge.lengths=rep(0,nbranch) 191 | tr_newick=unlist(strsplit(write.tree(tr),"")) 192 | boot=matrix(runif(nbranch*n_sim,c(0,0,0,0,0),c(0.05,0.05,0.5,0.05,0.05)),ncol=5,byrow=T) 193 | pos=which(tr_newick==0) 194 | trees=data.frame(t(tr_newick)) 195 | trees=trees[rep(1,n_sim),] 196 | trees[,pos]=boot[,1:5] 197 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 198 | return(tree_list) 199 | } 200 | 201 | 202 | 203 | tree_genSHORTOUT=function(tr,n_sim) 204 | { 205 | nbranch=length(tr$edge[,1]) 206 | tr$edge.lengths=rep(0,nbranch) 207 | tr_newick=unlist(strsplit(write.tree(tr),"")) 208 | boot=matrix(runif(nbranch*n_sim,c(0,0.1,0,0.1,0.1,0.1,0,0,0.1,0.1,0.1,0.1,0,0,0.1,0.1,0.1,0,0.1,0),c(0.05,0.5,0.5,0.5,0.5,0.5,0.05,0.5,0.5,0.5,0.5,0.5,0.5,0.05,0.5,0.5,0.5,0.5,0.5,0.05)),ncol=5,byrow=T) 209 | pos=which(tr_newick==0) 210 | trees=data.frame(t(tr_newick)) 211 | trees=trees[rep(1,n_sim),] 212 | trees[,pos]=boot[,1:5] 213 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 214 | return(tree_list) 215 | } 216 | 217 | tree_genSHORTINT=function(tr,n_sim) 218 | { 219 | nbranch=length(tr$edge[,1]) 220 | tr$edge.lengths=rep(0,nbranch) 221 | tr_newick=unlist(strsplit(write.tree(tr),"")) 222 | boot=matrix(runif(nbranch*n_sim,c(0,0,0,0,0),c(0.5,0.5,0.05,0.5,0.5)),ncol=5,byrow=T) 223 | pos=which(tr_newick==0) 224 | trees=data.frame(t(tr_newick)) 225 | trees=trees[rep(1,n_sim),] 226 | trees[,pos]=boot[,1:5] 227 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 228 | return(tree_list) 229 | } 230 | 231 | tree_genSHORTULTRA=function(tr,n_sim) 232 | { 233 | nbranch=length(tr$edge[,1]) 234 | tr$edge.lengths=rep(0,nbranch) 235 | tr_newick=unlist(strsplit(write.tree(tr),"")) 236 | boot=matrix(runif(nbranch*n_sim,c(0,0,0,0,0),c(0.01,0.01,0.01,0.01,0.01)),ncol=5,byrow=T) 237 | pos=which(tr_newick==0) 238 | trees=data.frame(t(tr_newick)) 239 | trees=trees[rep(1,n_sim),] 240 | trees[,pos]=boot[,1:5] 241 | tree_list=as.vector(apply(trees,1,paste,collapse="")) 242 | return(tree_list) 243 | } 244 | 245 | 246 | 247 | 248 | 249 | indelib_gen=function(n_taxa,n_sim,aln_length,parameter,region) # n_sim = number of simulations per topology 250 | { 251 | dir.create(region, showWarnings = FALSE) 252 | print(paste("I am simulating",n_sim,"alignements per topology of Length =",aln_length,"N taxa =",n_taxa)) 253 | taxa=c('A','B','C','D','E','F','G','H','I','J','K') 254 | all_topo=allTrees(n_taxa, rooted = FALSE, tip.label = taxa[1:n_taxa]) 255 | iter=0 256 | for (tr in all_topo) 257 | { 258 | iter=iter+1 259 | dir.create(paste(region,"/topo",iter,sep=""), showWarnings = FALSE) 260 | write(paste('[TYPE] NUCLEOTIDE 2\n[SETTINGS]\n [output] FASTA\n [randomseed] ',round(runif(1,1,100000))),paste(region,"/topo",iter,'/control.txt',sep="")) 261 | n_datasets=n_sim 262 | #Set MODEL block 263 | modelset=sample(c('JC','TIM','TIMef','GTR','UNREST'),n_datasets,replace=T) 264 | MODEL=model_gen(modelset,paste(region,"/topo",iter,'/control.txt',sep="")) 265 | #Set TREE block 266 | ID_TREE=paste("t",rep(iter,n_sim),rep("_sim",times=n_datasets),1:n_datasets,sep="") 267 | print(iter) 268 | print("Newick") 269 | if (region=="EXP") 270 | { 271 | NEWICK=tree_genEXP(all_topo[[iter]],n_sim) 272 | } else if (region == "FA"){ 273 | NEWICK=tree_genFA(all_topo[[iter]],n_sim) 274 | } else if (region == "FAT"){ 275 | NEWICK=tree_genFAT(all_topo[[iter]],n_sim) 276 | } else if (region == "FAE"){ 277 | NEWICK=tree_genFAE(all_topo[[iter]],n_sim) 278 | } else if (region == "FE"){ 279 | NEWICK=tree_genFE(all_topo[[iter]],n_sim) 280 | } else if (region == "FEE"){ 281 | NEWICK=tree_genFEE(all_topo[[iter]],n_sim) 282 | } else if (region == "LONG"){ 283 | NEWICK=tree_genLONG(all_topo[[iter]],n_sim) 284 | } else if (region == "LONGOUT"){ 285 | NEWICK=tree_genLONGOUT(all_topo[[iter]],n_sim) 286 | } else if (region == "LONGULTRA"){ 287 | NEWICK=tree_genLONGULTRA(all_topo[[iter]],n_sim) 288 | } else if (region == "SHORT"){ 289 | NEWICK=tree_genSHORT(all_topo[[iter]],n_sim) 290 | } else if (region == "SHORTOUT"){ 291 | NEWICK=tree_genSHORTOUT(all_topo[[iter]],n_sim) 292 | } else if (region == "SHORTINT"){ 293 | NEWICK=tree_genSHORTINT(all_topo[[iter]],n_sim) 294 | } else { 295 | NEWICK=tree_genSHORTULTRA(all_topo[[iter]],n_sim) 296 | } 297 | print("Done newick") 298 | write.table(data.frame('[TREE]',ID_TREE,NEWICK),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 299 | #Set PARTITIONS block 300 | PNAME=paste("p",1:n_datasets,sep="") 301 | write.table(data.frame('[PARTITIONS]',PNAME,"[",ID_TREE,MODEL,aln_length,"]"),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 302 | #Set EVOLVE block 303 | write('[EVOLVE]',paste(region,"/topo",iter,'/control.txt',sep=""),append=T) 304 | write.table(data.frame(PNAME,1,apply(data.frame(ID_TREE,"_",MODEL),1,paste,collapse="")),paste(region,"/topo",iter,'/control.txt',sep=""),append=T,quote=F,row.names=F,col.names =F) 305 | } 306 | } 307 | for (r in c("EXP","FA","FAT","FAE","FE","FEE","LONG","LONGOUT","LONGULTRA","SHORT","SHORTOUT","SHORTINT","SHORTULTRA")) 308 | { 309 | indelib_gen(as.numeric(args[1]),as.numeric(args[2]),as.numeric(args[3]),as.numeric(args[4]),r) 310 | } 311 | 312 | 313 | -------------------------------------------------------------------------------- /INDELible/figure_prep.R: -------------------------------------------------------------------------------- 1 | library("MASS") 2 | library("ape") 3 | library("phangorn") 4 | library("ggplot2") 5 | library("gtools") 6 | library('pals') 7 | library('vioplot') 8 | library('dplyr') 9 | library('scales') 10 | library('geometry') 11 | library('gplots') 12 | library("corrplot") 13 | library("ggtern") 14 | library("ggpubr") 15 | 16 | 17 | color.bar <- function(lut, min, max=-min, nticks=3, ticks=seq(min, max, len=nticks), title='') { 18 | scale = (length(lut)-1)/(max-min) 19 | 20 | #quartz() 21 | lines(c(0,10), c(min,max), type='n', bty='n', xaxt='n', xlab='', yaxt='n', ylab='', main=title) 22 | axis(2, ticks, las=1,cex.axis=0.6) 23 | for (i in 1:(length(lut)-1)) { 24 | y = (i-1)/scale + min 25 | rect(0,y,10,y+1/scale, col=lut[i], border=NA) 26 | } 27 | } 28 | color.bar(inferno(1000),0,1) 29 | quartz.save("heatlegend.pdf", type = "pdf",antialias=F,bg="white",dpi=800,pointsize=12) 30 | dev.off() 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | plot_topo=function(brl,namezone,xlabz,ylabz) 46 | { 47 | plot(1,1,xlim=c(0.4,1.6),ylim=c(0.6,1.4),col="white",xaxt='n',yaxt='n',main=namezone,xlab="",ylab="") 48 | brl=ifelse(brl==0,0.15,0.45) 49 | ext=brl 50 | b1=ext[1];b2=ext[2];b=ext[3];b3=ext[4];b4=ext[5];as=1;lns=1 51 | l=b/2 52 | ax=as-l-sqrt(b1^2/2) 53 | bx=as-l 54 | cx=as+l 55 | dx=as+l+sqrt(b3^2/2) 56 | ex=as+l+sqrt(b4^2/2) 57 | fx=as-l-sqrt(b2^2/2) 58 | ay=sqrt(b1^2/2)+lns 59 | by=lns 60 | cy=lns 61 | dy=sqrt(b3^2/2)+lns 62 | ey=lns-sqrt(b4^2/2) 63 | fy=lns-sqrt(b2^2/2) 64 | lines(c(ax,bx,cx,dx,cx,ex,cx,bx,fx),c(ay,by,cy,dy,cy,ey,cy,by,fy),type="l",lwd=1.5) 65 | lines(ax,ay,pch=21,cex=2,type='p',bg="white") 66 | lines(fx,fy,pch=21,cex=2,type='p',bg="white") 67 | lines(dx,dy,pch=21,cex=2,type='p',bg="white") 68 | lines(ex,ey,pch=21,cex=2,type='p',bg="white") 69 | lines(1,1,pch=21,cex=2,type='p',bg="white") 70 | text(ax,ay,1,cex=1) 71 | text(fx,fy,2,cex=1) 72 | text(dx,dy,3,cex=1) 73 | text(ex,ey,4,cex=1) 74 | text(1,1,5,cex=1) 75 | mtext(xlabz,side=1,line=0.7,cex=1) 76 | mtext(ylabz,side=2,line=0.5,cex=1) 77 | } 78 | 79 | boot_viol=function(my_table) 80 | { 81 | bootreg=c() 82 | for (b in 1:1000) 83 | { 84 | tabboot=my_table[,c("MP","NJ","ML","BI",'CNN')] 85 | boot=apply(apply(tabboot,2,sample,replace=T),2,mean) 86 | bootreg=rbind(bootreg,boot) 87 | } 88 | return(bootreg) 89 | } 90 | 91 | boot_viol_big=function(my_table) 92 | { 93 | bootreg=c() 94 | for (b in 1:1000) 95 | { 96 | tabboot=my_table[,"X5"] 97 | boot=mean(sample(tabboot,replace=T)) 98 | bootreg=rbind(bootreg,boot) 99 | } 100 | return(bootreg) 101 | } 102 | 103 | 104 | 105 | plot_dens=function(zonetab,zone,y,xlim,n,title) 106 | { 107 | ztab=zonetab[zonetab$zone==zone,] 108 | ztab$y_axis=y 109 | for (col in c("MP","NJ","ML","BI",'CNN')) 110 | { 111 | bind1=ztab[ztab[,col]!=1,c("inter","y_axis")] 112 | bind2=as.matrix(ztab[,c("inter","y_axis")]) 113 | k1=kde2d(bind1[,1],bind1[,2], n=200,lims = c(c(0,xlim),c(min(bind2[,2]),max(bind2[,2])))) 114 | k2=kde2d(bind2[,1],bind2[,2], n=200,lims = c(c(0,xlim),c(min(bind2[,2]),max(bind2[,2])))) 115 | k1$z=k1$z/k2$z 116 | if (title == 1) 117 | { 118 | image(k1, col=inferno(2000),yaxs="i",main="",cex.axis=0.8) 119 | title(col,adj=0,line=0.5) 120 | }else{ 121 | image(k1, col=inferno(2000),yaxs="i",main="",cex.axis=0.8) 122 | } 123 | lines(bind1,type="p",pch=21,cex=0.3,bg="gray",lwd=0.3) 124 | 125 | } 126 | } 127 | 128 | plot_viol=function(z_list,zone,ylim1,ylim2,laby,topviol) 129 | { 130 | bootreg=z_list[[zone]][["z_boot"]] 131 | vioplot(bootreg[,1],bootreg[,2],bootreg[,3],bootreg[,4],bootreg[,5],col="grey",pchMed=16,names=NA,ylim=c(ylim1,ylim2)) 132 | mtext(laby,2,padj=-3,cex=0.7) 133 | mtext(c("MP","NJ","ML","BI","CNN"),1,at=seq(1,5),padj=1,cex=0.6) 134 | text(1:5,apply(bootreg,2,max)+topviol,round(z_list[[zone]][["z_acc"]],digits=3),cex=0.7,xpd = TRUE,col="red") 135 | } 136 | 137 | 138 | 139 | ###Get bootstrap values 140 | getBoot= function(x){boot=c(); for (phy in x) {boot=c(boot,phy$node.label[2])};boot=as.numeric(gsub(".*/","",boot));return(boot)} 141 | 142 | table_prep=function(path_tab) 143 | { 144 | tt=read.table(path_tab) 145 | names(tt)=c("ID","true_T","model","I","G","pars","nj","ml","bi","CNN_class") 146 | tt$zone=rep(c( "EXP", "FA", "FAE" ,"FAT", "FE", "FEE","LONG","LONGOUT","LONGULTRA","SHORT","SHORTINT","SHORTOUT","SHORTULTRA"),each=3000) 147 | 148 | #Read newicks 149 | master=read.tree(text=as.character(tt$true_T)) 150 | pars=read.tree(text=as.character(tt$pars)) 151 | nj=read.tree(text=as.character(tt$nj)) 152 | ml=read.tree(text=as.character(tt$ml)) 153 | ba=read.tree(text=as.character(tt$bi)) 154 | 155 | #Calculate RF 156 | RF=c() 157 | for (i in 1:length(master)) 158 | { 159 | treerf=RF.dist(c(pars[[i]],nj[[i]],ml[[i]],ba[[i]]),master[[i]]) 160 | RF=rbind(RF,treerf) 161 | print(i) 162 | } 163 | 164 | tt=cbind(tt,data.frame(RF)+1) 165 | tt$X5=as.numeric(tt$CNN_class==rep(rep(c(0,1,2),times=c(1000,1000,1000)),13)) 166 | tt[,c("X1","X2","X3","X4",'X5')]=ifelse(tt[,c("X1","X2","X3","X4",'X5')]!=1,0,1) 167 | 168 | ###Extract branch lengths 169 | mm=unlist(strsplit(gsub("[:|,|[A-Z]|\\(|\\)|;| "," ",tt$true_T),split=" ")) 170 | br_l=data.frame(matrix(as.numeric(mm[mm!=""]),ncol=5,byrow=T),stringsAsFactors=F) 171 | names(br_l)=c("A","B","inter","C","D") 172 | tt=cbind(tt,br_l) 173 | names(tt)[12:16]=c("MP","NJ","ML","BI","CNN") 174 | tt$AB=tt$A+tt$B 175 | tt$CD=tt$C+tt$D 176 | tt$AC=tt$A+tt$C 177 | tt$AD=tt$A+tt$D 178 | tt$BC=tt$B+tt$C 179 | tt$BC=tt$B+tt$C 180 | tt$BD=tt$B+tt$D 181 | tt$ABCD=tt$AB+tt$CD 182 | return(tt) 183 | } 184 | 185 | table_prep_master=function(path_tab) 186 | { 187 | tt=read.table(path_tab) 188 | 189 | names(tt)=c("ID","true_T","model","I","G","pars","nj","ml","bi","CNN_class","CPP1","CPP2","CPP3","CNN_bs") 190 | #Read newicks 191 | master=read.tree(text=as.character(tt$true_T)) 192 | pars=read.tree(text=as.character(tt$pars)) 193 | nj=read.tree(text=as.character(tt$nj)) 194 | ml=read.tree(text=as.character(tt$ml)) 195 | ba=read.tree(text=as.character(tt$bi)) 196 | 197 | #Get bootstrap 198 | tt$pars_bs=getBoot(pars)/100 199 | tt$nj_bs=getBoot(nj)/100 200 | tt$ml_bs=getBoot(ml)/100 201 | tt$bi_pp=getBoot(ba) 202 | 203 | #Calculate RF 204 | RF=c() 205 | for (i in 1:length(master)) 206 | { 207 | treerf=RF.dist(c(pars[[i]],nj[[i]],ml[[i]],ba[[i]]),master[[i]]) 208 | RF=rbind(RF,treerf) 209 | print(i) 210 | } 211 | 212 | tt=cbind(tt,data.frame(RF)+1) 213 | tt$X5=as.numeric(tt$CNN_class==rep(c(0,1,2),times=c(5000,5000,5000))) 214 | tt[,c("X1","X2","X3","X4",'X5')]=ifelse(tt[,c("X1","X2","X3","X4",'X5')]!=1,0,1) 215 | names(tt)[19:23]=c("MP","NJ","ML","BI","CNN") 216 | return(tt) 217 | } 218 | 219 | 220 | ###Accuracy 221 | acc_get=function(tt) 222 | { 223 | z_list=list() 224 | for(z in unique(tt$zone)) 225 | { 226 | print(z) 227 | zacc=apply(tt[tt$zone==z,c("MP","NJ","ML","BI","CNN")],2,mean) 228 | zboot=boot_viol(tt[tt$zone==z,c("MP","NJ","ML","BI","CNN")]) 229 | z_list[[z]]=list(z_acc=zacc,z_boot=zboot) 230 | } 231 | return(z_list) 232 | } 233 | 234 | 235 | 236 | "/Users/anton/Downloads/gapregions_1000.table" 237 | ###MAIN FIGS 238 | 239 | ###Zones GAPS 240 | quartz(width=7.7, height=11) 241 | par(mfcol=c(7,4),mar=c(2,3,2,1)) 242 | #FA 243 | plot_topo(c(1,1,0,0,0),"a) Farris zone",expression('B'[5]),expression('B'[1+2])) 244 | plot_viol(z_list,"FA",0.53,1.02,"Accuracy",0.023) 245 | y=as.numeric(apply(tt[tt$zone=="FA",c("AB","CD","AC","AD","BC","BD")],1,max)) 246 | plot_dens(tt,"FA",y,0.05,200,1) 247 | #FAT 248 | plot_topo(c(1,1,0,0,0),"b) Twisted Farris zone",expression('B'[5]),expression('B'[1+2])) 249 | plot_viol(z_list,"FAT",0.52,1.02,"",0.023) 250 | y=as.numeric(apply(tt[tt$zone=="FAT",c("AB","CD","AC","AD","BC","BD")],1,max)) 251 | plot_dens(tt,"FAT",y,0.05,200,0) 252 | #FE 253 | plot_topo(c(1,0,0,1,0),"c) Felsenstein zone",expression('B'[5]),expression('B'[1+3])) 254 | plot_viol(z_list,"FE",0.1,0.8,"",0.03) 255 | y=as.numeric(apply(tt[tt$zone=="FE",c("AB","CD","AC","AD","BC","BD")],1,max)) 256 | plot_dens(tt,"FE",y,0.05,200,0) 257 | #SHORTINT 258 | plot_topo(c(1,1,0,1,1),"d) Short internal branch",expression('B'[5]),expression('B'[1+2+3+4])) 259 | plot_viol(z_list,"SHORTINT",0.45,0.71,"",0.013) 260 | y=as.numeric(tt[tt$zone=="SHORTINT","ABCD"]) 261 | plot_dens(tt,"SHORTINT",y,0.05,200,0) 262 | quartz.save("Bias_gap.jpeg", type = "jpeg",antialias=F,bg="white",dpi=400,pointsize=12) 263 | dev.off() 264 | 265 | 266 | 267 | ###Zones NOGAPS 268 | quartz(width=7.7, height=11) 269 | par(mfcol=c(7,4),mar=c(2,3,2,1)) 270 | #FA 271 | plot_topo(c(1,1,0,0,0),"a) Farris zone",expression('B'[5]),expression('B'[1+2])) 272 | plot_viol(z_list,"FA",0.53,1.02,"Accuracy",0.023) 273 | y=as.numeric(apply(tt[tt$zone=="FA",c("AB","CD","AC","AD","BC","BD")],1,max)) 274 | plot_dens(tt,"FA",y,0.05,200,1) 275 | #FAT 276 | plot_topo(c(1,1,0,0,0),"b) Twisted Farris zone",expression('B'[5]),expression('B'[1+2])) 277 | plot_viol(z_list,"FAT",0.52,1.02,"",0.023) 278 | y=as.numeric(apply(tt[tt$zone=="FAT",c("AB","CD","AC","AD","BC","BD")],1,max)) 279 | plot_dens(tt,"FAT",y,0.05,200,0) 280 | #FE 281 | plot_topo(c(1,0,0,1,0),"c) Felsenstein zone",expression('B'[5]),expression('B'[1+3])) 282 | plot_viol(z_list,"FE",0.1,0.8,"",0.03) 283 | y=as.numeric(apply(tt[tt$zone=="FE",c("AB","CD","AC","AD","BC","BD")],1,max)) 284 | plot_dens(tt,"FE",y,0.05,200,0) 285 | #SHORTINT 286 | plot_topo(c(1,1,0,1,1),"d) Short internal branch",expression('B'[5]),expression('B'[1+2+3+4])) 287 | plot_viol(z_list,"SHORTINT",0.45,0.71,"",0.013) 288 | y=as.numeric(tt[tt$zone=="SHORTINT","ABCD"]) 289 | plot_dens(tt,"SHORTINT",y,0.05,200,0) 290 | quartz.save("Bias_nogap.jpeg", type = "jpeg",antialias=F,bg="white",dpi=400,pointsize=12) 291 | dev.off() 292 | 293 | 294 | 295 | ##SUPPL FIGS GAP / NOGAP 296 | quartz(width=10.6, height=10.1) 297 | par(mfrow=c(8,7),mar=c(2,2,2,2)) 298 | #FAE 299 | plot_topo(c(1,1,1,0,0),"a) Extended \nFarris zone",expression('B'[5]),expression('B'[1+2])) 300 | plot_viol(z_list,"FAE",0.8,1.02,"Accuracy",0.023) 301 | y=as.numeric(apply(tt[tt$zone=="FAE",c("AB","CD","AC","AD","BC","BD")],1,max)) 302 | plot_dens(tt,"FAE",y,0.5,200,1) 303 | 304 | #FEE 305 | plot_topo(c(1,0,1,1,0),"b) Extended \nFelsenstein zone",expression('B'[5]),expression('B'[1+3])) 306 | plot_viol(z_list,"FEE",0.7,1.02,"Accuracy",0.023) 307 | y=as.numeric(apply(tt[tt$zone=="FEE",c("AB","CD","AC","AD","BC","BD")],1,max)) 308 | plot_dens(tt,"FEE",y,0.5,200,1) 309 | 310 | #LONG 311 | plot_topo(c(1,1,1,1,1),"c) Long branches",expression('B'[5]),expression('B'[1+2+3+4])) 312 | plot_viol(z_list,"LONG",0.7,1.02,"Accuracy",0.023) 313 | y=as.numeric(tt[tt$zone=="LONG","ABCD"]) 314 | plot_dens(tt,"LONG",y,0.5,200,1) 315 | 316 | #LONGULTRA 317 | plot_topo(c(1,1,1,1,1),"d) Extra-long branches",expression('B'[5]),expression('B'[1+2+3+4])) 318 | plot_viol(z_list,"LONGULTRA",0.6,1,"Accuracy",0.023) 319 | y=as.numeric(tt[tt$zone=="LONGULTRA","ABCD"]) 320 | plot_dens(tt,"LONGULTRA",y,1,200,1) 321 | 322 | #LONGOUT 323 | plot_topo(c(1,0,1,0,0),"e) Single long branch",expression('B'[5]),expression('B'[1])) 324 | plot_viol(z_list,"LONGOUT",0.8,1.02,"Accuracy",0.023) 325 | y=as.numeric(apply(tt[tt$zone=="LONGOUT",c("A","B","C","D")],1,max)) 326 | plot_dens(tt,"LONGOUT",y,0.5,200,1) 327 | 328 | #SHORT 329 | plot_topo(c(0,0,1,0,0),"f) Short branches",expression('B'[5]),expression('B'[1+2+3+4])) 330 | plot_viol(z_list,"SHORT",0.96,1,"Accuracy",0.004) 331 | y=as.numeric(tt[tt$zone=="SHORT","ABCD"]) 332 | plot_dens(tt,"SHORT",y,0.5,200,1) 333 | 334 | #SHORTULTRA 335 | plot_topo(c(0,0,0,0,0),"g) Extra-short branches",expression('B'[5]),expression('B'[1+2+3+4])) 336 | plot_viol(z_list,"SHORTULTRA",0.6,1,"Accuracy",0.023) 337 | y=as.numeric(tt[tt$zone=="SHORTULTRA","ABCD"]) 338 | plot_dens(tt,"SHORTULTRA",y,0.01,200,1) 339 | 340 | #SHORTOUT 341 | plot_topo(c(0,1,1,1,1),"e) Single short branch",expression('B'[5]),expression('B'[1])) 342 | plot_viol(z_list,"SHORTOUT",0.8,1.02,"Accuracy",0.023) 343 | y=as.numeric(apply(tt[tt$zone=="SHORTOUT",c("A","B","C","D","BC")],1,min)) 344 | plot_dens(tt,"SHORTOUT",y,0.5,200,1) 345 | quartz.save("Suppl_Bias_gap.jpeg", type = "jpeg",antialias=F,bg="white",dpi=400,pointsize=12) 346 | dev.off() 347 | 348 | #MAIN FIGS ACCURACY PINV GAMMA 349 | 350 | #read gap table 351 | tt_gap=table_prep_master( "~/Downloads/master_regions_gap") 352 | tt_nogap=table_prep_master( "~/Downloads/master_regions_nogap") 353 | tt_gap$zone="TOTAL" 354 | tt_nogap$zone="TOTAL" 355 | gap_acc=acc_get(tt_gap) 356 | nogap_acc=acc_get(tt_nogap) 357 | nogap150k=read.table("~/Downloads/150k.classeslab_class.txt") 358 | nogap300k=read.table("~/Downloads/300k.classeslab_class.txt") 359 | 360 | nogap150k$X5=as.numeric(nogap150k$V1==rep(c(0,1,2),times=c(5000,5000,5000))) 361 | nogap150k[,'X5']=ifelse(nogap150k[,'X5']!=1,0,1) 362 | 363 | nogap300k$X5=as.numeric(nogap300k$V1==rep(c(0,1,2),times=c(5000,5000,5000))) 364 | nogap300k[,'X5']=ifelse(nogap300k[,'X5']!=1,0,1) 365 | 366 | nogap150kboot=boot_viol_big(nogap150k) 367 | nogap300kboot=boot_viol_big(nogap300k) 368 | acc150k=mean(nogap150k$X5) 369 | acc300k=mean(nogap300k$X5) 370 | 371 | 372 | #Accuracy 373 | vioplot(gap_acc$TOTAL$z_boot[,1],nogap_acc$TOTAL$z_boot[,1],gap_acc$TOTAL$z_boot[,2],nogap_acc$TOTAL$z_boot[,2],gap_acc$TOTAL$z_boot[,3],nogap_acc$TOTAL$z_boot[,3],gap_acc$TOTAL$z_boot[,4],nogap_acc$TOTAL$z_boot[,4],gap_acc$TOTAL$z_boot[,5],nogap_acc$TOTAL$z_boot[,5],nogap150kboot[,1],nogap300kboot[,1],col=c(rep(c("grey45","grey80"),5),rep("grey80",2)),pchMed=16,names=c(rep("",12))) 374 | text(x=1:12, 0.67, labels=c(rep(c("MP","NJ","ML","BI"),each=2),"CNN50k","CNN50k","CNN150k","CNN300k"),srt = 45, pos = 1, xpd = TRUE,cex=0.8) 375 | legend("topleft",legend=c("gapped","ungapped"),col=c("grey45","grey80"),pch=19,cex=0.8) 376 | title(ylab="Accuracy") 377 | text(1:12,apply(cbind(gap_acc$TOTAL$z_boot,nogap_acc$TOTAL$z_boot,nogap150kboot,nogap300kboot),2,max)[c(1,6,2,7,3,8,4,9,5,10,11,12)]+0.004,round(c(gap_acc$TOTAL$z_acc,nogap_acc$TOTAL$z_acc,acc150k,acc300k)[c(1,6,2,7,3,8,4,9,5,10,11,12)],digits=3),cex=0.6) 378 | 379 | #Invariant + Gamma 380 | 381 | pinvar1=data.frame(inv=tt_gap[,"I"],method=rep(c("MP","NJ","ML","BI","CNN"),each=15000),inf=ifelse(as.vector(as.matrix(tt_gap[,c("MP","NJ","ML","BI","CNN")]))==1,"correct","incorrect")) 382 | pinvarpl1=ggplot(pinvar1, aes(x=method, y=inv, fill=inf))+geom_violin(trim=T,size=0.5,bw=0.05)+ theme_classic()+scale_x_discrete(limits=c("MP","NJ","ML","BI","CNN"))+scale_fill_manual(values=c("grey","white"))+geom_boxplot(width=0.1,position=position_dodge(0.9))+ 383 | labs(title="Invariant sites (+I model)", y = expression('p'[inv]),x="",fill = "")+ stat_summary(fun.y=median, geom="point", size=2.5, color="black",position=position_dodge(0.9))+theme(legend.position = c(0.75, 1.1),legend.direction = "horizontal") 384 | 385 | pinvar2=data.frame(inv=tt_nogap[,"I"],method=rep(c("MP","NJ","ML","BI","CNN"),each=15000),inf=ifelse(as.vector(as.matrix(tt_nogap[,c("MP","NJ","ML","BI","CNN")]))==1,"correct","incorrect")) 386 | pinvarpl2=ggplot(pinvar2, aes(x=method, y=inv, fill=inf))+geom_violin(trim=T,size=0.5,bw=0.05)+ theme_classic()+scale_x_discrete(limits=c("MP","NJ","ML","BI","CNN"))+scale_fill_manual(values=c("grey","white"))+geom_boxplot(width=0.1,position=position_dodge(0.9))+ 387 | labs(title="Invariant sites (+I model)", y = expression('p'[inv]),x="",fill = "")+ stat_summary(fun.y=median, geom="point", size=2.5, color="black",position=position_dodge(0.9))+theme(legend.position = "none") 388 | 389 | gam1=data.frame(inv=tt_gap[,c("G")],method=rep(c("MP","NJ","ML","BI","CNN"),each=15000),inf=ifelse(as.vector(as.matrix(tt_gap[,c("MP","NJ","ML","BI","CNN")]))==1,"correct","incorrect")) 390 | gampl1=ggplot(gam1, aes(x=method, y=inv, fill=inf))+geom_violin(trim=T,bw=0.05)+ theme_classic()+scale_x_discrete(limits=c("MP","NJ","ML","BI","CNN"))+scale_fill_manual(values=c("grey","white"))+geom_boxplot(width=0.1,position=position_dodge(0.9))+ 391 | labs(title=expression(paste("Gamma (+",Gamma," model)")), y = expression(alpha),x="",fill = "")+ stat_summary(fun.y=median, geom="point", size=2.5, color="black",position=position_dodge(0.9))+theme(legend.position = "none") 392 | 393 | gam2=data.frame(inv=tt_nogap[,c("G")],method=rep(c("MP","NJ","ML","BI","CNN"),each=15000),inf=ifelse(as.vector(as.matrix(tt_nogap[,c("MP","NJ","ML","BI","CNN")]))==1,"correct","incorrect")) 394 | gampl2=ggplot(gam2, aes(x=method, y=inv, fill=inf))+geom_violin(trim=T,bw=0.05)+ theme_classic()+scale_x_discrete(limits=c("MP","NJ","ML","BI","CNN"))+scale_fill_manual(values=c("grey","white"))+geom_boxplot(width=0.1,position=position_dodge(0.9))+ 395 | labs(title=expression(paste("Gamma (+",Gamma," model)")), y = expression(alpha),x="",fill = "")+ stat_summary(fun.y=median, geom="point", size=2.5, color="black",position=position_dodge(0.9))+theme(legend.position = "none") 396 | 397 | ggarrange(pinvarpl1,pinvarpl2,gampl1,gampl2,nrow=2,ncol=2,labels = c("a)", "b)","c)","d)")) 398 | 399 | 400 | #Reliability 401 | pro1=apply(tt_gap[,c("CPP1","CPP2","CPP3")],1,max) 402 | bootboot1=data.frame(boot=as.vector(as.matrix(cbind(tt_gap[,c("pars_bs","nj_bs","ml_bs","bi_pp","CNN_bs")],pro1))),method=rep(c("MP BS","NJ BS","ML BS","Bayes PP","CNN BS","CNN CPP"),each=15000),inf=ifelse(as.vector(as.matrix(tt_gap[,c("MP","NJ","ML","BI","CNN","CNN")]))==1,"correct","incorrect")) 403 | bootbootpl1=ggplot(bootboot1, aes(x=method, y=boot, fill=inf))+geom_violin(trim=T,bw=0.15)+ theme_classic()+scale_x_discrete(limits=c("MP BS","NJ BS","ML BS","Bayes PP","CNN BS","CNN CPP"))+scale_fill_manual(values=c("grey","white"))+ 404 | geom_boxplot(width=0.05,position=position_dodge(0.9),outlier.shape = NA) +labs(title="", y = "Value of reliability measure",x="",fill = "")+ stat_summary(fun.y=median, geom="point", size=2, color="black",position=position_dodge(0.9))+theme(legend.position = c(0.2, 1.05),legend.direction = "horizontal") 405 | 406 | 407 | pro2=apply(tt_nogap[,c("CPP1","CPP2","CPP3")],1,max) 408 | bootboot2=data.frame(boot=as.vector(as.matrix(cbind(tt_nogap[,c("pars_bs","nj_bs","ml_bs","bi_pp","CNN_bs")],pro2))),method=rep(c("MP BS","NJ BS","ML BS","Bayes PP","CNN BS","CNN CPP"),each=15000),inf=ifelse(as.vector(as.matrix(tt_nogap[,c("MP","NJ","ML","BI","CNN","CNN")]))==1,"correct","incorrect")) 409 | bootbootpl2=ggplot(bootboot2, aes(x=method, y=boot, fill=inf))+geom_violin(trim=T,bw=0.15)+ theme_classic()+scale_x_discrete(limits=c("MP BS","NJ BS","ML BS","Bayes PP","CNN BS","CNN CPP"))+scale_fill_manual(values=c("grey","white"))+ 410 | geom_boxplot(width=0.05,position=position_dodge(0.9),outlier.shape = NA) +labs(title="", y = "Value of reliability measure",x="",fill = "")+ stat_summary(fun.y=median, geom="point", size=2, color="black",position=position_dodge(0.9))+theme(legend.position = "none") 411 | 412 | ggarrange(bootbootpl1,bootbootpl2,nrow=2,labels = c("a)", "b)")) 413 | 414 | --------------------------------------------------------------------------------