├── src ├── PhyloAcc-GT │ ├── placeholder │ ├── profile.h │ ├── newick2.h │ ├── Makefile │ ├── newick.h │ ├── profile.cpp │ ├── genetree.hpp │ ├── utils.cpp │ ├── bpp_c2.cpp │ └── utils.h ├── PhyloAcc-interface │ ├── phyloacc_lib │ │ ├── __init__.py │ │ ├── info.yaml │ │ ├── pa-welcome.txt │ │ ├── labeltree.py │ │ ├── output.py │ │ ├── post_params.py │ │ ├── html.py │ │ ├── params.py │ │ └── treeio.py │ └── phyloacc.py ├── PhyloAcc-ST-GBGC │ ├── PhyloAcc_gBGC │ ├── SRC │ │ ├── profile.h │ │ ├── newick.h │ │ ├── profile.cpp │ │ ├── utils.cpp │ │ ├── utils.h │ │ └── newick.cpp │ └── Makefile └── PhyloAcc-ST │ ├── profile.h │ ├── newick.h │ ├── Makefile │ ├── profile.cpp │ ├── utils.cpp │ ├── utils.h │ └── newick.cpp ├── dev ├── TODO.md └── Makefile-crossplat-dev ├── .gitignore ├── bioconda_test.sh ├── Makefile ├── docs └── v1 │ ├── READMEv1_OUTPUT.md │ ├── READMEv1_PARAMETER.md │ └── READMEv1.md ├── CHANGELOG.md └── phyloacc-cfg-template.yaml /src/PhyloAcc-GT/placeholder: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc_lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST-GBGC/PhyloAcc_gBGC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phyloacc/PhyloAcc/HEAD/src/PhyloAcc-ST-GBGC/PhyloAcc_gBGC -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc_lib/info.yaml: -------------------------------------------------------------------------------- 1 | version: 2.4.3 2 | releasedate-major: April 1, 2022 3 | releasedate-minor: September 26, 2024 4 | releasedate-patch: March 25, 2025 5 | devs: Zhirui Hu, Han Yan, Gregg Thomas, Tim Sackton, Scott Edwards, and 
Jun Liu 6 | doi: https://doi.org/10.1093/molbev/msz049 7 | http: https://phyloacc.github.io 8 | github: https://github.com/phyloacc/PhyloAcc 9 | -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc_lib/pa-welcome.txt: -------------------------------------------------------------------------------- 1 | _____ _ _ ___ 2 | | ___ \ | | | / _ \ 3 | | |_/ / |__ _ _| | ___ / /_\ \ ___ ___ 4 | | __/| '_ \| | | | |/ _ \| _ |/ __/ __| 5 | | | | | | | |_| | | (_) | | | | (_| (__ 6 | \_| |_| |_|\__, |_|\___/\_| |_/\___\___| 7 | __/ | 8 | |___/ -------------------------------------------------------------------------------- /src/PhyloAcc-ST/profile.h: -------------------------------------------------------------------------------- 1 | #ifndef PROFILE_H_INCLUDED 2 | #define PROFILE_H_INCLUDED 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | struct PhyloProf 13 | { 14 | unsigned G, S, C; //, P, CG; 15 | vector< string > species_names; 16 | vector< string > element_names; 17 | vector< double* > element_pos; 18 | vector element_id; 19 | vector< string> X; 20 | 21 | }; 22 | 23 | // load the phylogenetic profile 24 | PhyloProf LoadPhyloProfiles(string profile_path,string segment_path,string segment_ID=""); 25 | 26 | #endif // PROFILE_H_INCLUDED 27 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST-GBGC/SRC/profile.h: -------------------------------------------------------------------------------- 1 | #ifndef PROFILE_H_INCLUDED 2 | #define PROFILE_H_INCLUDED 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | struct PhyloProf 13 | { 14 | unsigned G, S, C; //, P, CG; 15 | vector< string > species_names; 16 | vector< string > element_names; 17 | vector< double* > element_pos; 18 | vector element_id; 19 | vector< string> X; 20 | 21 | }; 22 | 23 | // load the 
phylogenetic profile 24 | PhyloProf LoadPhyloProfiles(string profile_path,string segment_path,string segment_ID=""); 25 | 26 | #endif // PROFILE_H_INCLUDED 27 | -------------------------------------------------------------------------------- /src/PhyloAcc-GT/profile.h: -------------------------------------------------------------------------------- 1 | #ifndef PROFILE_H_INCLUDED 2 | #define PROFILE_H_INCLUDED 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | struct PhyloProf 13 | { 14 | unsigned G, S, C; //, P, CG; 15 | vector< string > species_names; 16 | vector< string > element_names; 17 | vector< string > element_tree; 18 | vector< double* > element_pos; 19 | //vector element_pos; 20 | vector element_id; 21 | vector< string> X; 22 | 23 | }; 24 | 25 | // load the phylogenetic profile 26 | PhyloProf LoadPhyloProfiles(string profile_path,string segment_path,string segment_ID=""); 27 | 28 | #endif // PROFILE_H_INCLUDED 29 | -------------------------------------------------------------------------------- /src/PhyloAcc-GT/newick2.h: -------------------------------------------------------------------------------- 1 | #ifndef NEWICK2_H_INCLUDED 2 | #define NEWICK2_H_INCLUDED 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | //#include "newick.h" 9 | 10 | using namespace std; 11 | using namespace arma; 12 | 13 | struct PhyloTree_theta 14 | { 15 | int S; 16 | vector< string > species_names; // species names 17 | vector< string > nodes_names; // all nodes names 18 | vector< double > distances; // branch distances 19 | vector< vector > dag; 20 | }; 21 | 22 | 23 | // load the phylogenetic tree 24 | PhyloTree_theta LoadPhyloTree_theta(string params_path); 25 | //void getThetas(PhyloTree tree_sub, PhyloTree tree_coal); 26 | #endif // NEWICK_H_INCLUDED 27 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST-GBGC/Makefile: 
-------------------------------------------------------------------------------- 1 | TARGET=PhyloAcc_gBGC 2 | ifeq ($(shell uname),Darwin) 3 | CXX=g++-7 4 | else 5 | CXX=g++ 6 | endif 7 | 8 | CFLAGS=-Wall -g -O2 -std=c++11 9 | LDFLAGS=-lgsl -lm -lgslcblas -larmadillo -fopenmp 10 | 11 | GSL_INCLUDE=/usr/local/include/ 12 | GSL_LIB=/usr/local/lib/ 13 | 14 | SRC_DIR=SRC 15 | SRCS=$(SRC_DIR)/*.cpp 16 | INCLUDES=$(SRC_DIR)/*.h $(SRC_DIR)/*.hpp 17 | PREFIX=/usr/local 18 | 19 | $(TARGET): $(SRCS) $(INCLUDES) 20 | $(CXX) $(CFLAGS) -I$(GSL_INCLUDE) -L$(GSL_LIB) $(SRCS) -o $(TARGET) $(LDFLAGS) 21 | 22 | .PHONY: install 23 | install: $(TARGET) 24 | cp $< $(DESTDIR)$(PREFIX)/bin/$(TARGET) 25 | 26 | .PHONY: uninstall 27 | uninstall: 28 | rm -f $(DESTDIR)$(PREFIX)/bin/$(TARGET) 29 | 30 | .PHONY: clean 31 | clean: 32 | rm -f *.o *~ $(TARGET) 33 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST/newick.h: -------------------------------------------------------------------------------- 1 | #ifndef NEWICK_H_INCLUDED 2 | #define NEWICK_H_INCLUDED 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | using namespace arma; 10 | 11 | struct PhyloTree 12 | { 13 | int S; // num of living species 14 | vector< string > species_names; // species names 15 | vector< string > nodes_names; // all nodes names 16 | vector< vector > dag; // connection matrix 17 | vector< double > distances; // branch distances 18 | // base compostion and substritution rate 19 | vec pi; 20 | mat subs_rate; 21 | }; 22 | 23 | // load the phylogenetic tree 24 | PhyloTree LoadPhyloTree(string params_path); 25 | 26 | #endif // NEWICK_H_INCLUDED 27 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST-GBGC/SRC/newick.h: -------------------------------------------------------------------------------- 1 | #ifndef NEWICK_H_INCLUDED 2 | #define NEWICK_H_INCLUDED 3 | 4 | #include 5 | #include 6 | #include 7 | 8 
| using namespace std; 9 | using namespace arma; 10 | 11 | struct PhyloTree 12 | { 13 | int S; // num of living species 14 | vector< string > species_names; // species names 15 | vector< string > nodes_names; // all nodes names 16 | vector< vector > dag; // connection matrix 17 | vector< double > distances; // branch distances 18 | // base compostion and substritution rate 19 | vec pi; 20 | mat subs_rate; 21 | }; 22 | 23 | // load the phylogenetic tree 24 | PhyloTree LoadPhyloTree(string params_path); 25 | 26 | #endif // NEWICK_H_INCLUDED 27 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST/Makefile: -------------------------------------------------------------------------------- 1 | TARGET=PhyloAcc 2 | ifeq ($(shell uname),Darwin) 3 | CXX=g++-7 4 | else 5 | CXX=g++ 6 | endif 7 | 8 | CFLAGS=-Wall -g -O2 -std=c++11 9 | LDFLAGS=-lgsl -lm -lgslcblas -larmadillo -fopenmp 10 | 11 | GSL_HOME=/usr/local/ 12 | GSL_INCLUDE=${GSL_HOME}/include/ 13 | GSL_LIB=${GSL_HOME}/lib/ 14 | 15 | SRC_DIR=. 16 | SRCS=$(SRC_DIR)/*.cpp 17 | INCLUDES=$(SRC_DIR)/*.h $(SRC_DIR)/*.hpp 18 | PREFIX=/usr/local 19 | 20 | $(TARGET): $(SRCS) $(INCLUDES) 21 | $(CXX) $(CFLAGS) -I$(GSL_INCLUDE) -L$(GSL_LIB) $(SRCS) -o $(TARGET) $(LDFLAGS) 22 | 23 | .PHONY: install 24 | install: $(TARGET) 25 | cp $< $(DESTDIR)$(PREFIX)/bin/$(TARGET) 26 | 27 | .PHONY: uninstall 28 | uninstall: 29 | rm -f $(DESTDIR)$(PREFIX)/bin/$(TARGET) 30 | 31 | .PHONY: clean 32 | clean: 33 | rm -f *.o *~ $(TARGET) 34 | -------------------------------------------------------------------------------- /src/PhyloAcc-GT/Makefile: -------------------------------------------------------------------------------- 1 | TARGET=PhyloAcc-GT_piQ 2 | CFLAGS=-Wall -g -O2 -std=c++11 3 | LDFLAGS=-lgsl -lm -lgslcblas -larmadillo -fopenmp 4 | 5 | GSL_INCLUDE=${GSL_HOME}/include 6 | GSL_LIB=${GSL_HOME}/lib64 7 | 8 | SRC_DIR=. 
9 | SRCS=$(SRC_DIR)/*.cpp 10 | INCLUDES=$(SRC_DIR)/*.h $(SRC_DIR)/*.hpp 11 | PREFIX=/usr/local 12 | ARMA900_INCLUDE=/n/home05/hyan/armadillo-9.900.1/include 13 | ARMA900_LIB=/n/home05/hyan/armadillo-9.900.1 14 | 15 | $(TARGET): $(SRCS) $(INCLUDES) 16 | $(CXX) $(CFLAGS) -I$(GSL_INCLUDE) -I$(ARMA900_INCLUDE) -L$(GSL_LIB) -L$(ARMA900_LIB) $(SRCS) -o $(TARGET) $(LDFLAGS) 17 | 18 | .PHONY: install 19 | install: $(TARGET) 20 | cp $< $(DESTDIR)$(PREFIX)/bin/$(TARGET) 21 | 22 | .PHONY: uninstall 23 | uninstall: 24 | rm -f $(DESTDIR)$(PREFIX)/bin/$(TARGET) 25 | 26 | .PHONY: clean 27 | clean: 28 | rm -f *.o *~ $(TARGET) 29 | -------------------------------------------------------------------------------- /dev/TODO.md: -------------------------------------------------------------------------------- 1 | TODO: 2 | - ~~Split the Hu-etal-2019 folder into a separate repo within the PhyloAcc organization~~ (completed 02.18.2022) 3 | - ~~What is _config.yml? Seems to be related to building the website. Figure out where it needs to go~~ (removed) 4 | - ~~Update docs~~ 5 | - ~~Update links~~ 6 | - ~~Remake the original PhyloAcc web page so that the link in the paper is still active; this page can just point to the new repo or page~~ 7 | - ~~Make release for these changes and update conda recipe (meta.yaml) to point to it~~ 8 | - ~~Fork bioconda and upload recipe~~ 9 | - Add a simple way to test the PhyloAcc-ST binary, maybe --version option and/or a very small test dataset 10 | - ~~Remove .py extension from interface (difficult for Windows)~~ 11 | - ~~Resolve or transfer interface issues~~ 12 | - Add option for number of quartets to sample for sCF 13 | - Responsive output for variable screen widths -------------------------------------------------------------------------------- /src/PhyloAcc-GT/newick.h: -------------------------------------------------------------------------------- 1 | #ifndef NEWICK_H_INCLUDED 2 | #define NEWICK_H_INCLUDED 3 | 4 | #include 5 | #include 6 | #include 
7 | #include 8 | 9 | using namespace std; 10 | using namespace arma; 11 | 12 | struct PhyloTree 13 | { 14 | int S; // num of living species 15 | vector< string > species_names; // species names 16 | vector< string > nodes_names; // all nodes names 17 | vector< vector > dag; // connection matrix 18 | vector< double > distances; // branch distances 19 | vector< double > thetas; // 2\mu N 20 | // base compostion and substritution rate 21 | vec pi; 22 | mat subs_rate; 23 | }; 24 | 25 | // load the phylogenetic tree 26 | PhyloTree LoadPhyloTree(string params_path); 27 | PhyloTree LoadPhyloTree_text(string params_path, map& speciesname); 28 | #endif // NEWICK_H_INCLUDED 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | src/PhyloAcc-interface/phyloacc_lib/__pycache__/ 2 | *.pyc 3 | .snakemake 4 | *.code-workspace 5 | src/PhyloAcc-interface/notes.txt 6 | src/PhyloAcc-interface/phyloacc_lib/templates_old.py 7 | src/PhyloAcc-interface/*.errlog 8 | src/PhyloAcc-interface/tests/ 9 | support/ 10 | bin/ 11 | PhyloAcc-ST 12 | PhyloAcc-GT 13 | blah 14 | tmp 15 | run.sh 16 | build_gt.sh 17 | build_st.sh 18 | .vscode 19 | phyloacc-dev* 20 | dev/phyloacc-dev* 21 | 22 | 23 | # Compiled source # 24 | ################### 25 | *.com 26 | *.class 27 | *.dll 28 | *.exe 29 | *.o 30 | *.so 31 | 32 | # Packages # 33 | ############ 34 | # it's better to unpack these files and commit the raw source 35 | # git has its own built in compression methods 36 | *.7z 37 | *.dmg 38 | *.gz 39 | *.iso 40 | *.jar 41 | *.rar 42 | *.tar 43 | *.zip 44 | 45 | # Logs and databases # 46 | ###################### 47 | *.log 48 | *.sql 49 | *.sqlite 50 | 51 | # OS generated files # 52 | ###################### 53 | .DS_Store 54 | .DS_Store? 
55 | ._* 56 | .Spotlight-V100 57 | .Trashes 58 | ehthumbs.db 59 | Thumbs.db 60 | 61 | -------------------------------------------------------------------------------- /bioconda_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | set -x 5 | 6 | # Runs test for PhyloAcc, including on a small simulated dataset that contains a fasta file, mod file, 7 | # bed file, id subset file, and config file. 8 | 9 | TMP=$(mktemp -d) 10 | cd $TMP 11 | 12 | echo " ** DOWNLOADING TEST DATA." 13 | files=( 14 | "bioconda-test-cfg.yaml" 15 | "id-subset.txt" 16 | "ratite.mod" 17 | "simu_500_200_diffr_2-1.bed" 18 | "simu_500_200_diffr_2-1.noanc.fa" 19 | ) 20 | 21 | for file in "${files[@]}"; do 22 | if ! wget -q "https://github.com/phyloacc/PhyloAcc-test-data/raw/main/bioconda-test-data/$file"; then 23 | echo "Failed to download $file" >&2 24 | exit 1 25 | fi 26 | done 27 | echo " ** TEST DATA DOWNLOAD OK." 28 | 29 | echo " ** BEGIN DEPCHECK TEST." 30 | if ! phyloacc --depcheck; then 31 | echo " ** ERROR: Dependency check failed. Please ensure all dependencies are installed." >&2 32 | exit 1 33 | fi 34 | echo " ** DEPCHECK TEST OK." 35 | 36 | echo " ** BEGIN PHYLOACC INTERFACE TEST." 37 | if ! phyloacc --config bioconda-test-cfg.yaml --local; then 38 | echo " ** ERROR: PhyloAcc interface test failed. Please check the configuration and installation." >&2 39 | exit 1 40 | fi 41 | echo " ** INTERFACE TEST OK." 42 | 43 | echo " ** BEGIN WORKFLOW TEST." 44 | if ! snakemake -p --jobs 1 --cores 1 -s phyloacc-bioconda-test/phyloacc-job-files/snakemake/run_phyloacc.smk --configfile phyloacc-bioconda-test/phyloacc-job-files/snakemake/phyloacc-config.yaml; then 45 | echo " ** ERROR: PhyloAcc workflow test failed. Please check the Snakemake configuration and log files." >&2 46 | exit 1 47 | fi 48 | echo " ** WORKFLOW TEST OK." 49 | 50 | echo " ** BEGIN POST-PROCESSING TEST." 51 | if ! 
phyloacc_post.py -h; then 52 | echo " ** ERROR: Failed to display help message for phyloacc_post.py" >&2 53 | exit 1 54 | fi 55 | if ! phyloacc_post.py -i phyloacc-bioconda-test/; then 56 | echo " ** ERROR: Post-processing test failed. Please check the input directory and log files." >&2 57 | exit 1 58 | fi 59 | echo " ** POST-PROCESSING TEST OK." -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TARGET_ST=PhyloAcc-ST 2 | TARGET_GT=PhyloAcc-GT 3 | # The name of the compiled binary 4 | 5 | # make PREFIX=$CONDA_PREFIX 6 | # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/ 7 | # To make locally with the conda environment prefix 8 | 9 | CXX=g++ 10 | # Which compiler to use 11 | # Note: g++ 5.4 resulted in several errors while compiling: SRC/bpp_c2.cpp:345:12: error: ‘::isnan’ has not been declared 12 | # Require g++ 7+ 13 | 14 | $(info $$CXX is [${CXX}]) 15 | # Report the compiler used 16 | 17 | $(info $$PREFIX is [${PREFIX}]) 18 | # Report the PATH prefix 19 | 20 | GSL_INCLUDE=${PREFIX}/include/ 21 | GSL_LIB=${PREFIX}/lib/ 22 | $(info $$GSL_INCLUDE is [${GSL_INCLUDE}]) 23 | $(info $$GSL_LIB is [${GSL_LIB}]) 24 | # GSL paths with the conda environment prefix 25 | 26 | CFLAGS=-Wall -g -O2 -std=c++11 27 | LDFLAGS=-lgsl -lm -lgslcblas -larmadillo -fopenmp 28 | # Options for the g++ commands 29 | 30 | ############ 31 | 32 | SRC_DIR_ST=src/$(TARGET_ST)/ 33 | SRCS_ST=$(SRC_DIR_ST)/*.cpp 34 | INCLUDES_ST=$(SRC_DIR_ST)/*.h $(SRC_DIR_ST)/*.hpp 35 | # Locations of files to compile 36 | 37 | $(TARGET_ST): $(SRCS_ST) $(INCLUDES_ST) 38 | $(CXX) $(CFLAGS) -I$(GSL_INCLUDE) -L$(GSL_LIB) $(SRCS_ST) -o $(TARGET_ST) $(LDFLAGS) 39 | # g++ commands for each file 40 | # Species tree version 41 | ############ 42 | 43 | SRC_DIR_GT=src/$(TARGET_GT)/ 44 | SRCS_GT=$(SRC_DIR_GT)/*.cpp 45 | INCLUDES_GT=$(SRC_DIR_GT)/*.h $(SRC_DIR_GT)/*.hpp 46 | # Locations of 
files to compile 47 | 48 | $(TARGET_GT): $(SRCS_GT) $(INCLUDES_GT) 49 | $(CXX) $(CFLAGS) -I$(GSL_INCLUDE) -L$(GSL_LIB) $(SRCS_GT) -o $(TARGET_GT) $(LDFLAGS) 50 | # g++ commands for each file 51 | # Gene tree version 52 | ############ 53 | 54 | .PHONY: install 55 | install: $(TARGET_ST) $(TARGET_GT) 56 | cp $(TARGET_ST) $(PREFIX)/bin/$(TARGET_ST) 57 | cp $(TARGET_GT) $(PREFIX)/bin/$(TARGET_GT) 58 | # Command to install by moving binary 59 | 60 | .PHONY: uninstall 61 | uninstall: 62 | rm -f $(PREFIX)/bin/$(TARGET_ST) 63 | rm -f $(PREFIX)/bin/$(TARGET_GT) 64 | # Command to uninstall by removing binary 65 | 66 | .PHONY: clean 67 | clean: 68 | rm -f *.o *~ $(TARGET_ST) 69 | rm -f *.o *~ $(TARGET_GT) 70 | # Command to remove all compiled files to make a clean install -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc_lib/labeltree.py: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | # Helper function to transfer labels from a source tree to an input tree. 3 | # Necessary for the trees generated from ASTRAL to be input into PhyloAcc 4 | # since the labels need to match the tree in the mod file. 
#
# Gregg Thomas, March 2025
#############################################################################

import phyloacc_lib.tree as TREE
import re

#############################################################################

def transferLabels(source_tree_file, input_tree_file, output_tree_file):
    """Write input_tree_file's tree to output_tree_file using the labels of source_tree_file.

    Parameters:
        source_tree_file: path to the tree file whose labels are copied.
        input_tree_file:  path to the tree file that receives those labels.
        output_tree_file: path the relabeled tree string is written to
                          (opened in 'w' mode, so existing content is replaced).

    Returns nothing; the only result is the file written to output_tree_file.
    """

    with open(source_tree_file, 'r') as src_handle:
        with open(input_tree_file, 'r') as in_handle:
            reference_tree = TREE.Tree(src_handle.read());
            target_tree = TREE.Tree(in_handle.read());
    # Both files are opened first, then each file's full text is handed to
    # TREE.Tree to build the two tree objects

    target_tree.label.update(reference_tree.label);
    # Entries in the source tree's label mapping overwrite those of the input tree
    # NOTE(review): presumably both trees use the same internal <#> node ids as
    # keys -- confirm against phyloacc_lib.tree

    stripped_tree_str = re.sub(r'(\)<\d+>)(?!,)(?:<\d+>|[^:<>\(\)]+)+(?=:)', r'\1', target_tree.tree_str);
    # Keep only the ")<#>" part of each match, dropping whatever label text sits
    # between it and the following ":" (i.e. the input tree's own labels)

    relabeled_tree_str = stripped_tree_str;
    for node in target_tree.label:
        if node in target_tree.tips:
            continue;
        # Tips are left untouched; only non-tip entries are substituted

        relabeled_tree_str = relabeled_tree_str.replace(node, target_tree.label[node]);
        # Swap every occurrence of the node key for its (transferred) label

    final_tree_str = relabeled_tree_str + ";";
    # Terminate the tree string with a semicolon

    with open(output_tree_file, 'w') as out_handle:
        out_handle.write(final_tree_str);
    # Save the relabeled tree string to the output file

#############################################################################
/dev/Makefile-crossplat-dev: -------------------------------------------------------------------------------- 1 | TARGET=PhyloAcc-ST 2 | # The name of the compiled binary 3 | 4 | # ifeq ($(shell uname),Darwin) 5 | # CXX=g++-7 6 | # else 7 | # CXX=g++ 8 | # endif 9 | 10 | # ifeq ($(OS),Windows_NT) 11 | # CXX=g++ 12 | # GSL_INCLUDE=${LIBRARY_INC} 13 | # GSL_LIB=${LIBRARY_LIB} 14 | # # GSL paths with the conda environment prefix 15 | ifeq ($(shell uname),Darwin) 16 | CXX=clang++ 17 | GSL_INCLUDE=${PREFIX}/include/ 18 | GSL_LIB=${PREFIX}/lib/ 19 | CFLAGS=-Wall -g -O2 -std=c++11 -Wl, -rpath ${PREFIX}/lib 20 | else 21 | CXX=g++ 22 | GSL_INCLUDE=${PREFIX}/include/ 23 | GSL_LIB=${PREFIX}/lib/ 24 | CFLAGS=-Wall -g -O2 -std=c++11 25 | # GSL paths with the conda environment prefix 26 | endif 27 | 28 | # ifeq ($(shell uname),Linux Darwin) 29 | # CXX=g++-7 30 | # else 31 | # CXX=g++ 32 | # endif 33 | #CXX=g++-7 34 | # Which compiler to use. 35 | # Note: g++ 5.4 resulted in several errors while compiling: SRC/bpp_c2.cpp:345:12: error: ‘::isnan’ has not been declared 36 | # Switched to g++-7 37 | 38 | $(info $$CXX is [${CXX}]) 39 | # Report the compiler used 40 | 41 | $(info $$PREFIX is [${PREFIX}]) 42 | $(info $$GSL_INCLUDE is [${GSL_INCLUDE}]) 43 | $(info $$GSL_LIB is [${GSL_LIB}]) 44 | 45 | LDFLAGS=-lgsl -lm -lgslcblas -larmadillo -fopenmp 46 | # Options for the g++ commands 47 | 48 | SRC_DIR=src/$(TARGET)/ 49 | SRCS=$(SRC_DIR)/*.cpp 50 | INCLUDES=$(SRC_DIR)/*.h $(SRC_DIR)/*.hpp 51 | # Locations of files to compile 52 | 53 | #export LD_LIBRARY_PATH=${PREFIX}/lib/ 54 | #export LD_RUN_PATH=${PREFIX}/lib/ 55 | # LD paths for the dependencies installed through conda (armadillo, gsl, blas, etc.) 
56 | # e.g.: 57 | # libcblas.so.3, needed by /home/gregg/anaconda3/envs/phyloacc/lib//libgsl.so, not found 58 | # However, these don't appear to be needed with conda 59 | 60 | $(TARGET): $(SRCS) $(INCLUDES) 61 | $(CXX) $(CFLAGS) -I$(GSL_INCLUDE) -L$(GSL_LIB) $(SRCS) -o $(TARGET) $(LDFLAGS) 62 | # g++ commands for each file 63 | 64 | .PHONY: install 65 | install: $(TARGET) 66 | ifeq ($(OS),Windows_NT) 67 | cp $< $(LIBRARY_BIN)/$(TARGET) 68 | # GSL paths with the conda environment prefix 69 | else 70 | cp $< $(PREFIX)/bin/$(TARGET) 71 | endif 72 | # Command to install by moving binary 73 | 74 | .PHONY: uninstall 75 | uninstall: 76 | rm -f $(PREFIX)/bin/$(TARGET) 77 | # Command to uninstall by removing binary 78 | 79 | .PHONY: clean 80 | clean: 81 | rm -f *.o *~ $(TARGET) 82 | # Command to remove all compiled files to make a clean install 83 | 84 | 85 | 86 | # Install notes: 87 | # conda install -c conda-forge armadillo -> includes: conda install lapack 88 | # conda install -c conda-forge gsl 89 | # no package called: conda install atlas 90 | # 91 | # also had to: sudo apt install make 92 | # sudo apt update 93 | # sudo apt install g++ gdb make ninja-build rsync zip 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST/profile.cpp: -------------------------------------------------------------------------------- 1 | #include "profile.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "utils.h" 13 | 14 | using namespace std; 15 | 16 | // load the phylogenetic profile 17 | PhyloProf LoadPhyloProfiles(string profile_path, string segment_path, string segment_ID) 18 | { 19 | PhyloProf prof; 20 | vector seqs; 21 | prof.G=0; 22 | string linestr; 23 | ifstream in_prof(profile_path.c_str()); 24 | 25 | if (!in_prof) 26 | { 27 | cerr << "(Error. 
Cannot open the phylogenetic profile input file: " << profile_path << ")" << endl; 28 | exit(1); 29 | } 30 | 31 | 32 | 33 | 34 | // count the num of species, base pairs and load the profiles 35 | string wholeline=""; 36 | while(!in_prof.eof()) 37 | { 38 | std::getline(in_prof, linestr); 39 | linestr = strutils::trim(linestr); 40 | if(!strncmp(linestr.c_str(),">", 1)) { 41 | string tmp = strutils::trim(linestr.substr(1)); 42 | prof.species_names.push_back(tmp); 43 | 44 | if(prof.G==0) prof.G = wholeline.length(); 45 | else assert(wholeline.length() == prof.G); 46 | 47 | if(prof.G>0) { 48 | wholeline =strutils::ToLowerCase(wholeline); 49 | prof.X.push_back(wholeline); 50 | } 51 | wholeline = ""; 52 | } 53 | else { 54 | 55 | wholeline += linestr; 56 | } 57 | } 58 | 59 | if(prof.G==0) prof.G = wholeline.length(); 60 | wholeline =strutils::ToLowerCase(wholeline); 61 | prof.X.push_back(wholeline); 62 | 63 | //cout < line_splits = strutils::split(linestr, '\t'); 84 | if(line_splits.size()<3) break; 85 | prof.element_names.push_back(line_splits[0]); 86 | double* tmp = new double[3]; 87 | tmp[0] = atoi(line_splits[1].c_str()); 88 | tmp[1] = atoi(line_splits[2].c_str()); 89 | //tmp[2] = atof(line_splits[4].c_str()); //add null scale!! 90 | prof.element_pos.push_back(tmp); 91 | } 92 | prof.C = prof.element_names.size(); 93 | 94 | in_segment.close(); 95 | 96 | 97 | // read in ID 98 | if(segment_ID!="") 99 | { 100 | string segment_path2 = segment_ID + ".txt"; 101 | in_segment.open(segment_path2.c_str()); 102 | 103 | if (!in_segment) 104 | { 105 | cerr << "(Error. 
Cannot open the segment input txt file: " << segment_path2 << ")" << endl; 106 | exit(1); 107 | } 108 | 109 | while(!in_segment.eof()) 110 | { 111 | std::getline(in_segment, linestr); 112 | linestr = strutils::trim(linestr); 113 | if(linestr=="") continue; 114 | //vector line_splits = strutils::split(linestr, '\t'); 115 | //if(line_splits.size()<3) break; 116 | prof.element_id.push_back(linestr); 117 | } 118 | } 119 | 120 | 121 | in_segment.close(); 122 | 123 | 124 | return prof; 125 | 126 | } 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST-GBGC/SRC/profile.cpp: -------------------------------------------------------------------------------- 1 | #include "profile.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "utils.h" 13 | 14 | using namespace std; 15 | 16 | // load the phylogenetic profile 17 | PhyloProf LoadPhyloProfiles(string profile_path, string segment_path, string segment_ID) 18 | { 19 | PhyloProf prof; 20 | vector seqs; 21 | prof.G=0; 22 | string linestr; 23 | ifstream in_prof(profile_path.c_str()); 24 | 25 | if (!in_prof) 26 | { 27 | cerr << "(Error. 
Cannot open the phylogenetic profile input file: " << profile_path << ")" << endl; 28 | exit(1); 29 | } 30 | 31 | 32 | 33 | 34 | // count the num of species, base pairs and load the profiles 35 | string wholeline=""; 36 | while(!in_prof.eof()) 37 | { 38 | std::getline(in_prof, linestr); 39 | linestr = strutils::trim(linestr); 40 | if(!strncmp(linestr.c_str(),">", 1)) { 41 | string tmp = strutils::trim(linestr.substr(1)); 42 | prof.species_names.push_back(tmp); 43 | 44 | if(prof.G==0) prof.G = wholeline.length(); 45 | else assert(wholeline.length() == prof.G); 46 | 47 | if(prof.G>0) { 48 | wholeline =strutils::ToLowerCase(wholeline); 49 | prof.X.push_back(wholeline); 50 | } 51 | wholeline = ""; 52 | } 53 | else { 54 | 55 | wholeline += linestr; 56 | } 57 | } 58 | 59 | if(prof.G==0) prof.G = wholeline.length(); 60 | wholeline =strutils::ToLowerCase(wholeline); 61 | prof.X.push_back(wholeline); 62 | 63 | //cout < line_splits = strutils::split(linestr, '\t'); 84 | if(line_splits.size()<3) break; 85 | prof.element_names.push_back(line_splits[0]); 86 | double* tmp = new double[3]; 87 | tmp[0] = atoi(line_splits[1].c_str()); 88 | tmp[1] = atoi(line_splits[2].c_str()); 89 | //tmp[2] = atof(line_splits[4].c_str()); //add null scale!! 90 | prof.element_pos.push_back(tmp); 91 | } 92 | prof.C = prof.element_names.size(); 93 | 94 | in_segment.close(); 95 | 96 | 97 | // read in ID 98 | if(segment_ID!="") 99 | { 100 | string segment_path2 = segment_ID + ".txt"; 101 | in_segment.open(segment_path2.c_str()); 102 | 103 | if (!in_segment) 104 | { 105 | cerr << "(Error. 
Cannot open the segment input txt file: " << segment_path2 << ")" << endl; 106 | exit(1); 107 | } 108 | 109 | while(!in_segment.eof()) 110 | { 111 | std::getline(in_segment, linestr); 112 | linestr = strutils::trim(linestr); 113 | if(linestr=="") continue; 114 | //vector line_splits = strutils::split(linestr, '\t'); 115 | //if(line_splits.size()<3) break; 116 | prof.element_id.push_back(linestr); 117 | } 118 | } 119 | 120 | 121 | in_segment.close(); 122 | 123 | 124 | return prof; 125 | 126 | } 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/PhyloAcc-GT/profile.cpp: -------------------------------------------------------------------------------- 1 | #include "profile.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "utils.h" 13 | 14 | using namespace std; 15 | 16 | // load the phylogenetic profile 17 | PhyloProf LoadPhyloProfiles(string profile_path, string segment_path, string segment_ID) 18 | { 19 | PhyloProf prof; 20 | vector seqs; 21 | prof.G=0; 22 | string linestr; 23 | ifstream in_prof(profile_path.c_str()); 24 | 25 | if (!in_prof) 26 | { 27 | cerr << "(Error. Cannot open the phylogenetic profile input file: " << profile_path << ")" << endl; 28 | exit(1); 29 | } 30 | 31 | // count the num of species, base pairs and load the profiles 32 | string wholeline=""; 33 | while(!in_prof.eof()) 34 | { 35 | std::getline(in_prof, linestr); 36 | linestr = strutils::trim(linestr); 37 | if(!strncmp(linestr.c_str(),">", 1)) { //return 0 if 1st char of linestr is >. 
38 | string tmp = strutils::trim(linestr.substr(1)); 39 | prof.species_names.push_back(tmp); 40 | 41 | if(prof.G==0) prof.G = wholeline.length(); 42 | else assert(wholeline.length() == prof.G); 43 | 44 | if(prof.G>0) { 45 | wholeline =strutils::ToLowerCase(wholeline); 46 | prof.X.push_back(wholeline); 47 | } 48 | wholeline = ""; 49 | } 50 | else { 51 | 52 | wholeline += linestr; 53 | } 54 | } 55 | 56 | if(prof.G==0) prof.G = wholeline.length(); 57 | wholeline =strutils::ToLowerCase(wholeline); 58 | prof.X.push_back(wholeline); 59 | 60 | prof.S = prof.species_names.size(); 61 | 62 | //read in segment size and specific scaling factor 63 | ifstream in_segment(segment_path.c_str()); 64 | 65 | if (!in_segment) 66 | { 67 | cerr << "(Error. Cannot open the segment input file: " << segment_path << ")" << endl; 68 | exit(1); 69 | } 70 | 71 | while(!in_segment.eof()) 72 | { 73 | std::getline(in_segment, linestr); 74 | linestr = strutils::trim(linestr); 75 | vector line_splits = strutils::split(linestr, '\t'); 76 | if(line_splits.size()<3) break; 77 | prof.element_names.push_back(line_splits[0]); 78 | double* tmp = new double[3]; 79 | //vector tmp=vector(2,0.0); 80 | tmp[0] = atoi(line_splits[1].c_str()); 81 | tmp[1] = atoi(line_splits[2].c_str()); 82 | //tmp[2] = atof(line_splits[4].c_str()); //add null scale!! 83 | prof.element_pos.push_back(tmp); 84 | if(line_splits.size() >=8) prof.element_tree.push_back(line_splits[7]); 85 | } 86 | prof.C = prof.element_names.size(); 87 | 88 | in_segment.close(); 89 | 90 | // read in ID 91 | if (segment_ID != "") 92 | { 93 | string segment_path2 = segment_ID + ".txt"; 94 | in_segment.open(segment_path2.c_str()); 95 | 96 | if (!in_segment) 97 | { 98 | cerr << "(Error. 
Cannot open the segment input txt file: " << segment_path2 << ")" << endl; 99 | exit(1); 100 | } 101 | 102 | while (!in_segment.eof()) 103 | { 104 | std::getline(in_segment, linestr); 105 | linestr = strutils::trim(linestr); 106 | if (linestr == "") 107 | continue; 108 | //vector line_splits = strutils::split(linestr, '\t'); 109 | //if(line_splits.size()<3) break; 110 | prof.element_id.push_back(linestr); 111 | } 112 | } 113 | 114 | in_segment.close(); 115 | 116 | return prof; 117 | } 118 | 119 | 120 | -------------------------------------------------------------------------------- /docs/v1/READMEv1_OUTPUT.md: -------------------------------------------------------------------------------- 1 | # Format of output files 2 | *prefix*\_rate_postZ\_M\*.txt: posterior median of conserved rate, accelerated rate, probability of gain and loss conservation ($\alpha = P(Z=0\rightarrow Z=1)$ and $\beta = P(Z=1\rightarrow Z=2)$), and posterior probability of being in each latent state on each branch for each element. Columns in the file are: 3 | 1. element No. which is the order of the element in the input bed file starting from zero 4 | 2. posterior median of accelerated substitution rate 5 | 3. posterior median of conserved substitution rate 6 | 4. posterior median of $\alpha$ 7 | 5. posterior median of $\beta$ 8 | 6. posterior median of $\beta_2 = P(Z=0\rightarrow Z=2)$, which is 0 in current implementation 9 | 7. from the 7th column and on, we have four columns for each branch:\*\_0 indicates whether it's "missing"; \*\_1, \*\_2 and \*\_3 are the posterior probability in the background, conserved and accelerated state respectively. The algorithm will prune "missing" branches within outgroup and set the latent states of them to -1 so that the three posterior probabilities are all zero. Column names indicate the branch right above the node and the order of the branch is the same as that in *prefix*\_elem_Z.txt. 
10 | 11 | If sampling hyperparameters, the outputs under different hyperparameters will be concatenated to this file. If an element is filtered because of too many alignment gaps (for the criteria, see [README_PARAMETER.md](README_PARAMETER.md)), it will not appear in this file. 12 | 13 | *prefix*\_elem_lik.txt: marginal log-likelihood for all models (integrating out parameters and latent states). The columns are: 14 | * *No.*: The order of the element in the input bed file starting from zero 15 | * *ID*: The element name as in the input bed file 16 | * *loglik_Null*: marginal log-likelihood under the null model 17 | * *loglik_Acc*: marginal log-likelihood under the accelerated model 18 | * *loglik_Full*: marginal log-likelihood under the full model 19 | * *logBF1*: log Bayes factor between the null and accelerated models 20 | * *logBF2*: log Bayes factor between the accelerated and full models 21 | * *loglik_Max_M0, loglik_Max_M1, loglik_Max_M2*: Maximum joint likelihood of Y (observed sequences), r (substitution rates) given Z (latent states) ($\max_{r, Z} P(Y, r|Z)$) under the null ($M_0$), accelerated ($M_1$) and full ($M_2$) models respectively. 22 | 23 | If updating hyperparameters, the algorithm will only compute the log-likelihood under the full model. When the hyperparameters are updated, the log-likelihoods for each element will be recomputed and concatenated to this file. If an element is filtered because of too many alignment gaps (for the criteria, see [README_PARAMETER.md](README_PARAMETER.md)), all the columns will be zero. If the MCMC algorithm is trapped at some local modes or some other numerical errors occur for some elements, it will return NA. 24 | 25 | 26 | *prefix*\_M\*_elem_Z.txt: maximum log-likelihood configurations of the latent state Z under the null, accelerated and full models, with Z = -1 (if the element is 'missing' in the branches of outgroup species), 0 (background), 1 (conserved), 2 (accelerated); each row is an element, ordered the same as in the input bed file. 
This file is output only when hyperparameters are not sampled. If an element is filtered because of too many alignment gaps (for the criteria, see [README_PARAMETER.md](README_PARAMETER.md)), all the columns will be zero. 27 | 28 | *prefix*\_hyper.txt: hyperparameters at each iteration, only meaningful if adopting the full Bayesian approach by sampling hyperparameters. Columns are: 29 | * *iter*: current iteration 30 | * *nprior_a, nprior_b*: the shape and scale hyperparameters of the gamma prior of the accelerated substitution rate 31 | * *cprior_a, cprior_b*: the shape and scale hyperparameters of the gamma prior of the conserved substitution rate 32 | * *prior_lrate_a,prior_lrate_b*: hyperparameters of the beta prior for the probability of loss of conservation 33 | * *prior_grate_a,prior_grate_b*: hyperparameters of the beta prior for the probability of gain of conservation 34 | 35 | *prefix*\_mcmc_trace_M\_[0-2]_\*.txt: This file is output if verbose=T; it contains the trace of MCMC samples in each iteration for an element. Each row is one iteration and the columns are: log-likelihood ($P(Y|Z, r)$), accelerated substitution rate, conserved substitution rate, probability of gain and loss of conservation, and latent state Z of each branch. If sampling hyperparameters, MCMC samples will be concatenated together under different hyperparameters. In the file name, [0-2] stands for the null, accelerated and full model respectively, and \* is the element No. 36 | 37 | The output files from the extended version including gBGC are slightly different. *prefix*\_rate_postZ\_\*.txt also contains the posterior mode of the gBGC coefficient (*gBC* column) and the posterior probability of having a gBGC effect on each branch (*\*_B* columns). *prefix*_1_elem_Z.txt has the latent gBGC state on each branch in the first columns. 
38 | -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc_lib/output.py: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | # Output functions for the PhyloAcc interface 3 | # Gregg Thomas 4 | ############################################################################# 5 | 6 | import os 7 | import phyloacc_lib.core as PC 8 | import phyloacc_lib.tree as TREE 9 | 10 | ############################################################################# 11 | 12 | def writeAlnStats(globs): 13 | # Writes alignment stats to output file 14 | 15 | step = "Writing: " + globs['alnstatsfile']; 16 | step_start_time = PC.report_step(globs, step, False, "In progress..."); 17 | # Status updated 18 | 19 | globs['alnstatsfile'] = os.path.join(globs['outdir'], globs['alnstatsfile']); 20 | # Updates the alignment stats file to include the output directory 21 | 22 | loci_sorted = sorted(globs['aln-stats']); 23 | # Sorts the loci 24 | 25 | with open(globs['alnstatsfile'], "w") as outfile: 26 | # Open the alignment stats file for writing 27 | 28 | first = True; 29 | # First flag 30 | 31 | for locus in loci_sorted: 32 | # Write every locus 33 | 34 | if first: 35 | # For the first locus, extract and write the headers 36 | 37 | keys = list(globs['aln-stats'][locus].keys()) 38 | # The headers will be the keys in the locus dict 39 | 40 | headers = ['locus'] + keys; 41 | # Add 'locus' to the headers for the locus ID 42 | 43 | outfile.write(",".join(headers) + "\n"); 44 | first = False; 45 | # Write the headers and set the first flag to False 46 | ### 47 | 48 | outline = [locus] + [ str(globs['aln-stats'][locus][key]) for key in keys ]; 49 | outfile.write(",".join(outline) + "\n"); 50 | # Extract the stats for the current locus and write to the file 51 | ## End locus loop 52 | ## Close file 53 | 54 | if 
globs['no-inf-sites-loci']: 55 | globs['no-inf-loci-file'] = os.path.join(globs['outdir'], globs['no-inf-loci-file']); 56 | with open(globs['no-inf-loci-file'], "w") as outfile: 57 | for locus in globs['no-inf-sites-loci']: 58 | outfile.write(locus + "\n"); 59 | # Write the loci with no informative sites to a file 60 | 61 | step_start_time = PC.report_step(globs, step, step_start_time, "Success: align stats written"); 62 | globs['aln-stats-written'] = True; 63 | # Status update 64 | 65 | return globs; 66 | 67 | ############################################################################# 68 | 69 | def writeSCFStats(globs): 70 | 71 | step = "Writing: " + globs['scfstatsfile']; 72 | step_start_time = PC.report_step(globs, step, False, "In progress..."); 73 | # Status update 74 | 75 | headers = ["node","variable-sites","decisive-sites","concordant-sites","quartet-scf-sum","num-quartets","total-quartets","avg-quartet-scf"]; 76 | # The headers to include in the output, must be keys from the globs['scf'] dict 77 | 78 | globs['scfstatsfile'] = os.path.join(globs['outdir'], globs['scfstatsfile']); 79 | # Add the output directory to the scfstatsfile name 80 | 81 | with open(globs['scfstatsfile'], "w") as outfile: 82 | # Open the scf stats file for writing 83 | 84 | outfile.write(",".join(headers) + "\n"); 85 | # Write the headers to the output 86 | 87 | for node in globs['scf']: 88 | # Write stats for every node 89 | 90 | globs['scf'][node]['num-quartets'] = len(globs['quartets'][node]) 91 | # Retrieve the number of quartets for this node from the global quartets dict 92 | 93 | outline = [node] + [ str(globs['scf'][node][header]) for header in headers if header != "node" ]; 94 | outfile.write(",".join(outline) + "\n"); 95 | # Extract the stats for the current node and write to the file 96 | ## End node loop 97 | ## Close file 98 | 99 | step_start_time = PC.report_step(globs, step, step_start_time, "Success: sCF stats written"); 100 | globs['scf-stats-written'] = True; 101 | 
# Status update 102 | 103 | #################### 104 | 105 | step = "Writing: " + globs['scftreefile']; 106 | step_start_time = PC.report_step(globs, step, False, "In progress..."); 107 | # Status update 108 | 109 | globs['scftreefile'] = os.path.join(globs['outdir'], globs['scftreefile']); 110 | # Add the output directory to the scftreefile name 111 | 112 | globs['scf-labeled-tree'] = globs['labeled-tree']; 113 | # Get the labeled input tree from treeParse 114 | 115 | globs['scf-labeled-tree'] = TREE.addBranchLength(globs['scf-labeled-tree'], globs['tree-dict']); 116 | # Add the branche lengths back onto the tree 117 | 118 | for node in globs['scf']: 119 | globs['scf-labeled-tree'] = globs['scf-labeled-tree'].replace(node, node + "_" + str(round(globs['scf'][node]['avg-quartet-scf'], 2))); 120 | # For every node in the tree, add the averaged scf value over all loci to the label 121 | 122 | with open(globs['scftreefile'], "w") as outfile: 123 | outfile.write(globs['scf-labeled-tree']); 124 | # Write the scf labeled tree to a file 125 | 126 | step_start_time = PC.report_step(globs, step, step_start_time, "Success: sCF tree written"); 127 | globs['scf-tree-written'] = True; 128 | # Status update 129 | 130 | return globs; 131 | 132 | ############################################################################# 133 | 134 | -------------------------------------------------------------------------------- /docs/v1/READMEv1_PARAMETER.md: -------------------------------------------------------------------------------- 1 | # Parameter file 2 | In the parameter file, each parameter is specified in a line with the parameter name at the beginning followed by the parameter value. The parameters are: 3 | * **Input and output**: 4 | * *PHYTREE_FILE*: the path of phylogeny (.mod) 5 | * *SEG_FILE*: the path of bed file for genomic regions. At least 3 columns in the bed file. You could have more columns in this file but the program will only read in the first 3 columns. 
The first column is the element name or ID (different from a usual bed file), the second and third columns are the starting and ending positions of each element (in the coordinate of the whole multiple alignment). The program assumes that the alignment file concatenates all the elements together and will only use the second and third columns in the bed file. If concatenating multiple chromosomes, the coordinate of elements on the current chromosome should not start from zero but should add to the previous chromosome. The program will internally generate a No. for each element which is the order in the input bed file, and it will use No. in the outputs and plot functions. 6 | * *ALIGN_FILE*: the path of multiple alignment file (.fasta). The name of the species in the alignment file has to be the same as in the phylogenetic tree! 7 | * *RESULT_FOLDER*: the output folder. The folder should exist. 8 | * *PREFIX*: the prefix for output files (default: test). 9 | * *ID_FILE* (optional): only compute elements in this file. (The element is numbered by its order in the input bed file starting from 0). If not specified, the program will compute all elements in the input file. 10 | * *VERBOSE*: 0 or 1. If it's 1, the algorithm will output some intermediate results to console and MCMC trace for each element (default: 0). Should be set to 0 if computing many elements, otherwise the output file is too large. 11 | 12 | * **Specify species on the phylogeny**: 13 | * *TARGETSPECIES*: species of interest. E.g. species that potentially lost conservation or with convergent phenotype changes. 14 | * *OUTGROUP*: outgroup species of the phylogeny. These species are not considered to be accelerated in our model. 15 | * *CONSERVE*: species assumed to be mostly conserved. The algorithm will filter out elements "missing" in more than *CONSERVE_PROP* of the conserved species. Input conserved species should exclude target species. 
16 | * *CONSERVE_PROP*: filter out elements "missing" in more than *CONSERVE_PROP* of the conserved species (default: 0.8). 17 | * *PRUNE_TREE*: Whether to prune "missing" branches besides outgroup. The "missing" branches inside outgroup are always pruned. (default: false, no prune). 18 | 19 | * **Alignment Gaps and Filtering**: 20 | * *GAPCHAR*: the character for alignment gaps. (default: -). Should be one char. 21 | * *GAP_PROP*: if the sequence alignment of a species contains gaps for more than *GAP_PROP* of the whole element, then we say that the element is "missing" in that species (default: 0.8). 22 | * *TRIM_GAP_PERCENT*: Trim the loci with indels or unknown base pairs in more than *TRIM_GAP_PERCENT* of all species. (default: 1, no trim). 23 | * *CONSTOMIS*: the probability of "missing" under the conserved state. Should be small (default: 0.01). 24 | * *MIN_LEN*: The trimmed element with length less than *MIN_LEN* will be filtered out. (default: 50). 25 | 26 | * **(Hyper)Parameters and initial values**: 27 | * *INIT_GRATE*: the initial transition probability from background to conserved state (default: 0.5). 28 | * *INIT_LRATE*: the initial transition probability from conserved to accelerated state (default: 0.3). 29 | * *HYPER_LRATE_A, HYPER_LRATE_B*: the parameters for the beta prior of loss probability (default: 1,9). 30 | * *HYPER_GRATE_A, HYPER_GRATE_B*: the parameters for the beta prior of gain probability (default: 3,1). 31 | * *INIT_CONSERVE_RATE*: the initial conserved rate (default: 0.5). 32 | * *INIT_ACCE_RATE*: the initial accelerated rate (default: 1). 33 | * *CONSERVE_PRIOR_A*: the shape parameter for the gamma prior of conserved rate (default: 5). 34 | * *CONSERVE_PRIOR_B*: the scale parameter for the gamma prior of conserved rate (default: 0.04). 35 | * *ACCE_PRIOR_A*: the shape parameter for the gamma prior of accelerated rate (default: 10). 36 | * *ACCE_PRIOR_B*: the scale parameter for the gamma prior of accelerated rate (default: 0.2). 
37 | * *RATE_OPT*: to avoid label switching of latent state, we provide options to restrict the range of accelerated rate and conserved rate. 0: no restriction on rates; 1: have lower and upper bound on accelerated and conserved rates respectively; 2: restrict accelerated rate to be larger than conserved rate. (default: 1) 38 | * *NLB*: lower bound for the accelerated rate. Only valid if *RATE_OPT* = 1. Default: 0.6. 39 | * *CUB*: upper bound for the conserved rate. Only valid if *RATE_OPT* = 1. Default: 1. 40 | 41 | * **Control for MCMC and number of threads**: 42 | * *BURNIN*: number of initial iterations to discard before equilibrium of the chain (default: 200). Should set to be larger. 43 | * *MCMC*: number of MCMC iterations (default: 800). Should set to be larger. 44 | * *ADAPT_FREQ*: number of iterations to recompute acceptance rate of Metropolis-Hastings for adaptively adjusting the proposal variances for substitution rates (default: 500). 45 | * *SEED*: seed for random sampling (default: 5) 46 | * *SAMPLE_HYPER*: whether to sample hyperparameters. 0, fix hyperparameters; 1, sample (default: 0). Sampling hyperparameters is time-consuming, and not recommended. If sampling hyperparameters, the algorithm will only output the posterior of Z (latent state of each branch) under full model. 47 | * *CHAIN*: Numer of iterations to sample hyper parameters. If not sampling hyperparameter, set it to 1 (default: 1). 48 | * *NUM_THREAD*: Number of threads to run the algorithm (default: 1). 49 | 50 | -------------------------------------------------------------------------------- /src/PhyloAcc-GT/genetree.hpp: -------------------------------------------------------------------------------- 1 | // genetree.hpp 2 | // PhyloAcc_init3-1 3 | // 4 | // Created by hzr on 2019/5/19. 5 | // Copyright © 2019 hzr. All rights reserved. 
6 | // 7 | 8 | #ifndef genetree_h 9 | #define genetree_h 10 | 11 | #include "bpp.hpp" 12 | 13 | using namespace std; 14 | using namespace arma; 15 | 16 | class GTree 17 | { 18 | private: 19 | int N; 20 | int S; 21 | //int seed; 22 | gsl_rng * RNG; 23 | 24 | public: 25 | // gene tree 26 | int GG; 27 | int root; 28 | int (*children_gene)[2]; 29 | int *parent_gene; 30 | bool *missing_gene; 31 | double *heights_gene; 32 | vector gene_nodes; 33 | vector var_br_node; 34 | vector prob_var_node; //sampling prob of var_br_node. If not uniform, then will be prop to 1/pa(var_br_node)_len; 35 | int* childID_gene; // record the child ID: 0 or 1 36 | 37 | vector> temp_coal; // record the coalescents for each species 38 | 39 | vector> parent_gene2; // record gene tree plus cross species boundary 40 | vector> > lambda; // N * gene_tree_node * mat 41 | vector < map >> Tg; //current history during each speciation, N*gene_tree_node* GG 42 | 43 | 44 | //GTree(){} 45 | GTree(int _N, int _S) 46 | { 47 | N = _N; 48 | S = _S; 49 | 50 | lambda = vector< map> >(N, map>()); 51 | Tg = vector< map >> (N, map>()); 52 | 53 | // gene tree 54 | children_gene = new int[N][2]; 55 | parent_gene = new int[N]; 56 | heights_gene = new double[N]; 57 | childID_gene = new int[N]; 58 | missing_gene = new bool[N]; 59 | 60 | parent_gene2 = vector>(N); 61 | temp_coal = vector>(N); 62 | } 63 | 64 | GTree(int _N, int _GG, int _S, gsl_rng* _RNG) 65 | { 66 | GG = _GG; 67 | N = _N; 68 | S = _S; 69 | RNG = _RNG; 70 | 71 | 72 | lambda = vector< map> >(N, map>()); 73 | Tg = vector< map >> (N, map>()); 74 | 75 | // gene tree 76 | children_gene = new int[N][2]; 77 | parent_gene = new int[N]; 78 | heights_gene = new double[N]; 79 | childID_gene = new int[N]; 80 | missing_gene = new bool[N]; 81 | 82 | parent_gene2 = vector>(N); 83 | temp_coal = vector>(N); 84 | } 85 | 86 | ~GTree(){ 87 | delete [] children_gene; 88 | delete [] parent_gene; 89 | delete [] heights_gene; 90 | delete [] childID_gene; 91 | delete [] 
missing_gene; 92 | } 93 | 94 | 95 | void copyto(int len, GTree & gtree); 96 | void copyfrom(int len, GTree & gtree, int numbase); 97 | 98 | //void initTree(string tree_str, vector & missing, set & upper,BPP & bpp); 99 | void initTree(string tree_str, BPP & bpp); 100 | void initTree(vector & missing, set & upper, BPP & bpp); 101 | void initTree_Sptop(vector & missing, set & upper, BPP & bpp); 102 | void InitTg(int len, BPP & bpp, vector & nodes, int start = 0); 103 | double priorTree(BPP & bpp); 104 | void printTree(int s, BPP& bpp, std::stringstream & buffer); 105 | void printTree2(int s, int & currentS, vector & parent_relabel, vector & branchlen); 106 | bool Sample_tree(int indicator, BPP & bpp, vector & Z, double n_rate, double c_rate, double consToMis, double nconsToMis, vector lens, vec & loglik, vec & logp_Z, mat & c_eigenvec, mat & c_eigenval, mat & c_eigeninv, vec& log_pi); 107 | 108 | bool Sample_BranchLen(double delta, int gnode, int indicator, BPP & bpp, vector & Z, double n_rate, double c_rate, double consToMis, double nconsToMis, vector lens, vec & loglik, vec & logp_Z, mat& c_eigenvec, mat& c_eigenval, mat& c_eigeninv, double & curGTprior, vec& log_pi); 109 | void Graft(int branch, int ss, int target, double coal, BPP& bpp, double rate, mat & c_eigenvec, mat & c_eigenval, mat & c_eigeninv); 110 | 111 | int Remove_branch(int branch, int ss, BPP & bpp); 112 | void getGeneNodes(int s, int & minss); 113 | double get_logpZ(vector & Z, double consToMis, double nconsToMis); 114 | double get_logpZ1(BPP & bpp, vector & Z, double consToMis, double nconsToMis); 115 | 116 | void Update_Lambda(int start_ss, int branchsib, BPP& bpp, vector & Z, double n_rate, double c_rate, mat & c_eigenvec, mat & c_eigenval, mat & c_eigeninv); 117 | void Update_Tg(int g, vector visited, BPP& bpp, bool tosample,vector & nodes, vector & Z, double n_rate, double c_rate, vec log_pi, mat & c_eigenvec, mat & c_eigenval, mat & c_eigeninv, int start =0); 118 | void Simulate_Tg(int len, 
BPP& bpp, vector & nodes, vector & Z, double n_rate, double c_rate, vec log_pi, mat & c_eigenvec, mat & c_eigenval, mat & c_eigeninv, int start=0); 119 | 120 | double CompareTg(vector Tg1, vector Tg2, BPP & bpp); 121 | 122 | bool Sample_tree2(int branch, int indicator, BPP & bpp, vector & Z, double n_rate, double c_rate, double consToMis, double nconsToMis, vector lens, vec & loglik, vec & logp_Z, mat& c_eigenvec, mat& c_eigenval, mat& c_eigeninv, vec& log_pi); 123 | //bool Sample_tree2_print(int branch, int indicator, BPP & bpp, vector & Z, double n_rate, double c_rate, double consToMis, double nconsToMis, vector lens, vec & loglik, vec & logp_Z, mat& c_eigenvec, mat& c_eigenval, mat& c_eigeninv, vec& log_pi); 124 | 125 | void printSptree(BPP& bpp, int l); 126 | int findSp(int gnode, BPP& bpp); 127 | }; 128 | 129 | #endif /* genetree_h */ 130 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Version 2.4.3, 03.25.2025 2 | - Fixed how tree labels are read in GT 3 | - Switched from bash script to Python functions to label ASTRAL tree internal nodes, which required adding the various hidden `-lt` options to `phyloacc.py` 4 | 5 | Version 2.4.0, 09.26.2024 6 | - Added `--local` option to generate a snakemake command that does not use a cluster, intended for testing purposes only. 7 | - Fixed the `--version`, `--quiet`, and `--options` flags to work with the config file. 8 | - Fixed spacing when string is longer than the pad. 9 | - Added `--testcmd` to also print a direct PhyloAcc command at the end of the interface for testing purposes. 
10 | 11 | Version 2.3.4, 09.24.2024 12 | - Reverted the element ID indexing in the interface from 1 to 0 to match the C++ code 13 | - Changed how the interface uses regex to read trees so it uses `r`aw strings 14 | - Fixed bug in which gene trees were still being inferred with the ST model 15 | 16 | Version 2.3.0 17 | - Check for infs in `phyloacc_post.py` 18 | - Added error checking for unlabeled trees and alignments with labels that don't match the tree 19 | - Added capability to specify input options in a config file with `--config` 20 | - Provided template config file (`phyloacc-cfg.yml`) 21 | - Added the `--filter` option to filter out alignments with too many missing sites 22 | - Added hidden `--debug-aln` option to stop the program after reading the alignments 23 | 24 | Version 2.2.0, 04.13.2023 25 | - Added `--nophyloacc` option that prevents execution of the PhyloAcc rules in the snakemake workflow, useful for debugging or just running `--theta` 26 | - Internally, switched the number of informative sites required for a locus to be used in `--theta` estimation to be a param, maybe user option later 27 | - Added `--dollo` option which sets the PhyloAcc `HYPER_LRATE2_A` parameter to 0 to use the Dollo assumption from the original model 28 | - Implemented (better) handling of Warnings 29 | - Added hidden `--dev` option to automatically set paths when testing things locally 30 | - Added hidden `-inf-frac-theta` option to control the fraction of informative sites needed for a locus to be used in `--theta` estimation (for development) 31 | - Fixed bug in which options that require floats as input (e.g. 
`-scf`) wouldn't be set if the input value was 0.0 32 | 33 | 04.06.2023 34 | - Fixed ids dir when `--theta` is set 35 | - Added `-iqtree-path` and `-coal-cmd` options to specify programs to estimate branch lengths in coalescent units when `--theta` is set 36 | - Fixed bug where `--labeltree` added NA labels to nodes in addition to <#> nodes when input tree is unlabeled 37 | - Added error message if tree in .mod file does not have labels on the internal nodes telling the user that PhyloAcc requires a labeled tree as input 38 | - `--labeltree` now also prints a tree labeled in the format of [descendant 1]-[descendant 2], where descendants 1 and 2 are the first of the alphabetically sorted tips that descend from that node 39 | 40 | Version 1.1.0, 02.11.2022 41 | - Combining PhyloAcc and Python interface repos to facilitate codebase merging in the future and to develop the conda package 42 | - Moved all data files to `Hu-etal-2019` to split into a separate repo later 43 | - Updated Makefile to be used with conda build; the original Makefile can still be found in the `src/PhyloAcc-ST/` folder if we ever need to build from source that way 44 | - Added `meta.yaml`, `build.sh`, and `build.bat` for conda building which will be split out later to our fork of bioconda 45 | - Changed name of interface from `phyloacc_interface.py` to `phyloacc.py` and changed name of `PhyloAcc` binary to `PhyloAcc-ST` 46 | - Changed default path for the PhyloAcc binary within the interface to point to `PhyloAcc-ST` to align with the name change above 47 | - Changed license from MIT to GPL3 48 | - Moved `V2_GBGC` to the `src/PhyloAcc-ST/` folder 49 | 50 | 02.15.2022, interface 51 | - Fixed coloring of branches to include internal branches in the summary page 52 | - Added group input specification by internal nodes 53 | - Added option to only re-run the summary and plot generation and ignore/don't write or overwrite the job files 54 | - Fixed issue where log info wasn't actually written to the 
log 55 | - Added an option to append to a previous log rather than overwrite 56 | 57 | 02.16.2022, interface 58 | - Added a check for duplicate labels in input groups 59 | - Removed `--plot` option and changed `--plotonly` option to `--summarize`; plots are now always generated and `--summarize` indicates job files should not be overwritten or generated 60 | - Fixed error codes in opt_parse 61 | 62 | 02.16.2022 63 | - Moved the `V2_GBGC` source from within the `PhyloAcc-ST` dir to its own dir in src called `PhyloAcc-ST-GBGC` 64 | - Removed the simulation data from the `GBGC` folder and put it in the `Hu-etal-2019` folder under the `GBGC` subfolder 65 | - Added `test` folder with original test data from the README (500 simulated elements on the ratite tree, but with the ID file only 10 elements are run) 66 | 67 | 02.18.2022 68 | - Split the `Hu-etal-2019` subfolder into its own repository 69 | - Move the test data to a new repository, `PhyloAcc-test-data` 70 | 71 | 03.07.2022 72 | - Remove `rcParams["xtick.labelcolor"]` and `rcParams["ytick.labelcolor"]` since they don't seem to work for `matplotlib-base` 73 | - Remove `--plot` option from post script; plots are now generated by default 74 | - Change version string 75 | 76 | 07.15.2022 77 | - Implemented tree parsing as a class rather than functions 78 | - Refactored sCF code 79 | - Organized option parsing and main code 80 | - Added `-scf` and `-s` options for sCF user cutoffs 81 | - Updated README and moved version 1 READMEs to `docs/` 82 | 83 | 10.21.2022 84 | - More testing of tree class 85 | - Added `thin` as a user option 86 | - Split the HTML functions out of `plot.py` into `html.py` 87 | - Added new plots and output tables for `phyloacc_post.py` 88 | - Fixed default path of config file in PhyloAcc-GT `main.cpp`, now exits rather than trying a non-existant file 89 | -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc_lib/post_params.py: 
-------------------------------------------------------------------------------- 1 | ############################################################################# 2 | # This file holds some global variables for some of the input options. 3 | # These global parameters should be read only -- they are not modified anywhere 4 | # else in the code except when reading the input options. 5 | ############################################################################# 6 | 7 | import sys 8 | import os 9 | import timeit 10 | from collections import defaultdict 11 | import phyloacc_lib.core as CORE 12 | 13 | ############################################################################# 14 | 15 | class StrictDict(dict): 16 | # This prevents additional keys from being added to the global params dict in 17 | # any other part of the code, just to help me limit it's scope 18 | # https://stackoverflow.com/questions/32258706/how-to-prevent-key-creation-through-dkey-val 19 | def __setitem__(self, key, value): 20 | if key not in self: 21 | raise KeyError("{} is not a legal key of this StrictDict".format(repr(key))); 22 | dict.__setitem__(self, key, value); 23 | 24 | ############################################################################# 25 | 26 | def init(): 27 | globs_init = { 28 | 29 | 'starttime' : timeit.default_timer(), 30 | 'startdatetime' : CORE.getOutTime(), 31 | 'startdatetimenice' : CORE.getRunTimeNice(), 32 | # Meta info 33 | 34 | 'pyver' : ".".join(map(str, sys.version_info[:3])), 35 | # System info 36 | 37 | 'call' : "", 38 | # Script call info 39 | 40 | 'interface-run-dir' : False, 41 | 'interface-logfile' : False, 42 | 'phyloacc-out-dir' : False, 43 | 'id-file' : False, 44 | # Input with concatenated alignment and partitions by bed file 45 | 46 | 'bf1-cutoff' : 5, 47 | 'bf2-cutoff' : 5, 48 | 'bf3-cutoff' : 5, 49 | 50 | 'bf1' : [], 51 | 'bf2' : [], 52 | 'bf3' : [], 53 | 54 | 'all-loci' : [], 55 | 56 | 'bf1-loci' : [], 57 | 'bf2-loci' : [], 58 | 'bf3-loci' : [], 59 
| # Bayes Factor summaries 60 | 61 | 'm2-per-locus' : defaultdict(int), 62 | 'm2-per-lineage' : defaultdict(int), 63 | 64 | 'locus-stats' : {}, 65 | 'm0-loci' : [], 66 | 'm1-loci' : [], 67 | 'm2-loci' : [], 68 | 69 | 'summary' : {}, 70 | 71 | 'complete-batches' : [], 72 | 'complete-batches-st' : [], 73 | 'complete-batches-gt' : [], 74 | 'incomplete-batches' : [], 75 | 'incomplete-batches-st' : [], 76 | 'incomplete-batches-gt' : [], 77 | 'complete-loci' : [], 78 | 'accelerated-loci' : [], 79 | 'batch-size' : 0, 80 | 'procs-per-batch' : 0, 81 | 'batch-runtimes' : [], 82 | 83 | 'outdir' : '', 84 | 'results-dir' : '', 85 | 'run-name' : 'phyloacc-post', 86 | 'logfilename' : 'phyloacc-post.errlog', 87 | 'tmpdir' : 'System default.', 88 | 'overwrite' : False, 89 | # I/O options 90 | 91 | 'plot' : True, 92 | # Option to output plots/html 93 | 94 | 'tree-string' : False, 95 | 'st' : False, 96 | 'st-rev-labels' : {}, 97 | 98 | 'tree-data-type' : "class", 99 | 'tree-dict' : False, 100 | 'labeled-tree' : False, 101 | 'root-node' : False, 102 | 'tree-tips' : False, 103 | # Tree variables 104 | 105 | 'plot-dir' : '', 106 | 'input-tree-plot-file' : 'input-species-tree.png', 107 | 'bf-dist-file' : 'bf-dists.png', 108 | 'bf1-dist-file' : 'bf1-hist.png', 109 | 'bf2-dist-file' : 'bf2-hist.png', 110 | 'bf3-dist-file' : 'bf3-hist.png', 111 | 'bf1-bf2-plot-file' : 'bf1-v-bf2.png', 112 | 'm2-locus-dist-file' : 'accelerated-lineages-per-locus-m2.png', 113 | 'm2-lineage-dist-file' : 'accelerated-loci-per-lineage-m2.png', 114 | 115 | 'inf-sites-frac-plot-file' : 'informative-sites-frac-hist.png', 116 | 'var-inf-sites-plot-file' : 'variable-informative-sites.png', 117 | 'avg-scf-hist-file' : 'avg-scf-per-locus.png', 118 | 'low-scf-hist-file' : 'perc-low-scf-branchers-per-locus.png', 119 | 'scf-tree-plot-file' : 'scf-species-tree.png', 120 | 'bl-scf-plot-file' : 'bl-scf.png', 121 | 'html-file' : 'phyloacc-results.html', 122 | # Plot and HTML summary files 123 | 124 | 'html-summary-written' : 
False, 125 | # Output checks 126 | 127 | 'num-procs' : 1, 128 | # Number of procs for this script to use 129 | 130 | 'info' : False, 131 | 'dryrun' : False, 132 | 'quiet' : False, 133 | # Other user options 134 | 135 | 'skip-chars' : ["-", "N"], 136 | 'pad' : 82, 137 | 'endprog' : False, 138 | 'exit-code' : 0, 139 | 'log-v' : 1, 140 | 'stats' : True, 141 | 'progstarttime' : 0, 142 | 'stepstarttime' : 0, 143 | 'pids' : "", 144 | 'psutil' : False, 145 | 'qstats' : False, 146 | 'norun' : False, 147 | 'debug' : False, 148 | 'nolog' : False, 149 | # Internal stuff 150 | } 151 | 152 | globs_init['logfilename'] = "phyloacc-post-" + globs_init['startdatetime'] + ".errlog"; 153 | # Add the runtime to the error log file name. 154 | 155 | for line in open(os.path.join(os.path.dirname(__file__), "info.yaml"), "r"): 156 | line = line.strip().split(":\t"); 157 | globs_init[line[0]] = line[1]; 158 | # Reads meta info (version, urls, etc.) from the info.yaml file 159 | 160 | globs = StrictDict(globs_init); 161 | # Restrict the dict from having keys added to it after this 162 | 163 | return globs; 164 | 165 | ############################################################################# -------------------------------------------------------------------------------- /src/PhyloAcc-GT/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace ctnutils 16 | { 17 | 18 | vector GenGrayArray(int n) 19 | { 20 | // base case 21 | if (n <= 0) 22 | return vector(); 23 | 24 | // 'arr' will store all generated codes 25 | vector arr; 26 | 27 | // start with one-bit pattern 28 | arr.push_back("0"); 29 | arr.push_back("1"); 30 | 31 | // Every iteration of this loop generates 2*i codes from previously 32 | // generated i codes. 
33 | int i, j; 34 | for (i = 2; i < (1<= 0 ; j--) 39 | arr.push_back(arr[j]); 40 | 41 | // append 0 to the first half 42 | for (j = 0 ; j < i ; j++) 43 | arr[j] = "0" + arr[j]; 44 | 45 | // append 1 to the second half 46 | for (j = i ; j < 2*i ; j++) 47 | arr[j] = "1" + arr[j]; 48 | } 49 | 50 | // // print contents of arr[] 51 | // for (i = 0 ; i < arr.size() ; i++ ) 52 | // cout << arr[i] << endl; 53 | 54 | return arr; 55 | } 56 | 57 | } 58 | 59 | namespace strutils 60 | { 61 | 62 | string LoadFileToString(ifstream & in) 63 | { 64 | istreambuf_iterator beg(in), end; 65 | string str(beg, end); 66 | return str; 67 | } 68 | 69 | string LoadFileToString(const char *file_path) 70 | { 71 | ifstream in(file_path); 72 | istreambuf_iterator beg(in), end; 73 | string str(beg, end); 74 | return str; 75 | } 76 | 77 | string LoadFileToString(string file_path) 78 | { 79 | ifstream in(file_path.c_str()); 80 | istreambuf_iterator beg(in), end; 81 | string str(beg, end); 82 | return str; 83 | } 84 | 85 | vector &split(const string &s, char delim, vector &elems) 86 | { 87 | stringstream ss(s); 88 | string item; 89 | while(getline(ss, item, delim)) 90 | { 91 | // if (item.length() > 0) // skip empty token 92 | elems.push_back(item); 93 | } 94 | return elems; 95 | } 96 | 97 | vector split(const string &s, char delim) 98 | { 99 | vector elems; 100 | return split(s, delim, elems); 101 | } 102 | 103 | string ToUpperCase(string str) 104 | { 105 | transform(str.begin(), str.end(), str.begin(), ::toupper); 106 | return str; 107 | } 108 | 109 | string ToLowerCase(string str) 110 | { 111 | transform(str.begin(), str.end(), str.begin(), ::tolower); 112 | return str; 113 | } 114 | 115 | string replace(string value, string const & search, string const & replace) 116 | { 117 | string::size_type next; 118 | 119 | for(next = value.find(search);next != std::string::npos;next = value.find(search,next)) 120 | { 121 | value.replace(next,search.length(),replace); // Do the replacement. 
122 | next += replace.length(); // the next search from. 123 | } 124 | 125 | return value; 126 | } 127 | 128 | bool endsWith (std::string const &fullString, std::string const &ending) 129 | { 130 | if (fullString.length() >= ending.length()) { 131 | return (0 == fullString.compare (fullString.length() - ending.length(), ending.length(), ending)); 132 | } else { 133 | return false; 134 | } 135 | } 136 | 137 | string itoa(int value) 138 | { 139 | std::string buf; 140 | 141 | enum { kMaxDigits = 35 }; 142 | buf.reserve( kMaxDigits ); // Pre-allocate enough space. 143 | 144 | int quotient = value; 145 | 146 | // Translating number to string with base: 147 | do { 148 | buf += "0123456789abcdef"[ std::abs( quotient % 10 ) ]; 149 | quotient /= 10; 150 | } while ( quotient ); 151 | 152 | // Append the negative sign 153 | if ( value < 0) buf += '-'; 154 | 155 | std::reverse( buf.begin(), buf.end() ); 156 | return buf; 157 | } 158 | 159 | } 160 | 161 | namespace numutils 162 | { 163 | int CountDigits(int num) 164 | { 165 | if (num==0) 166 | return 1; 167 | int num_digits = (num<0)?1:0; 168 | while(num!=0) 169 | { 170 | num_digits ++; 171 | num /= 10; 172 | } 173 | return num_digits; 174 | } 175 | 176 | string EnoughSpaces(int MAX, int num) 177 | { 178 | string spaces = ""; 179 | int num_spaces = MAX-CountDigits(num); 180 | for(int i=0; i getdir(string dir) 205 | { 206 | vector files; 207 | DIR *dp; 208 | struct dirent *dirp; 209 | if((dp = opendir(dir.c_str())) == NULL) { 210 | cout << "Error opening " << dir << endl; 211 | exit(1); 212 | } 213 | 214 | while ((dirp = readdir(dp)) != NULL) { 215 | files.push_back(dir+"/"+string(dirp->d_name)); 216 | } 217 | closedir(dp); 218 | return files; 219 | } 220 | 221 | bool file_exists(const std::string& name) 222 | { 223 | ifstream f(name.c_str()); 224 | if (f.good()) { 225 | f.close(); 226 | return true; 227 | } else { 228 | f.close(); 229 | return false; 230 | } 231 | } 232 | } 233 | 
-------------------------------------------------------------------------------- /src/PhyloAcc-ST/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace ctnutils 16 | { 17 | 18 | vector GenGrayArray(int n) 19 | { 20 | // base case 21 | if (n <= 0) 22 | return vector(); 23 | 24 | // 'arr' will store all generated codes 25 | vector arr; 26 | 27 | // start with one-bit pattern 28 | arr.push_back("0"); 29 | arr.push_back("1"); 30 | 31 | // Every iteration of this loop generates 2*i codes from previously 32 | // generated i codes. 33 | int i, j; 34 | for (i = 2; i < (1<= 0 ; j--) 39 | arr.push_back(arr[j]); 40 | 41 | // append 0 to the first half 42 | for (j = 0 ; j < i ; j++) 43 | arr[j] = "0" + arr[j]; 44 | 45 | // append 1 to the second half 46 | for (j = i ; j < 2*i ; j++) 47 | arr[j] = "1" + arr[j]; 48 | } 49 | 50 | // // print contents of arr[] 51 | // for (i = 0 ; i < arr.size() ; i++ ) 52 | // cout << arr[i] << endl; 53 | 54 | return arr; 55 | } 56 | 57 | } 58 | 59 | namespace strutils 60 | { 61 | 62 | string LoadFileToString(ifstream & in) 63 | { 64 | istreambuf_iterator beg(in), end; 65 | string str(beg, end); 66 | return str; 67 | } 68 | 69 | string LoadFileToString(const char *file_path) 70 | { 71 | ifstream in(file_path); 72 | istreambuf_iterator beg(in), end; 73 | string str(beg, end); 74 | return str; 75 | } 76 | 77 | string LoadFileToString(string file_path) 78 | { 79 | ifstream in(file_path.c_str()); 80 | istreambuf_iterator beg(in), end; 81 | string str(beg, end); 82 | return str; 83 | } 84 | 85 | vector &split(const string &s, char delim, vector &elems) 86 | { 87 | stringstream ss(s); 88 | string item; 89 | while(getline(ss, item, delim)) 90 | { 91 | // if (item.length() > 0) // skip empty token 92 | elems.push_back(item); 93 | } 
94 | return elems; 95 | } 96 | 97 | vector split(const string &s, char delim) 98 | { 99 | vector elems; 100 | return split(s, delim, elems); 101 | } 102 | 103 | string ToUpperCase(string str) 104 | { 105 | transform(str.begin(), str.end(), str.begin(), ::toupper); 106 | return str; 107 | } 108 | 109 | string ToLowerCase(string str) 110 | { 111 | transform(str.begin(), str.end(), str.begin(), ::tolower); 112 | return str; 113 | } 114 | 115 | string replace(string value, string const & search, string const & replace) 116 | { 117 | string::size_type next; 118 | 119 | for(next = value.find(search);next != std::string::npos;next = value.find(search,next)) 120 | { 121 | value.replace(next,search.length(),replace); // Do the replacement. 122 | next += replace.length(); // the next search from. 123 | } 124 | 125 | return value; 126 | } 127 | 128 | bool endsWith (std::string const &fullString, std::string const &ending) 129 | { 130 | if (fullString.length() >= ending.length()) { 131 | return (0 == fullString.compare (fullString.length() - ending.length(), ending.length(), ending)); 132 | } else { 133 | return false; 134 | } 135 | } 136 | 137 | string itoa(int value) 138 | { 139 | std::string buf; 140 | 141 | enum { kMaxDigits = 35 }; 142 | buf.reserve( kMaxDigits ); // Pre-allocate enough space. 
143 | 144 | int quotient = value; 145 | 146 | // Translating number to string with base: 147 | do { 148 | buf += "0123456789abcdef"[ std::abs( quotient % 10 ) ]; 149 | quotient /= 10; 150 | } while ( quotient ); 151 | 152 | // Append the negative sign 153 | if ( value < 0) buf += '-'; 154 | 155 | std::reverse( buf.begin(), buf.end() ); 156 | return buf; 157 | } 158 | 159 | } 160 | 161 | namespace numutils 162 | { 163 | int CountDigits(int num) 164 | { 165 | if (num==0) 166 | return 1; 167 | int num_digits = (num<0)?1:0; 168 | while(num!=0) 169 | { 170 | num_digits ++; 171 | num /= 10; 172 | } 173 | return num_digits; 174 | } 175 | 176 | string EnoughSpaces(int MAX, int num) 177 | { 178 | string spaces = ""; 179 | int num_spaces = MAX-CountDigits(num); 180 | for(int i=0; i getdir(string dir) 205 | { 206 | vector files; 207 | DIR *dp; 208 | struct dirent *dirp; 209 | if((dp = opendir(dir.c_str())) == NULL) { 210 | cout << "Error opening " << dir << endl; 211 | exit(1); 212 | } 213 | 214 | while ((dirp = readdir(dp)) != NULL) { 215 | files.push_back(dir+"/"+string(dirp->d_name)); 216 | } 217 | closedir(dp); 218 | return files; 219 | } 220 | 221 | bool file_exists(const std::string& name) 222 | { 223 | ifstream f(name.c_str()); 224 | if (f.good()) { 225 | f.close(); 226 | return true; 227 | } else { 228 | f.close(); 229 | return false; 230 | } 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST-GBGC/SRC/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace ctnutils 16 | { 17 | 18 | vector GenGrayArray(int n) 19 | { 20 | // base case 21 | if (n <= 0) 22 | return vector(); 23 | 24 | // 'arr' will store all generated codes 25 | vector arr; 26 | 27 | // start with one-bit pattern 28 | 
arr.push_back("0"); 29 | arr.push_back("1"); 30 | 31 | // Every iteration of this loop generates 2*i codes from previously 32 | // generated i codes. 33 | int i, j; 34 | for (i = 2; i < (1<= 0 ; j--) 39 | arr.push_back(arr[j]); 40 | 41 | // append 0 to the first half 42 | for (j = 0 ; j < i ; j++) 43 | arr[j] = "0" + arr[j]; 44 | 45 | // append 1 to the second half 46 | for (j = i ; j < 2*i ; j++) 47 | arr[j] = "1" + arr[j]; 48 | } 49 | 50 | // // print contents of arr[] 51 | // for (i = 0 ; i < arr.size() ; i++ ) 52 | // cout << arr[i] << endl; 53 | 54 | return arr; 55 | } 56 | 57 | } 58 | 59 | namespace strutils 60 | { 61 | 62 | string LoadFileToString(ifstream & in) 63 | { 64 | istreambuf_iterator beg(in), end; 65 | string str(beg, end); 66 | return str; 67 | } 68 | 69 | string LoadFileToString(const char *file_path) 70 | { 71 | ifstream in(file_path); 72 | istreambuf_iterator beg(in), end; 73 | string str(beg, end); 74 | return str; 75 | } 76 | 77 | string LoadFileToString(string file_path) 78 | { 79 | ifstream in(file_path.c_str()); 80 | istreambuf_iterator beg(in), end; 81 | string str(beg, end); 82 | return str; 83 | } 84 | 85 | vector &split(const string &s, char delim, vector &elems) 86 | { 87 | stringstream ss(s); 88 | string item; 89 | while(getline(ss, item, delim)) 90 | { 91 | // if (item.length() > 0) // skip empty token 92 | elems.push_back(item); 93 | } 94 | return elems; 95 | } 96 | 97 | vector split(const string &s, char delim) 98 | { 99 | vector elems; 100 | return split(s, delim, elems); 101 | } 102 | 103 | string ToUpperCase(string str) 104 | { 105 | transform(str.begin(), str.end(), str.begin(), ::toupper); 106 | return str; 107 | } 108 | 109 | string ToLowerCase(string str) 110 | { 111 | transform(str.begin(), str.end(), str.begin(), ::tolower); 112 | return str; 113 | } 114 | 115 | string replace(string value, string const & search, string const & replace) 116 | { 117 | string::size_type next; 118 | 119 | for(next = value.find(search);next 
!= std::string::npos;next = value.find(search,next)) 120 | { 121 | value.replace(next,search.length(),replace); // Do the replacement. 122 | next += replace.length(); // the next search from. 123 | } 124 | 125 | return value; 126 | } 127 | 128 | bool endsWith (std::string const &fullString, std::string const &ending) 129 | { 130 | if (fullString.length() >= ending.length()) { 131 | return (0 == fullString.compare (fullString.length() - ending.length(), ending.length(), ending)); 132 | } else { 133 | return false; 134 | } 135 | } 136 | 137 | string itoa(int value) 138 | { 139 | std::string buf; 140 | 141 | enum { kMaxDigits = 35 }; 142 | buf.reserve( kMaxDigits ); // Pre-allocate enough space. 143 | 144 | int quotient = value; 145 | 146 | // Translating number to string with base: 147 | do { 148 | buf += "0123456789abcdef"[ std::abs( quotient % 10 ) ]; 149 | quotient /= 10; 150 | } while ( quotient ); 151 | 152 | // Append the negative sign 153 | if ( value < 0) buf += '-'; 154 | 155 | std::reverse( buf.begin(), buf.end() ); 156 | return buf; 157 | } 158 | 159 | } 160 | 161 | namespace numutils 162 | { 163 | int CountDigits(int num) 164 | { 165 | if (num==0) 166 | return 1; 167 | int num_digits = (num<0)?1:0; 168 | while(num!=0) 169 | { 170 | num_digits ++; 171 | num /= 10; 172 | } 173 | return num_digits; 174 | } 175 | 176 | string EnoughSpaces(int MAX, int num) 177 | { 178 | string spaces = ""; 179 | int num_spaces = MAX-CountDigits(num); 180 | for(int i=0; i getdir(string dir) 205 | { 206 | vector files; 207 | DIR *dp; 208 | struct dirent *dirp; 209 | if((dp = opendir(dir.c_str())) == NULL) { 210 | cout << "Error opening " << dir << endl; 211 | exit(1); 212 | } 213 | 214 | while ((dirp = readdir(dp)) != NULL) { 215 | files.push_back(dir+"/"+string(dirp->d_name)); 216 | } 217 | closedir(dp); 218 | return files; 219 | } 220 | 221 | bool file_exists(const std::string& name) 222 | { 223 | ifstream f(name.c_str()); 224 | if (f.good()) { 225 | f.close(); 226 | return 
true; 227 | } else { 228 | f.close(); 229 | return false; 230 | } 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ############################################################################# 3 | # This is the python front-end for PhyloAcc, a Bayesian substitution rate 4 | # estimation program for conserved non-coding genomic elements. This script 5 | # will handle user inputs, model selection, and batching jobs 6 | # 7 | # Gregg Thomas 8 | # Summer 2021 9 | ############################################################################# 10 | 11 | import sys 12 | import os 13 | import phyloacc_lib.core as PC 14 | import phyloacc_lib.params as params 15 | import phyloacc_lib.opt_parse as OP 16 | import phyloacc_lib.treeio as TREEIO 17 | import phyloacc_lib.seq as SEQ 18 | import phyloacc_lib.tree as TREE 19 | import phyloacc_lib.tree_old as TREEF 20 | import phyloacc_lib.cf as CF 21 | import phyloacc_lib.output as OUT 22 | import phyloacc_lib.batch as BATCH 23 | import phyloacc_lib.plot as PLOT 24 | import phyloacc_lib.html as HTML 25 | 26 | ############################################################################# 27 | 28 | if __name__ == '__main__': 29 | # Main is necessary for multiprocessing to work on Windows. 30 | 31 | globs = params.init(); 32 | # Get the global params as a dictionary. 33 | 34 | globs['script-dir'] = os.path.dirname(os.path.realpath(__file__)); 35 | 36 | globs = OP.optParse(globs); 37 | # Getting the input parameters from optParse. 38 | 39 | #print(globs['tree-data-type']); 40 | #print(globs['scf-site-type']); 41 | # Tree debugging stuff 42 | 43 | if globs['info']: 44 | print("# --info SET. EXITING AFTER PRINTING PROGRAM INFO...\n#") 45 | sys.exit(0); 46 | if globs['norun']: 47 | print("# --norun SET. 
EXITING AFTER PRINTING OPTIONS INFO...\n#") 48 | sys.exit(0); 49 | # Early exit options 50 | 51 | step_start_time = PC.report_step(globs, "", "", "", start=True); 52 | # Initialize the step headers 53 | 54 | ## Initialization 55 | #################### 56 | 57 | if not globs['debug-aln']: 58 | if globs['tree-data-type'] == 'class': 59 | globs['tree-string'], globs['st'] = TREEIO.readST(globs); 60 | elif globs['tree-data-type'] == 'func': 61 | globs['tree-string'], globs['st'], globs['labeled-tree'], globs['root'], globs['tips'], globs['internals'] = TREEIO.readST(globs); 62 | 63 | PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# Tree read from mod file:", 45) + globs['tree-string']); 64 | # Report the tree that was read -- this line is necessary for phyloacc_post.py to read the tree used in the run 65 | 66 | if globs['debug-tree']: 67 | TREE.debugTree(globs); 68 | # print(globs['st'].has_label); 69 | sys.exit(0); 70 | # Print the debug stats and exit if --debugtree is set 71 | 72 | if globs['label-tree']: 73 | if globs['tree-data-type'] == 'class': 74 | globs['st'].showAttrib("type", "label", "desc"); 75 | 76 | print("\nInput tree with integer labels:"); 77 | print(globs['st'].tree_str +"\n"); 78 | 79 | #if not globs['st'].has_label: 80 | print("\nInput tree with descendant labels:"); 81 | print(globs['st'].labelDesc()); 82 | 83 | #print(globs['labeled-tree']); 84 | #print(); 85 | sys.exit(0); 86 | elif not globs['st'].has_label: 87 | PC.errorOut("MAIN2", "One or more internal nodes in your tree are unlabeled, which is required for PhyloAcc. 
Please label all internal nodes (maybe with the --labeltree option) and replace the tree in your .mod file with the labeled tree.", globs); 88 | 89 | #else: 90 | # CORE.printWrite(globs['logfilename'], globs['log-v'], "# INFO: Original tree with node labels:\t" + globs['st'].tree_str); 91 | # Print the tree and exit if --labeltree is set 92 | 93 | ## Read species tree 94 | #################### 95 | 96 | globs = TREEIO.readSpeciesGroups(globs); 97 | 98 | ## Read species groups 99 | #################### 100 | 101 | if not globs['theta'] and globs['coal-tree-file']: 102 | if globs['tree-data-type'] == 'class': 103 | tree_str, tree = TREEIO.readST(globs, tree_type="coalescent"); 104 | elif globs['tree-data-type'] == 'func': 105 | tree_str, tree_dict, tree, root, tips, internals = TREEIO.readST(globs, tree_type="coalescent"); 106 | 107 | ## Read coalescent tree if provided 108 | #################### 109 | ## End check for --debugaln option 110 | 111 | globs = SEQ.readSeq(globs); 112 | 113 | ## Library to read input sequences 114 | #################### 115 | 116 | globs = SEQ.alnStats(globs); 117 | 118 | ## Calculate some basic alignment stats 119 | #################### 120 | 121 | if globs['debug-aln']: 122 | sys.exit(0); 123 | # Exit here if --debugaln is set 124 | 125 | if globs['run-mode'] == 'adaptive': 126 | #globs = CF.scf(globs, globs['st'], globs['alns'], globs['scf-pool']); 127 | globs = CF.scf(globs); 128 | # globs = TREE.scf(globs); 129 | 130 | # Calculate avg. sCF per locus 131 | #################### 132 | 133 | globs = OUT.writeAlnStats(globs); 134 | # Write out the alignment summary stats 135 | 136 | if globs['run-mode'] == 'adaptive': 137 | #globs = OUT.writeSCFStats(globs); 138 | globs = TREEIO.writeCF(globs); 139 | # Write out the sCF summary stats 140 | 141 | #################### 142 | 143 | globs = BATCH.genJobFiles(globs); 144 | # Generates the locus specific job files (aln, bed, config, etc.) 
for phyloacc 145 | 146 | globs = BATCH.writeSnakemake(globs); 147 | # Generates the snakemake config and cluster profile 148 | 149 | #################### 150 | 151 | if globs['local']: 152 | globs['smk-cmd'] = "snakemake -p --jobs " + str(globs['num-jobs']); 153 | globs['smk-cmd'] += " --cores " + str(globs['procs-per-job'] * globs['num-jobs']); 154 | globs['smk-cmd'] += " -s " + os.path.abspath(globs['smk']); 155 | globs['smk-cmd'] += " --configfile " + os.path.abspath(globs['smk-config']); 156 | globs['smk-cmd'] += " --dryrun"; 157 | else: 158 | globs['smk-cmd'] = "snakemake -p -s " + os.path.abspath(globs['smk']); 159 | globs['smk-cmd'] += " --configfile " + os.path.abspath(globs['smk-config']); 160 | globs['smk-cmd'] += " --profile " + os.path.abspath(globs['profile-dir']); 161 | #globs['smk-cmd'] += " --cluster-status " + os.path.abspath(globs['status-script']); 162 | globs['smk-cmd'] += " --dryrun"; 163 | # The snakemake command to run PhyloAcc 164 | 165 | #################### 166 | 167 | if globs['plot']: 168 | PLOT.genPlots(globs); 169 | globs = HTML.writeHTML(globs); 170 | 171 | #################### 172 | 173 | PC.endProg(globs); 174 | 175 | ############################################################################# 176 | 177 | -------------------------------------------------------------------------------- /docs/v1/READMEv1.md: -------------------------------------------------------------------------------- 1 | # PhyloAcc 2 | This is a software to detect the shift pattern of DNA substitution rate of a genomic region and identify genomic elements accelerated in some specific species from a set of conserved elements. The underlying model assumes a latent discrete state (Z) of relative substitution rate of each branch on the phylogeny which can be neutral, conserved and accelerated. 
For each genomic element, it will start from a neutral or conserved state at the common ancestor of the phylogeny, transition to a conserved state if not already conserved, and then reach an accelerated state in some lineages. Our method utilizes adaptive collapsed Gibbs sampling to obtain the pattern of substitution rate shifts (posterior distribution of Z) as well as the relative substitution rates of the conserved and accelerated states. To identify DNA elements that are accelerated on specific branches, we compare marginal likelihoods under three models: null model (M0) where all branches are neutral or conserved; accelerated model (M1) in which branches leading to target species are accelerated; and full model (M2), with no constraints on latent states. Then we use two Bayes factors: between M1 and M0 (BF1) and between M1 and M2 (BF2) as criteria to identify DNA elements accelerated exclusively in target lineages. 3 | 4 | ## Getting Started 5 | Some preliminary inputs which might be generated from other software are required: (1) a phylogeny in mod format. The file can be output from phyloFit in the PHAST package, with the transition rate matrix of bases and branch lengths. Our model assumes that these branch lengths represent the expected number of substitutions in the background state and will estimate the conserved and accelerated rate relative to the background rate. In our study, we used genome-wide four-fold degenerate sites to estimate the rate matrix and branch lengths. (2) a multiple alignment file concatenating sequences of all input conserved elements in FASTA format, and (3) a bed file with the position of each individual element in the coordinates of the concatenated alignment file (0-based). 6 | 7 | 8 | We also need a parameter file, which contains the paths for input files and output directory, information of species and parameters for MCMC. Please read [README_PARAMETER.md](https://github.com/xyz111131/PhyloAcc/blob/master/README_PARAMETER.md) for more detail. 
9 | 10 | 11 | After running the algorithm, our method will output the posterior of the latent state (Z) for each branch (indicating background, conserved or accelerated) for each element under each model in the files "*prefix*\_rate_postZ\_M[0-2].txt", and the marginal log-likelihoods for each element in the file "*prefix*_elem_lik.txt". The format of the output files is explained in [README_OUTPUT.md](https://github.com/xyz111131/PhyloAcc/blob/master/README_OUTPUT.md). 12 | 13 | ## Prerequisites 14 | * [GCC](https://gcc.gnu.org/): You might need the latest GCC (version 7) supporting OpenMP. If you are using Mac, you could use brew to (re)install gcc. 15 | ```bash 16 | brew update ## update the formulae and Homebrew itself, if your brew is out-dated 17 | brew install gcc 18 | ``` 19 | * [GNU Scientific Library (GSL)](https://www.gnu.org/software/gsl/): a numerical library for C and C++. PhyloAcc has been 20 | tested with version 2.4 of GSL. 21 | * [Armadillo](http://arma.sourceforge.net/): C++ linear algebra library. You could install Armadillo following the steps on its website. For Linux, before installing Armadillo, you need to install CMAKE, LAPACK, BLAS (OPENBLAS) and ATLAS, along with the corresponding development/header files. PhyloAcc has been tested with version 7.800.2. 22 | For Mac, you could use brew (tested and recommended): 23 | ```bash 24 | brew install homebrew/science/armadillo 25 | ``` 26 | * [OpenMP](http://www.openmp.org/): for parallel computing. 27 | * To use the R functions to plot, please install [Rstudio](https://www.rstudio.com/) with the current version of R (>=3.3.2) and install the seqinr, ggplot2, reshape2, and ape packages. 
28 | 29 | Alternatively, you can use Conda to install these required packages: 30 | 1) make a new conda environment called "PhyloAcc" for example, and activate the environment 31 | ```bash 32 | conda create -n PhyloAcc 33 | conda activate PhyloAcc 34 | ``` 35 | 2) conda install gsl 36 | 3) conda install -c conda-forge lapack 37 | 4) conda install -c conda-forge armadillo 38 | 5) edit your .bashrc in your home directory: 39 | ```bash 40 | export LD_LIBRARY_PATH=/user/anaconda3/envs/PhyloAcc/lib/:$LD_LIBRARY_PATH 41 | ``` 42 | or other path of the conda environment 43 | 7) source .bashrc 44 | 8) change to g++-9 in the PhyloAcc Makefile if you are using GCC version 9 45 | 9) edit the location of GSL lib and include in the Makefile: e.g. 46 | ```bash 47 | GSL_HOME=/user/anaconda3/envs/PhyloAcc/ 48 | ``` 49 | 11) edit *PREFIX* in the Makefile for the installation directory of the conda env 50 | 51 | *(credited to Wei Gordon)* 52 | 53 | ## Build on Linux or Mac 54 | Run: 55 | ```bash 56 | make 57 | ``` 58 | in PhyloAcc directory to generate the 'PhyloAcc' executable. 59 | 60 | ## Installation 61 | Run: 62 | ```bash 63 | sudo make install 64 | ``` 65 | to install in default path /usr/local/bin, and 66 | ```bash 67 | sudo make uninstall 68 | ``` 69 | to uninstall. 70 | 71 | For our extended version modeling GC-based gene conversion (gBGC) effect, please go to [V2_GBGC/](https://github.com/xyz111131/PhyloAcc/blob/master/V2_GBGC) and make & install under that directory. 
72 | 73 | ## Usage 74 | Try this in the PhyloAcc directory as a test, which will run simulated elements on the ratite phylogeny: 75 | ```bash 76 | mkdir Simulation_ratite/result_tmp 77 | ./PhyloAcc Simulation_ratite/param2-1-test.txt 78 | ``` 79 | or this after installation: 80 | ```bash 81 | PhyloAcc Simulation_ratite/param2-1-test.txt 82 | ``` 83 | For testing purposes, it will only run the first 10 elements of simulated data from Simulation_ratite/simu_500_200_diffr_2-1.* and output to Simulation_ratite/result_tmp/. To run all the elements and get results in Simulation_ratite/result_phyloAcc/, you could run: 84 | ```bash 85 | ./PhyloAcc Simulation_ratite/param2-6.txt 86 | ``` 87 | To run your own data, please change the paths in your parameter file. 88 | 89 | To run the model including gBGC, 90 | ```bash 91 | cd V2_GBGC 92 | mkdir Simulation/result_tmp 93 | ./PhyloAcc_gBGC paramGC-0.txt 94 | ``` 95 | under *V2_GBGC/*. It will output to *V2_GBGC/Simulation/result_tmp/* (which will be the same as in V2_GBGC/Simulation/result/). 96 | 97 | There are several R scripts available in [R/](https://github.com/xyz111131/PhyloAcc/blob/master/R) which read the output from PhyloAcc and generate the plots in the main paper (e.g. "scaled" phylogenetic tree and sequence alignment for one element). Please read [plot.html](https://xyz111131.github.io/PhyloAcc/R/plot.html) and run [plot.Rmd](https://github.com/xyz111131/PhyloAcc/blob/master/R/plot.Rmd) for details. R scripts to generate simulated DNA sequences are also in [R/](https://github.com/xyz111131/PhyloAcc/blob/master/R); please see [Simulation.md](https://github.com/xyz111131/PhyloAcc/blob/master/Simulation.md) for more detail. 98 | 99 | ## Data 100 | The species names and phylogenetic trees used in the PhyloAcc manuscript are in [Data/](https://github.com/xyz111131/PhyloAcc/blob/master/Data/). 
The simulated sequences and results are in [Simulation_mammal/](https://github.com/xyz111131/PhyloAcc/blob/master/Simulation_mammal/) and [Simulation_ratite/](https://github.com/xyz111131/PhyloAcc/blob/master/Simulation_ratite/). The results for ratite and mammalian CNEEs are in [mammal_result/](https://github.com/xyz111131/PhyloAcc/blob/master/mammal_result/) and [ratite_result/](https://github.com/xyz111131/PhyloAcc/blob/master/ratite_result/). 101 | 102 | ## Trouble Shooting 103 | After running phyloFit for background sequences to get the rate matrix and branch lengths, you might need to name all ancestral nodes in the tree. You could use tree_doctor in phast (http://compgen.cshl.edu/phast/help-pages/tree_doctor.txt): 104 | ``` bash 105 | tree_doctor --name-ancestors input.mod > output.mod 106 | ``` 107 | 108 | ## Reference 109 | Zhirui Hu, Timothy B Sackton, Scott V. Edwards, Jun S. Liu: **A hierarchical Bayesian model for detecting convergent rate changes of conserved noncoding elements on phylogenetic trees**, *bioRxiv*, 2018. 110 | https://www.biorxiv.org/content/early/2018/02/22/260745.1 111 | -------------------------------------------------------------------------------- /src/PhyloAcc-GT/bpp_c2.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // bpp_c2.cpp 3 | // PhyloAcc_init2 4 | // 5 | // Created by hzr on 4/20/16. 6 | // Copyright © 2016 hzr. All rights reserved. 
7 | // 8 | 9 | #include "bpp_c.hpp" 10 | #include "bpp.hpp" 11 | #include 12 | 13 | struct Cmp2 14 | { 15 | bool operator()(const pair &a, const pair &b) 16 | { 17 | if (a.first == b.first) 18 | { 19 | return a.second <= b.second; 20 | } 21 | return a.first > b.first; 22 | } 23 | }; 24 | 25 | void BPP_C::Output_sampling(int iter, string output_path2, BPP &bpp, int resZ) 26 | { 27 | 28 | string outpath_lik = output_path2 + "_mcmc_trace_M" + to_string(resZ) + "_" + to_string(CC) + ".txt"; 29 | ofstream out_lik; 30 | out_lik.precision(8); 31 | 32 | #pragma omp critical 33 | { 34 | if (iter == 0) 35 | { 36 | out_lik.open(outpath_lik.c_str()); 37 | out_lik << "iter\tloglik\tindicator\trate_n\trate_c\tpi_A\tGTtop\t"; 38 | for (int s = 0; s < N; s++) 39 | { // header: species name 40 | out_lik << bpp.nodes_names[s] << "\t"; 41 | } 42 | out_lik<<"grate\tlrate"; 43 | out_lik << endl; 44 | } 45 | else 46 | { 47 | out_lik.open(outpath_lik.c_str(), ios::app); 48 | } 49 | 50 | for (std::size_t i = 0; i < num_mcmc + num_burn; i++) 51 | { 52 | out_lik << iter << "\t" << trace_loglik[i] << "\t" << trace_indicator[i] << "\t" << trace_n_rate[i] << "\t" << trace_c_rate[i] << "\t" << trace_pi[i][0] << "\t" <, Cmp2> topK; 70 | set>::iterator itbegin; 71 | for (map::iterator it = trace_genetree.begin(); it != trace_genetree.end(); it++) //Han: number of times each tree sampled. 
72 | { 73 | if (topK.size() < 10) //Han: get the top 5 most freq gene trees 74 | { 75 | topK.insert(make_pair(it->second.count, it->first)); 76 | } 77 | else 78 | { 79 | // 80 | if (topK.rbegin()->first <= it->second.count) 81 | { 82 | topK.insert(make_pair(it->second.count, it->first)); 83 | itbegin = topK.begin(); 84 | advance(itbegin, 9); 85 | int temp_count = itbegin->first; 86 | for (; itbegin != topK.end(); itbegin++) 87 | { 88 | if (itbegin->first < temp_count) 89 | break; 90 | } 91 | 92 | topK.erase(itbegin, topK.end()); 93 | } 94 | } 95 | } 96 | 97 | #pragma omp critical 98 | if(iter ==0){ 99 | outG.open(outpath_Gt.c_str()); 100 | outG<<"ID\titer\tTreeNum\tprop\tG"<second; 110 | 111 | //get average branch len 112 | for (vector::iterator it = trace_genetree[gt].distances.begin(); it != trace_genetree[gt].distances.end(); it++) 113 | { 114 | *it /= itbegin->first; 115 | } 116 | std::stringstream buffer; 117 | trace_genetree[gt].printTree(trace_genetree[gt].root, bpp, buffer); 118 | outG<< CC << "\t" <first / num_mcmc << "\t" << buffer.str() << endl; 119 | } 120 | outG.close(); 121 | } 122 | 123 | void BPP_C::Output_init(string output_path, string output_path2, BPP &bpp, ofstream &out_Z, ofstream &out_tree, int mod_GT) 124 | { 125 | 126 | int mid = num_mcmc / 2; 127 | std::sort(trace_n_rate.begin() + num_burn, trace_n_rate.begin() + num_mcmc + num_burn); 128 | double n_rate = trace_n_rate[mid + num_burn]; //Han: medium 129 | 130 | double c_rate = 0; 131 | for (std::size_t i = num_burn; i < num_mcmc + num_burn; i++) 132 | { 133 | c_rate += trace_c_rate[i]; 134 | } 135 | c_rate /= num_mcmc; //Han mean 136 | 137 | std::sort(trace_g_rate.begin() + num_burn, trace_g_rate.begin() + num_mcmc + num_burn); 138 | double g_rate = trace_g_rate[mid + num_burn]; //Han: medium 139 | 140 | std::sort(trace_l_rate.begin() + num_burn, trace_l_rate.begin() + num_mcmc + num_burn); 141 | double l_rate = trace_l_rate[mid + num_burn]; 142 | 143 | std::sort(trace_l2_rate.begin() + 
num_burn, trace_l2_rate.begin() + num_mcmc + num_burn); 144 | double l2_rate = trace_l2_rate[mid + num_burn]; 145 | 146 | vector> countZ = vector>(N, vector(4, 0)); 147 | for (int s = 0; s < N; s++) 148 | { 149 | 150 | for (std::size_t i = num_burn; i < num_mcmc + num_burn; i++) 151 | { 152 | 153 | countZ[s][trace_Z[i][s] + 1]++; 154 | } 155 | 156 | if (missing[s]) 157 | { 158 | countZ[s][0] = num_mcmc; //set missing s = 1, though Z[s] can be 0/1/2; only missing in upper Z[s] = -1 159 | } 160 | } 161 | 162 | // output top K genetree, K = 5 163 | set, Cmp2> topK; 164 | set>::iterator itbegin; 165 | for (map::iterator it = trace_genetree.begin(); it != trace_genetree.end(); it++) //Han: number of times each tree sampled. 166 | { 167 | if (topK.size() < 5) //Han: get the top 5 most freq gene trees 168 | { 169 | topK.insert(make_pair(it->second.count, it->first)); 170 | } 171 | else 172 | { 173 | // 174 | if (topK.rbegin()->first <= it->second.count) 175 | { 176 | topK.insert(make_pair(it->second.count, it->first)); 177 | itbegin = topK.begin(); 178 | advance(itbegin, 4); 179 | int temp_count = itbegin->first; 180 | for (; itbegin != topK.end(); itbegin++) 181 | { 182 | if (itbegin->first < temp_count) 183 | break; 184 | } 185 | 186 | topK.erase(itbegin, topK.end()); 187 | } 188 | } 189 | } 190 | 191 | #pragma omp critical 192 | { 193 | out_Z << CC << "\t" << n_rate << "\t" << c_rate << "\t" << g_rate << "\t" << l_rate << "\t" << l2_rate; 194 | for (int s = 0; s < N; s++) 195 | { 196 | //out_Z <<"\t"<second; 207 | 208 | //get average branch len 209 | if(mod_GT==0){ 210 | for (vector::iterator it = trace_genetree[gt].distances.begin(); it != trace_genetree[gt].distances.end(); it++) 211 | { 212 | *it /= itbegin->first; 213 | } 214 | } 215 | std::stringstream buffer; 216 | trace_genetree[gt].printTree(trace_genetree[gt].root, bpp, buffer); 217 | #pragma omp critical 218 | { 219 | out_tree << CC << "\t" << (double)itbegin->first / num_mcmc << "\t" << buffer.str() << endl; 
220 | } 221 | } 222 | } 223 | 224 | 225 | void BPP_C::Output_GTsampling(string output_path2, BPP &bpp, int resZ) 226 | { 227 | string outpath_mctree = output_path2 + "_trace_genetree_M" + to_string(resZ) + "_" + to_string(CC) + ".txt"; 228 | ofstream out_mctree; 229 | out_mctree.precision(8); 230 | 231 | #pragma omp critical 232 | { 233 | out_mctree.open(outpath_mctree.c_str()); 234 | out_mctree<<"No.\tCount\tGTtopology\tgenetree"; 235 | for(int s=S;s::iterator it = trace_genetree.begin(); it != trace_genetree.end(); it++) 242 | { 243 | no = no + 1; 244 | string tp = it->first; 245 | for(vector::iterator it2=trace_genetree[tp].distances.begin(); it2 !=trace_genetree[tp].distances.end();it2++){ 246 | *it2 /= trace_genetree[tp].count; 247 | } //Get average tree //If called bppc.Output_init before, then topK5 trees are alr average tree. 248 | std::stringstream buffer; 249 | trace_genetree[tp].printTree(trace_genetree[tp].root, bpp, buffer); 250 | out_mctree << no << "\t" << it->second.count << "\t" << tp << "\t" << buffer.str(); 251 | for(int s=S; s 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace ctnutils 16 | { 17 | 18 | 19 | template 20 | vector sort_indexes(const vector &v) { 21 | 22 | // initialize original index locations 23 | vector idx(v.size()); 24 | iota(idx.begin(), idx.end(), 0); 25 | 26 | // sort indexes based on comparing values in v 27 | sort(idx.begin(), idx.end(), 28 | [&v](size_t i1, size_t i2) {return v[i1] < v[i2];}); 29 | 30 | return idx; 31 | } 32 | 33 | template 34 | struct IdxCompare 35 | { 36 | const std::vector& target; 37 | 38 | IdxCompare(const std::vector& target): target(target) {} 39 | 40 | bool operator()(size_t a, size_t b) const { return target[a].size() > target[b].size(); } 41 | }; 42 | 43 | template 44 | vector sort_indexes(const vector &v, int K) { 45 | 46 | 47 | if(K<0) K =v.size(); 48 | // initialize original index locations 49 | vector idx(K); 50 
| for (size_t i = 0; i != idx.size(); ++i) idx[i] = i; 51 | 52 | // sort indexes based on comparing values in v 53 | sort(idx.begin(), idx.end(), IdxCompare(v)); 54 | 55 | return idx; 56 | } 57 | 58 | template 59 | vector merge(const vector & v1, const vector & v2) 60 | { 61 | vector c; 62 | c.insert(c.end(), v1.begin(), v1.end()); 63 | c.insert(c.end(), v2.begin(), v2.end()); 64 | return c; 65 | } 66 | 67 | template 68 | vector merge(const vector & v1, const T & v2) 69 | { 70 | vector c; 71 | c.insert(c.end(), v1.begin(), v1.end()); 72 | c.push_back(v2); 73 | return c; 74 | } 75 | 76 | vector GenGrayArray(int n); 77 | 78 | template 79 | vector< vector > subsets(const vector & s) 80 | { 81 | int len = s.size(); 82 | vector arr = ctnutils::GenGrayArray(len); 83 | 84 | vector< vector > results; 85 | for(int i=0; i result; 88 | for(int j=0; j 103 | vector SetDiff(const vector & A, const vector & B) 104 | { 105 | vector C = A; 106 | for(int i=0; i 125 | void DispVector(const vector & vec, string sep = "\t") 126 | { 127 | if (vec.size()==0) 128 | { 129 | cout << "(empty vector)"; 130 | return; 131 | } 132 | 133 | cout << "("; 134 | for(unsigned i=0; i 140 | void DispVector(const vector & vec, const vector & ref, string sep = "\t") 141 | { 142 | if (vec.size()==0) 143 | { 144 | cout << "(empty vector)"; 145 | return; 146 | } 147 | 148 | cout << "("; 149 | for(unsigned i=0; i 155 | void DispVector(T *vec, int size, string sep = "\t") 156 | { 157 | if (size==0) 158 | { 159 | cout << "(empty vector)"; 160 | return; 161 | } 162 | 163 | cout << "("; 164 | for(int i=0; i 170 | void DispMatrix(vector > & matrix) 171 | { 172 | for(unsigned i=0; i 185 | void DispMatrix(T **matrix, int num_row, int num_col) 186 | { 187 | for(int i=0; i 197 | void FillVector(T *vec, int n, T elem) 198 | { 199 | for(int i=0;i 204 | void FillVector(vector & vec, int n, T elem) 205 | { 206 | for(int i=0;i 211 | bool IsVectorSame(const vector & v1, const vector & v2) 212 | { 213 | if 
(v1.size()!=v2.size()) 214 | return false; 215 | for(unsigned i=0; i 222 | bool IsVectorSame(T *v1, T *v2, int n) 223 | { 224 | for(int i=0; i 231 | bool Contains(const vector & vec, T elem) 232 | { 233 | for(int i=0; i 243 | bool Contains(const vector & vec, const vector & vec2) 244 | { 245 | for(int i=0; i 253 | struct FindReturn 254 | { 255 | int pos; 256 | T val; 257 | }; 258 | 259 | template 260 | FindReturn Find(const vector & vec, T t) 261 | { 262 | FindReturn fr; 263 | fr.pos = -1; 264 | for(int i=0; i 276 | FindReturn FindMax(const vector & vec, T min) 277 | { 278 | FindReturn fr; 279 | fr.pos = -1; 280 | for(int i=0; i min) 283 | { 284 | fr.pos = i; 285 | fr.val = vec[i]; 286 | min = vec[i]; 287 | } 288 | } 289 | return fr; 290 | } 291 | 292 | template 293 | FindReturn FindMin(const vector & vec, T max) 294 | { 295 | FindReturn fr; 296 | fr.pos = -1; 297 | for(int i=0; i &split(const string &s, char delim, vector &elems); 327 | vector split(const string &s, char delim); 328 | 329 | 330 | string ToUpperCase(string str); 331 | 332 | string ToLowerCase(string str); 333 | 334 | string replace(string value, string const & search, string const & replace); 335 | 336 | bool endsWith (std::string const &fullString, std::string const &ending); 337 | 338 | string itoa(int value); 339 | } 340 | 341 | namespace numutils 342 | { 343 | 344 | int CountDigits(int num); 345 | 346 | string EnoughSpaces(int MAX, int num); 347 | 348 | bool is_number(const std::string& s); 349 | 350 | } 351 | 352 | namespace oututils 353 | { 354 | 355 | void RepeatOutputs(int num_spaces, ostream & out = cout, string sp = " "); 356 | 357 | } 358 | 359 | 360 | namespace statutils 361 | { 362 | // adjusted rand index 363 | double ari(vector I1, vector I2); 364 | 365 | double ari(vector > I1, vector > I2); 366 | 367 | void test_ari(); 368 | 369 | template 370 | T sum(vector & vec) 371 | { 372 | T m = 0; 373 | for(int i=0; i 381 | T mean(vector & vec) 382 | { 383 | T m = 0; 384 | for(int i=0; i 393 
| T var(vector & vec) 394 | { 395 | T m = mean(vec); 396 | T var = 0; 397 | for(int i=0; i getdir (string dir); 409 | 410 | bool file_exists (const std::string& name); 411 | } 412 | #endif // UTILS_H_INCLUDED 413 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H_INCLUDED 2 | #define UTILS_H_INCLUDED 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | 14 | namespace ctnutils 15 | { 16 | 17 | //template 18 | //vector< vector > sortbySizeDec(vector< vector > v) 19 | //{ 20 | // for(int i=0; i 33 | struct IdxCompare 34 | { 35 | const std::vector& target; 36 | 37 | IdxCompare(const std::vector& target): target(target) {} 38 | 39 | bool operator()(size_t a, size_t b) const { return target[a].size() > target[b].size(); } 40 | }; 41 | 42 | template 43 | vector sort_indexes(const vector &v, int K) { 44 | 45 | 46 | if(K<0) K =v.size(); 47 | // initialize original index locations 48 | vector idx(K); 49 | for (size_t i = 0; i != idx.size(); ++i) idx[i] = i; 50 | 51 | // sort indexes based on comparing values in v 52 | sort(idx.begin(), idx.end(), IdxCompare(v)); 53 | 54 | return idx; 55 | } 56 | 57 | 58 | // template 59 | // vector sort_indexes(const vector &v, int K) { 60 | // 61 | // if(K<0) K =v.size(); 62 | // // initialize original index locations 63 | // vector idx(K); 64 | // for (size_t i = 0; i != idx.size(); ++i) idx[i] = i; 65 | // 66 | // // sort indexes based on comparing values in v 67 | // sort(idx.begin(), idx.end(), 68 | // [&v](size_t i1, size_t i2) {return v[i1].size() > v[i2].size();}); 69 | // 70 | // return idx; 71 | // } 72 | 73 | 74 | 75 | template 76 | vector merge(const vector & v1, const vector & v2) 77 | { 78 | vector c; 79 | c.insert(c.end(), v1.begin(), v1.end()); 80 | c.insert(c.end(), v2.begin(), v2.end()); 81 | return 
c; 82 | } 83 | 84 | template 85 | vector merge(const vector & v1, const T & v2) 86 | { 87 | vector c; 88 | c.insert(c.end(), v1.begin(), v1.end()); 89 | c.push_back(v2); 90 | return c; 91 | } 92 | 93 | vector GenGrayArray(int n); 94 | 95 | template 96 | vector< vector > subsets(const vector & s) 97 | { 98 | int len = s.size(); 99 | vector arr = ctnutils::GenGrayArray(len); 100 | 101 | vector< vector > results; 102 | for(int i=0; i result; 105 | for(int j=0; j 120 | vector SetDiff(const vector & A, const vector & B) 121 | { 122 | vector C = A; 123 | for(int i=0; i 142 | void DispVector(const vector & vec, string sep = "\t") 143 | { 144 | if (vec.size()==0) 145 | { 146 | cout << "(empty vector)"; 147 | return; 148 | } 149 | 150 | cout << "("; 151 | for(unsigned i=0; i 157 | void DispVector(const vector & vec, const vector & ref, string sep = "\t") 158 | { 159 | if (vec.size()==0) 160 | { 161 | cout << "(empty vector)"; 162 | return; 163 | } 164 | 165 | cout << "("; 166 | for(unsigned i=0; i 172 | void DispVector(T *vec, int size, string sep = "\t") 173 | { 174 | if (size==0) 175 | { 176 | cout << "(empty vector)"; 177 | return; 178 | } 179 | 180 | cout << "("; 181 | for(int i=0; i 187 | void DispMatrix(vector > & matrix) 188 | { 189 | for(unsigned i=0; i 202 | void DispMatrix(T **matrix, int num_row, int num_col) 203 | { 204 | for(int i=0; i 214 | void FillVector(T *vec, int n, T elem) 215 | { 216 | for(int i=0;i 221 | void FillVector(vector & vec, int n, T elem) 222 | { 223 | for(int i=0;i 228 | bool IsVectorSame(const vector & v1, const vector & v2) 229 | { 230 | if (v1.size()!=v2.size()) 231 | return false; 232 | for(unsigned i=0; i 239 | bool IsVectorSame(T *v1, T *v2, int n) 240 | { 241 | for(int i=0; i 248 | bool Contains(const vector & vec, T elem) 249 | { 250 | for(int i=0; i 260 | bool Contains(const vector & vec, const vector & vec2) 261 | { 262 | for(int i=0; i 270 | struct FindReturn 271 | { 272 | int pos; 273 | T val; 274 | }; 275 | 276 | template 277 | 
FindReturn Find(const vector & vec, T t) 278 | { 279 | FindReturn fr; 280 | fr.pos = -1; 281 | for(int i=0; i 293 | FindReturn FindMax(const vector & vec, T min) 294 | { 295 | FindReturn fr; 296 | fr.pos = -1; 297 | for(int i=0; i min) 300 | { 301 | fr.pos = i; 302 | fr.val = vec[i]; 303 | min = vec[i]; 304 | } 305 | } 306 | return fr; 307 | } 308 | 309 | template 310 | FindReturn FindMin(const vector & vec, T max) 311 | { 312 | FindReturn fr; 313 | fr.pos = -1; 314 | for(int i=0; i &split(const string &s, char delim, vector &elems); 344 | vector split(const string &s, char delim); 345 | 346 | 347 | string ToUpperCase(string str); 348 | 349 | string ToLowerCase(string str); 350 | 351 | string replace(string value, string const & search, string const & replace); 352 | 353 | bool endsWith (std::string const &fullString, std::string const &ending); 354 | 355 | string itoa(int value); 356 | } 357 | 358 | namespace numutils 359 | { 360 | 361 | int CountDigits(int num); 362 | 363 | string EnoughSpaces(int MAX, int num); 364 | 365 | bool is_number(const std::string& s); 366 | 367 | } 368 | 369 | namespace oututils 370 | { 371 | 372 | void RepeatOutputs(int num_spaces, ostream & out = cout, string sp = " "); 373 | 374 | } 375 | 376 | 377 | namespace statutils 378 | { 379 | // adjusted rand index 380 | double ari(vector I1, vector I2); 381 | 382 | double ari(vector > I1, vector > I2); 383 | 384 | void test_ari(); 385 | 386 | template 387 | T sum(vector & vec) 388 | { 389 | T m = 0; 390 | for(int i=0; i 398 | T mean(vector & vec) 399 | { 400 | T m = 0; 401 | for(int i=0; i 410 | T var(vector & vec) 411 | { 412 | T m = mean(vec); 413 | T var = 0; 414 | for(int i=0; i getdir (string dir); 426 | 427 | bool file_exists (const std::string& name); 428 | } 429 | #endif // UTILS_H_INCLUDED 430 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST-GBGC/SRC/utils.h: 
-------------------------------------------------------------------------------- 1 | #ifndef UTILS_H_INCLUDED 2 | #define UTILS_H_INCLUDED 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | 14 | namespace ctnutils 15 | { 16 | 17 | //template 18 | //vector< vector > sortbySizeDec(vector< vector > v) 19 | //{ 20 | // for(int i=0; i 33 | struct IdxCompare 34 | { 35 | const std::vector& target; 36 | 37 | IdxCompare(const std::vector& target): target(target) {} 38 | 39 | bool operator()(size_t a, size_t b) const { return target[a].size() > target[b].size(); } 40 | }; 41 | 42 | template 43 | vector sort_indexes(const vector &v, int K) { 44 | 45 | 46 | if(K<0) K =v.size(); 47 | // initialize original index locations 48 | vector idx(K); 49 | for (size_t i = 0; i != idx.size(); ++i) idx[i] = i; 50 | 51 | // sort indexes based on comparing values in v 52 | sort(idx.begin(), idx.end(), IdxCompare(v)); 53 | 54 | return idx; 55 | } 56 | 57 | 58 | // template 59 | // vector sort_indexes(const vector &v, int K) { 60 | // 61 | // if(K<0) K =v.size(); 62 | // // initialize original index locations 63 | // vector idx(K); 64 | // for (size_t i = 0; i != idx.size(); ++i) idx[i] = i; 65 | // 66 | // // sort indexes based on comparing values in v 67 | // sort(idx.begin(), idx.end(), 68 | // [&v](size_t i1, size_t i2) {return v[i1].size() > v[i2].size();}); 69 | // 70 | // return idx; 71 | // } 72 | 73 | 74 | 75 | template 76 | vector merge(const vector & v1, const vector & v2) 77 | { 78 | vector c; 79 | c.insert(c.end(), v1.begin(), v1.end()); 80 | c.insert(c.end(), v2.begin(), v2.end()); 81 | return c; 82 | } 83 | 84 | template 85 | vector merge(const vector & v1, const T & v2) 86 | { 87 | vector c; 88 | c.insert(c.end(), v1.begin(), v1.end()); 89 | c.push_back(v2); 90 | return c; 91 | } 92 | 93 | vector GenGrayArray(int n); 94 | 95 | template 96 | vector< vector > subsets(const vector & s) 97 | { 98 | int len = 
s.size(); 99 | vector arr = ctnutils::GenGrayArray(len); 100 | 101 | vector< vector > results; 102 | for(int i=0; i result; 105 | for(int j=0; j 120 | vector SetDiff(const vector & A, const vector & B) 121 | { 122 | vector C = A; 123 | for(int i=0; i 142 | void DispVector(const vector & vec, string sep = "\t") 143 | { 144 | if (vec.size()==0) 145 | { 146 | cout << "(empty vector)"; 147 | return; 148 | } 149 | 150 | cout << "("; 151 | for(unsigned i=0; i 157 | void DispVector(const vector & vec, const vector & ref, string sep = "\t") 158 | { 159 | if (vec.size()==0) 160 | { 161 | cout << "(empty vector)"; 162 | return; 163 | } 164 | 165 | cout << "("; 166 | for(unsigned i=0; i 172 | void DispVector(T *vec, int size, string sep = "\t") 173 | { 174 | if (size==0) 175 | { 176 | cout << "(empty vector)"; 177 | return; 178 | } 179 | 180 | cout << "("; 181 | for(int i=0; i 187 | void DispMatrix(vector > & matrix) 188 | { 189 | for(unsigned i=0; i 202 | void DispMatrix(T **matrix, int num_row, int num_col) 203 | { 204 | for(int i=0; i 214 | void FillVector(T *vec, int n, T elem) 215 | { 216 | for(int i=0;i 221 | void FillVector(vector & vec, int n, T elem) 222 | { 223 | for(int i=0;i 228 | bool IsVectorSame(const vector & v1, const vector & v2) 229 | { 230 | if (v1.size()!=v2.size()) 231 | return false; 232 | for(unsigned i=0; i 239 | bool IsVectorSame(T *v1, T *v2, int n) 240 | { 241 | for(int i=0; i 248 | bool Contains(const vector & vec, T elem) 249 | { 250 | for(int i=0; i 260 | bool Contains(const vector & vec, const vector & vec2) 261 | { 262 | for(int i=0; i 270 | struct FindReturn 271 | { 272 | int pos; 273 | T val; 274 | }; 275 | 276 | template 277 | FindReturn Find(const vector & vec, T t) 278 | { 279 | FindReturn fr; 280 | fr.pos = -1; 281 | for(int i=0; i 293 | FindReturn FindMax(const vector & vec, T min) 294 | { 295 | FindReturn fr; 296 | fr.pos = -1; 297 | for(int i=0; i min) 300 | { 301 | fr.pos = i; 302 | fr.val = vec[i]; 303 | min = vec[i]; 304 | } 305 | 
} 306 | return fr; 307 | } 308 | 309 | template 310 | FindReturn FindMin(const vector & vec, T max) 311 | { 312 | FindReturn fr; 313 | fr.pos = -1; 314 | for(int i=0; i &split(const string &s, char delim, vector &elems); 344 | vector split(const string &s, char delim); 345 | 346 | 347 | string ToUpperCase(string str); 348 | 349 | string ToLowerCase(string str); 350 | 351 | string replace(string value, string const & search, string const & replace); 352 | 353 | bool endsWith (std::string const &fullString, std::string const &ending); 354 | 355 | string itoa(int value); 356 | } 357 | 358 | namespace numutils 359 | { 360 | 361 | int CountDigits(int num); 362 | 363 | string EnoughSpaces(int MAX, int num); 364 | 365 | bool is_number(const std::string& s); 366 | 367 | } 368 | 369 | namespace oututils 370 | { 371 | 372 | void RepeatOutputs(int num_spaces, ostream & out = cout, string sp = " "); 373 | 374 | } 375 | 376 | 377 | namespace statutils 378 | { 379 | // adjusted rand index 380 | double ari(vector I1, vector I2); 381 | 382 | double ari(vector > I1, vector > I2); 383 | 384 | void test_ari(); 385 | 386 | template 387 | T sum(vector & vec) 388 | { 389 | T m = 0; 390 | for(int i=0; i 398 | T mean(vector & vec) 399 | { 400 | T m = 0; 401 | for(int i=0; i 410 | T var(vector & vec) 411 | { 412 | T m = mean(vec); 413 | T var = 0; 414 | for(int i=0; i getdir (string dir); 426 | 427 | bool file_exists (const std::string& name); 428 | } 429 | #endif // UTILS_H_INCLUDED 430 | -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc_lib/html.py: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | # Functions to generate html files for easy visualization of 3 | # phyloacc datasets 4 | # Gregg Thomas 5 | ############################################################################# 6 | 7 | import os 8 | import 
phyloacc_lib.core as PC 9 | import phyloacc_lib.templates as TEMPLATES 10 | import phyloacc_lib.templates_post as TEMPLATES_POST 11 | 12 | ############################################################################# 13 | 14 | def writeHTML(globs): 15 | step = "Writing HTML summary file"; 16 | step_start_time = PC.report_step(globs, step, False, "In progress..."); 17 | # Status updated 18 | 19 | # if not os.path.isdir(globs['html-dir']): 20 | # shutil.copytree( os.path.join(globs['script-dir'], "html"), globs['html-dir'] ); 21 | # Copy the associated html files (stylesheets, images) from the provided template folders 22 | 23 | if globs['tree-data-type'] == 'class': 24 | num_spec = globs['st'].num_tips; 25 | tips = globs['st'].tips; 26 | internals = globs['st'].internals; 27 | elif globs['tree-data-type'] == 'func': 28 | num_spec = len(globs['tips']); 29 | tips = globs['tips']; 30 | internals = globs['internals']; 31 | 32 | if globs['run-mode'] == 'adaptive': 33 | comment_start = ""; 34 | comment_end = ""; 35 | else: 36 | comment_start = ""; 38 | # Some HTML blocks will be commented out depending on the run mode 39 | 40 | if globs['filter-alns']: 41 | filter_comment_start = ""; 42 | filter_comment_end = ""; 43 | else: 44 | filter_comment_start = ""; 46 | 47 | if globs['theta']: 48 | theta_comment_start = ""; 49 | theta_comment_end = ""; 50 | else: 51 | theta_comment_start = ""; 53 | 54 | if globs['coal-tree-file']: 55 | coal_tree_comment_start = ""; 56 | coal_tree_comment_end = ""; 57 | else: 58 | coal_tree_comment_start = ""; 60 | 61 | 62 | if globs['batch']: 63 | batch_comment_start = ""; 65 | else: 66 | batch_comment_start = ""; 67 | batch_comment_end = ""; 68 | 69 | with open(globs['html-file'], "w") as htmlfile: 70 | htmlfile.write(TEMPLATES.htmlSummary().format( 71 | # mod_file=os.path.abspath(globs['mod-file']), 72 | run_name=globs['run-name'], 73 | run_time=globs['startdatetimenice'], 74 | host_name=os.uname()[1], 75 | script_call=globs['call'], 76 | 
num_aln=str(globs['num-loci']), 77 | num_no_inf_loci=str(len(globs['no-inf-sites-loci'])), 78 | filter_comment_start=filter_comment_start, 79 | filter_comment_end=filter_comment_end, 80 | num_filtered_loci=str(globs['filtered-loci']), 81 | num_st_loci=str(globs['st-loci']), 82 | num_gt_loci=str(globs['gt-loci']), 83 | num_batches=str(globs['num-batches']), 84 | batch_size=str(globs['batch-size']), 85 | procs_per_batch=str(globs['procs-per-job']), 86 | num_st_batches=str(len(globs['st-batches'])), 87 | num_gt_batches=str(len(globs['gt-batches'])), 88 | num_jobs=str(globs['num-jobs']), 89 | # num_spec=str(len(globs['st'].tips)), 90 | # num_spec=str(len(globs['tips'])), 91 | num_spec=str(num_spec), 92 | num_targets=str(len(globs['groups']['targets'])), 93 | num_conserved=str(len(globs['groups']['conserved'])), 94 | num_outgroups=str(len(globs['groups']['outgroup'])), 95 | log_file=os.path.basename(globs['logfilename']), 96 | aln_stats_file=os.path.basename(globs['alnstatsfile']), 97 | batch_comment_start=batch_comment_start, 98 | batch_comment_end=batch_comment_end, 99 | snakemake_cmd=globs['smk-cmd'], 100 | theta_comment_start=theta_comment_start, 101 | theta_comment_end=theta_comment_end, 102 | coal_tree_comment_start=coal_tree_comment_start, 103 | coal_tree_file=globs['coal-tree-file'], 104 | coal_tree_comment_end=coal_tree_comment_end, 105 | input_tree_plot=os.path.join("plots", globs['input-tree-plot-file']), 106 | avg_aln_len=str(round(globs['avg-aln-len'], 3)), 107 | median_aln_len=str(round(globs['med-aln-len'], 3)), 108 | avg_seq_len_nogap=str(round(globs['avg-nogap-seq-len'], 3)), 109 | med_seq_len_nogap=str(round(globs['med-nogap-seq-len'], 3)), 110 | aln_len_hist=os.path.join("plots", globs['aln-len-plot-file']), 111 | seq_len_hist=os.path.join("plots", globs['seq-len-plot-file']), 112 | informative_sites_hist=os.path.join("plots", globs['inf-sites-plot-file']), 113 | informative_sites_frac_hist=os.path.join("plots", globs['inf-sites-frac-plot-file']), 114 
| variable_informative_sites_plot=os.path.join("plots", globs['var-inf-sites-plot-file']), 115 | comment_start=comment_start, 116 | comment_end=comment_end, 117 | avg_scf_hist=os.path.join("plots", globs['avg-scf-hist-file']), 118 | low_scf_hist=os.path.join("plots", globs['low-scf-hist-file']), 119 | scf_tree_plot=os.path.join("plots", globs['scf-tree-plot-file']), 120 | bl_scf_plot=os.path.join("plots", globs['bl-scf-plot-file']), 121 | date_time=PC.getFooterDateTime(), 122 | )) 123 | # Write the HTML summary file using the template 124 | 125 | step_start_time = PC.report_step(globs, step, step_start_time, "Success"); 126 | globs['html-summary-written'] = True; 127 | # Status update 128 | 129 | return globs; 130 | 131 | ############################################################################# 132 | 133 | def writeHTMLPost(globs): 134 | step = "Writing HTML summary file"; 135 | step_start_time = PC.report_step(globs, step, False, "In progress..."); 136 | # Status updated 137 | 138 | # if not os.path.isdir(globs['html-dir']): 139 | # shutil.copytree( os.path.join(globs['script-dir'], "html"), globs['html-dir'] ); 140 | # Copy the associated html files (stylesheets, images) from the provided template folders 141 | 142 | # if globs['run-mode'] == 'adaptive': 143 | # comment_start = ""; 144 | # comment_end = ""; 145 | # else: 146 | # comment_start = ""; 148 | # # Some HTML blocks will be commented out depending on the run mode 149 | 150 | # if globs['theta']: 151 | # theta_comment_start = ""; 152 | # theta_comment_end = ""; 153 | # else: 154 | # theta_comment_start = ""; 156 | 157 | # if globs['coal-tree-file']: 158 | # coal_tree_comment_start = ""; 159 | # coal_tree_comment_end = ""; 160 | # else: 161 | # coal_tree_comment_start = ""; 163 | 164 | comment_start = "" 166 | 167 | if globs['incomplete-batches']: 168 | batch_comment_start = ""; 169 | batch_comment_end = ""; 170 | else: 171 | batch_comment_start = ""; 173 | 174 | placeholder = ""; 175 | 176 | with 
open(globs['html-file'], "w") as htmlfile: 177 | htmlfile.write(TEMPLATES_POST.htmlSummary().format( 178 | # mod_file=os.path.abspath(globs['mod-file']), 179 | run_name=globs['run-name'], 180 | run_time=globs['startdatetimenice'], 181 | host_name=os.uname()[1], 182 | script_call=globs['call'], 183 | num_batches_complete_st=str(len(globs['complete-batches-st'])), 184 | num_batches_complete_gt=str(len(globs['complete-batches-gt'])), 185 | num_batches_incomplete_st=str(len(globs['incomplete-batches-st'])), 186 | num_batches_incomplete_gt=str(len(globs['incomplete-batches-gt'])), 187 | batch_size=str(globs['batch-size']), 188 | procs_per_batch=str(globs['procs-per-batch']), 189 | avg_runtime=str(round(PC.mean(globs['batch-runtimes']))), 190 | 191 | batch_comment_start=batch_comment_start, 192 | incomplete_batches=", ".join(globs['incomplete-batches']), 193 | batch_comment_end=batch_comment_end, 194 | 195 | bf1_cutoff=str(globs['bf1-cutoff']), 196 | bf1_loci=str(len(globs['bf1-loci'])), 197 | bf2_cutoff=str(globs['bf2-cutoff']), 198 | bf2_loci=str(len(globs['bf2-loci'])), 199 | bf3_cutoff=str(globs['bf3-cutoff']), 200 | bf3_loci=str(len(globs['bf3-loci'])), 201 | 202 | total_loci=str(len(globs['locus-stats']['elem_lik'])), 203 | target_loci=str(len(globs['m1-loci'])), 204 | full_loci=str(len(globs['m2-loci'])), 205 | 206 | log_file=os.path.basename(globs['logfilename']), 207 | results_folder=os.path.basename(globs['outdir']) + "/", 208 | 209 | med_bf1=str(round(PC.median(globs['bf1']), 3)), 210 | med_bf2=str(round(PC.median(globs['bf2']), 3)), 211 | med_bf3=str(round(PC.median(globs['bf3']), 3)), 212 | 213 | bf_boxplot=os.path.join("plots", globs['bf-dist-file']), 214 | #bf1_hist=os.path.join("plots", globs['bf1-dist-file']), 215 | #bf2_hist=os.path.join("plots", globs['bf2-dist-file']), 216 | bf1_bf2_plot=os.path.join("plots", globs['bf1-bf2-plot-file']), 217 | 218 | avg_m2_locus=str(round(PC.mean(list(globs['m2-per-locus'].values())))), 219 | 
med_m2_locus=str(round(PC.median(list(globs['m2-per-locus'].values())))), 220 | 221 | m2_dist=os.path.join("plots", globs['m2-locus-dist-file']), 222 | 223 | avg_m2_lineages=str(round(PC.mean(list(globs['m2-per-lineage'].values())))), 224 | med_m2_lineages=str(round(PC.median(list(globs['m2-per-lineage'].values())))), 225 | 226 | m2_counts=os.path.join("plots", globs['m2-lineage-dist-file']), 227 | 228 | comment_start=comment_start, 229 | comment_end=comment_end, 230 | 231 | num_spec=placeholder, 232 | num_targets=placeholder, 233 | num_conserved=placeholder, 234 | num_outgroups=placeholder, 235 | input_tree_plot=placeholder, 236 | 237 | date_time=PC.getFooterDateTime(), 238 | )) 239 | # Write the HTML summary file using the template 240 | 241 | step_start_time = PC.report_step(globs, step, step_start_time, "Success"); 242 | globs['html-summary-written'] = True; 243 | # Status update 244 | 245 | return globs; 246 | 247 | ############################################################################# -------------------------------------------------------------------------------- /src/PhyloAcc-interface/phyloacc_lib/params.py: -------------------------------------------------------------------------------- 1 | ############################################################################# 2 | # This file holds some global variables for some of the input options. 3 | # These global parameters should be read only -- they are not modified anywhere 4 | # else in the code except when reading the input options. 
class StrictDict(dict):
    """A dict that refuses item assignment to keys it does not already hold.

    Used for the global params dict so that other parts of the code cannot
    add new keys to it, which helps limit its scope. Only ``d[key] = value``
    is guarded; no other dict method is overridden here.
    https://stackoverflow.com/questions/32258706/how-to-prevent-key-creation-through-dkey-val
    """

    def __setitem__(self, key, value):
        """Assign ``value`` to an existing ``key``; raise KeyError for a new key."""
        if key not in self:
            raise KeyError("{} is not a legal key of this StrictDict".format(repr(key)))
        dict.__setitem__(self, key, value)

#############################################################################

def init():
    """Build and return the global parameter dictionary with its default values.

    Steps:
      1. Create the defaults dict (start times, paths, options, internal flags).
      2. Put the run's start date/time into the error log file name.
      3. Fill in the defaults for PhyloAcc options that are not interface options.
      4. Add one entry per line of the ``info.yaml`` file located next to this
         module; each line is split on a colon followed by a tab.
      5. Wrap the result in a StrictDict so no further keys can be assigned.

    Returns:
        StrictDict: the populated global params.

    Raises:
        OSError: if ``info.yaml`` cannot be opened.
        IndexError: if a non-blank line of ``info.yaml`` lacks the ":\\t" separator.
    """
    globs_init = {

        # Meta info
        'starttime' : timeit.default_timer(),
        'startdatetime' : PC.getOutTime(),
        'startdatetimenice' : PC.getRunTimeNice(),

        # System info
        'pyver' : ".".join(map(str, sys.version_info[:3])),

        # Script call info
        'call' : "",
        'script-dir' : "",

        # Input with concatenated alignment and partitions by bed file
        'aln-file' : False,
        'bed-file' : False,
        'id-file' : False,

        # Input by a directory full of individual alignments
        'aln-dir' : False,

        # Input rate and tree file from PHAST
        'mod-file' : False,

        # The type of compression used for input sequence files
        'seq-compression' : 'none',
        'bed-compression' : 'none',

        # I/O options
        'outdir' : '',
        'run-name' : 'phyloacc',
        'logfilename' : 'phyloacc.errlog',
        'alnstatsfile' : 'phyloacc-aln-stats.csv',
        'no-inf-loci-file' : 'no-informative-sites-loci.txt',
        'scfstatsfile' : 'phyloacc-scf-stats.csv',
        'scftreefile' : 'phyloacc-scf.tree',
        'logdir' : '',
        'tmpdir' : 'System default.',
        'overwrite' : False,
        'inf-frac-theta' : 0.2,

        # The Dollo assumption option
        'dollo' : False,

        # Run mode option
        'run-mode' : 'st',

        # Option to output plots/html
        'plot' : True,

        # Theta estimation
        'theta' : False,
        'coal-tree-input' : False,
        'coal-tree-file-unlabeled' : False,
        'coal-tree-file' : False,
        'label-coal-tree-script' : False,
        'coal-tree-str' : False,

        # Tree variables
        'tree-string' : False,
        'st' : False,

        'tree-dict' : False,
        'labeled-tree' : False,
        'scf-labeled-tree' : False,
        'root' : False,
        'tips' : [],
        'internals' : [],

        # Sequence variables
        'in-seqs' : {},
        'in-bed' : {},
        'locus-ids' : [],
        'alns' : {},
        'aln-stats' : {},
        'num-loci' : False,

        # Alignment summary stats
        'avg-aln-len' : "NA",
        'med-aln-len' : "NA",
        'avg-nogap-seq-len' : "NA",
        'med-nogap-seq-len' : "NA",
        'no-inf-sites-loci' : [],
        'gappy-loci' : [],
        'filter-alns' : False,

        # Phylo options
        'input-groups' : { 'targets' : [], 'outgroup' : [], 'conserved' : [] },
        'groups' : { 'targets' : [], 'outgroup' : [], 'conserved' : [] },

        # Tracks sCF values BY NODE over all loci
        'scf' : {},

        # All quartets for all nodes/branches in the species tree.
        # A quartet is a tuple of tuples:
        # ((split1-spec1, split1-spec2), (split2-spec1, split2-spec2))
        'quartets' : {},

        # sCF params
        'min-scf' : 0.5,
        'low-scf-branch-prop' : False,

        # MCMC options
        'burnin' : 500,
        'mcmc' : 1000,
        'thin' : 1,
        'chain' : 1,

        # All other PhyloAcc options as a list
        #'phyloacc-defaults-file' : "phyloacc-opts.txt",
        'phyloacc-defaults' : {},
        'phyloacc-opts' : [],

        # Dependency paths
        'phyloacc' : 'PhyloAcc-ST',
        #'phyloacc' : "/n/home07/gthomas/projects/PhyloAcc-interface/PhyloAcc/PhyloAcc",
        #'phyloacc-gbgc' : 'PhyloAcc/V2_GBGC/PhyloAcc_gBGC',
        #'phyloacc-gt' : 'PhyloAcc-GT2/SRC/PhyloAcc-GT_piQ',
        #'phyloacc-gt' : 'testSRC_debug4_2tree/PhyloAcc-GT_piQ',
        'phyloacc-gt' : 'PhyloAcc-GT',
        #'phyloacc-gt' : "/n/home07/gthomas/anaconda3/envs/phyloacc-conda/bin/PhyloAcc-GT",
        'coal-cmd' : 'java -jar astral.jar',
        'iqtree-path' : 'iqtree',

        # Whether or not to write the batch files
        'batch' : True,

        # Batch variables
        'batch-size' : 50,
        'num-batches' : 0,
        'st-loci' : 0,
        'gt-loci' : 0,
        'st-batches' : [],
        'gt-batches' : [],
        'filtered-loci' : [],

        # Number of procs for this script to use
        'num-procs' : 1,

        # Number of jobs/procs for PhyloAcc to use
        'num-jobs' : 1000,
        'procs-per-job' : 1,
        'total-procs' : 1,

        # Cluster options
        'partition' : False,
        'num-nodes' : 1,
        'mem' : 4,
        'time' : 60,
        'local': False,

        # Process pools
        'aln-pool' : False,
        'scf-pool' : False,

        # Job files
        'smk' : False,
        'smk-config' : False,

        # ID files required for now due to Segmentation fault in PhyloAcc without them
        'id-flag' : True,

        # Job directories
        'iqtree' : '',
        'astral' : '',
        'job-dir' : '',
        'job-alns' : '',
        'job-cfgs' : '',
        'job-bed' : '',
        'job-ids' : '',
        'job-smk' : '',
        'job-out' : '',
        'profile-dir' : False,
        'profile-file' : False,

        # Plot and HTML summary files
        'plot-dir' : '',
        'input-tree-plot-file' : 'input-species-tree.png',
        'aln-len-plot-file' : 'aln-len-hist.png',
        'seq-len-plot-file' : 'avg-seq-len-nogap-hist.png',
        'inf-sites-plot-file' : 'informative-sites-hist.png',
        'inf-sites-frac-plot-file' : 'informative-sites-frac-hist.png',
        'var-inf-sites-plot-file' : 'variable-informative-sites.png',
        'avg-scf-hist-file' : 'avg-scf-per-locus.png',
        'low-scf-hist-file' : 'perc-low-scf-branchers-per-locus.png',
        'scf-tree-plot-file' : 'scf-species-tree.png',
        'bl-scf-plot-file' : 'bl-scf.png',
        'html-dir' : '',
        'html-file' : '',

        # The final snakemake command to report
        #'status-script' : 'slurm_status.py',
        'smk-cmd' : '',

        # An example command to run a single batch
        'test-cmd-flag' : False,
        'test-cmd' : '',

        # Other user options
        'label-tree' : False,
        'info' : False,
        'dryrun' : False,
        'quiet' : False,
        'options-flag': False,
        'version-flag': False,

        # Internal stuff
        'aln-skip-chars' : ["-", "N"],
        'aln-stats-written' : False,
        'scf-stats-written' : False,
        'scf-tree-written' : False,
        'html-summary-written' : False,

        'warnings' : 0,
        'tree-data-type' : "class",
        'scf-site-type' : "loop",
        'count-disco-sites' : True,
        'pad' : 82,
        'endprog' : False,
        'exit-code' : 0,
        'log-v' : 1,
        'stats' : True,
        'progstarttime' : 0,
        'stepstarttime' : 0,
        'pids' : "",
        'psutil' : False,
        'qstats' : False,
        'norun' : False,
        'no-phyloacc' : False,
        'debug-tree' : False,
        'debug-aln' : False,
        'debug' : False,
        'nolog' : False,
    }

    # Add the runtime to the error log file name.
    globs_init['logfilename'] = "phyloacc-" + globs_init['startdatetime'] + ".errlog"

    # The default values for PhyloAcc options NOT included in the interface options
    globs_init['phyloacc-defaults'] = {
        'SIMULATE' : { 'type' : "BOOL", 'default' : "FALSE" },
        'SEED' : { 'type' : "POS_INT", 'default' : "1" },
        'INIT_CONSERVE_RATE' : { 'type' : "POS_FLOAT", 'default' : "0.5" },
        'INIT_ACC_RATE' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'CONSERVE_PRIOR_A' : { 'type' : "POS_FLOAT", 'default' : "5" },
        'CONSERVE_PRIOR_B' : { 'type' : "POS_FLOAT", 'default' : "0.04" },
        'ACCE_PRIOR_A' : { 'type' : "POS_FLOAT", 'default' : "10" },
        'ACCE_PRIOR_B' : { 'type' : "POS_FLOAT", 'default' : "0.2" },
        'ROPT' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'CUB' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'NLB' : { 'type' : "POS_FLOAT", 'default' : "0.6" },
        'THIN' : { 'type' : "POS_INT", 'default' : "1" },
        'INIT_LRATE' : { 'type' : "POS_FLOAT", 'default' : "0.8" },
        'INIT_LRATE2' : { 'type' : "POS_FLOAT", 'default' : "0.1" },
        'INIT_GRATE' : { 'type' : "POS_FLOAT", 'default' : "0.5" },
        'HYPER_LRATE_A' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'HYPER_LRATE_B' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'HYPER_LRATE2_A' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'HYPER_LRATE2_B' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'HYPER_GRATE_A' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'HYPER_GRATE_B' : { 'type' : "POS_FLOAT", 'default' : "1" },
        'WL' : { 'type' : "BOOL", 'default' : "FALSE" },
        'BL_WL' : { 'type' : "POS_INT", 'default' : "15" },
        'CONSERVE_PROP' : { 'type' : "PERC", 'default' : "0.8" },
        'CONSERVE_RATE' : { 'type' : "PROP", 'default' : "NA" },
        'GAP_PROP' : { 'type' : "PERC", 'default' : "0.8" },
        'CONSTOMIS' : { 'type' : "POS_FLOAT", 'default' : "0.5" },
        'BR_SAMPLE_THRESHOLD' : { 'type' : "POS_FLOAT", 'default' : "10" },
        'GAPCHAR' : { 'type' : "CHAR", 'default' : "-" },
        'PRUNE_TREE' : { 'type' : "BOOL", 'default' : "FALSE" },
        'TRIM_GAP_PERCENT' : { 'type' : "PERC", 'default' : "0.9" },
        'MIN_LEN' : { 'type' : "POS_INT", 'default' : "50" },
        'INDEL' : { 'type' : "POS_INT", 'default' : "0" },
        'SAMPLE_HYPER' : { 'type' : "POS_INT", 'default' : "0" },
        # NOTE(review): typed POS_INT but the default is "FALSE" -- confirm which is intended
        'VERBOSE' : { 'type' : "POS_INT", 'default' : "FALSE" },
        'NUM_THREAD' : { 'type' : "POS_INT", 'default' : "1" },
        'THETA_CUTOFF' : { 'type' : "POS_FLOAT", 'default' : "NA" }
    }

    # Reads meta info (version, urls, etc.) from the info.yaml file.
    # The file is opened in a with-block so the handle is closed even if a line
    # fails to parse, and blank lines are skipped rather than raising IndexError.
    info_path = os.path.join(os.path.dirname(__file__), "info.yaml")
    with open(info_path, "r") as info_file:
        for line in info_file:
            fields = line.strip().split(":\t")
            if fields == [""]:
                continue
            globs_init[fields[0]] = fields[1]

    # for line in open(globs_init['phyloacc-defaults-file']):
    #     line = line.strip().split("\t");
    #     globs_init['phyloacc-defaults'][line[0]] = {'type' : line[2], 'default' : line[1]};

    # Restrict the dict from having keys added to it after this
    globs = StrictDict(globs_init)

    return globs

#############################################################################
type 18 | 19 | step = "Reading input " + tree_type + " tree"; 20 | step_start_time = CORE.report_step(globs, step, False, "In progress..."); 21 | # Status update 22 | 23 | try: 24 | if tree_type == "species": 25 | for line in open(globs['mod-file']): 26 | if line.startswith("TREE: "): 27 | tree_str = line.strip().replace("TREE: ", ""); 28 | # Read the tree string from the MOD file 29 | 30 | elif tree_type == "from-log": 31 | for line in open(globs['interface-logfile']): 32 | if line.startswith("# Tree read from mod file:"): 33 | tree_str = line.strip().replace("# Tree read from mod file:", ""); 34 | while tree_str[0] == " ": 35 | tree_str = tree_str[1:]; 36 | # Read the tree string from the interface log file 37 | 38 | if line.startswith("# Loci per batch (-batch)"): 39 | batch_size = line.strip().replace("# Loci per batch (-batch)", ""); 40 | batch_size = list(filter(None, batch_size.split(" ")))[0]; 41 | 42 | if line.startswith("# Processes per job (-p)"): 43 | procs_per_batch = line.strip().replace("# Processes per job (-p)", ""); 44 | procs_per_batch = list(filter(None, procs_per_batch.split(" ")))[0]; 45 | 46 | elif tree_type == "coalescent": 47 | tree_str = open(globs['coal-tree-file'], "r").read(); 48 | 49 | else: 50 | CORE.errorOut("IO1", "INTERNAL: invalid tree type to read", globs); 51 | 52 | if globs['tree-data-type'] == 'class': 53 | tree = TREE.Tree(tree_str); 54 | elif globs['tree-data-type'] == 'func': 55 | tree_dict, tree, root = TREEF.treeParse(tree_str); 56 | # Parse the tree string to the tree class 57 | except: 58 | print("\n\n"); 59 | CORE.errorOut("IO2", "Error reading tree from file!", globs); 60 | # Read the tree 61 | 62 | if globs['tree-data-type'] == 'class': 63 | root_str = "unrooted"; 64 | add_root_node = 0; 65 | if tree.rooted: 66 | root_str = "rooted"; 67 | add_root_node = 1; 68 | # Node counting for log 69 | 70 | step_start_time = CORE.report_step(globs, step, step_start_time, "Success: " + root_str + " tree read"); 71 | 
def readSpeciesGroups(globs):
    """Resolve the user-supplied group labels into lists of tip labels.

    Every label in globs['input-groups'] (targets, outgroup, conserved) is
    looked up in the species tree held in globs['st']:
      * a label equal to a tip is added to globs['groups'][group] as is;
      * a label equal to an internal node (its node name or its stored label)
        is replaced by the clade returned by getClade() for that node.
    If no conserved labels were given, every tip not in targets or outgroup
    becomes conserved. Finally each tip must appear exactly once across all
    groups.

    Both tree representations are supported, selected by
    globs['tree-data-type']: 'class' (tree object) or 'func' (dict keyed by
    node plus globs['tips']).

    Args:
        globs: the global params dict; globs['groups'] is modified in place.

    Returns:
        The same globs object.

    Errors are reported through CORE.errorOut with code "IO3" (presumably
    this terminates the program -- confirm against CORE).
    """
    step = "Reading species/branch groups"
    step_start_time = CORE.report_step(globs, step, False, "In progress...")
    # Status update

    tree_type = globs['tree-data-type']
    if tree_type == 'class':
        nodes = globs['st'].nodes
        tips = globs['st'].tips
    elif tree_type == 'func':
        nodes = list(globs['st'].keys())
        tips = globs['tips']

    for group in globs['groups']:
        # Check every group

        for label in globs['input-groups'][group]:
            # Check every label in the current group for presence in the tree
            # and whether it is an internal label

            label_found = False
            # A flag that lets us know if the label was found in the tree

            for node in nodes:
                # For every node read in the tree

                if node in tips:
                    # A tip label is added to the group directly
                    if label == node:
                        label_found = True
                        globs['groups'][group].append(label)
                        break

                elif tree_type == 'class':
                    # An internal label (from the tree input or assigned while
                    # parsing) expands to all tips descending from that node
                    if label in [node, globs['st'].label[node]]:
                        label_found = True
                        globs['groups'][group] += globs['st'].getClade(node)
                        break

                elif tree_type == 'func':
                    # Same as above for the dict tree; index 3 of a node entry
                    # is compared as its label. This branch previously used an
                    # undefined name `n` instead of the loop variable.
                    if label in [node, globs['st'][node][3]]:
                        label_found = True
                        globs['groups'][group] += TREEF.getClade(node, globs['st'])
                        break
            ## End node loop

            if not label_found:
                CORE.errorOut("IO3", "The following label was provided in a group but is not present in the tree: " + label, globs)
            # If the current label wasn't found in the tree, exit here with an error
        ## End label loop
    ## End group loop
    # A preliminary check here to make sure all provided labels are actually in the input tree

    ##########

    if not globs['groups']['conserved']:
        globs['groups']['conserved'] = [
            tip for tip in tips
            if tip not in globs['groups']['targets'] and tip not in globs['groups']['outgroup']
        ]
    # If the conserved group isn't specified, add all remaining tip species not in the other groups to it here

    for tip in tips:
        num_in_groups = sum(globs['groups'][group].count(tip) for group in globs['groups'])

        if num_in_groups != 1:
            print("parsed species groups:")
            print("targets: ", globs['groups']['targets'])
            print("outgroup: ", globs['groups']['outgroup'])
            print("conserved: ", globs['groups']['conserved'])
            CORE.errorOut("IO3", "The following tip label does not appear once in only one group: " + tip + " (" + str(num_in_groups) + "). If you provided internal node labels make sure there aren't duplicate labels within that clade provided elsewhere.", globs)
    # Go through every tip label and make sure it appears once and only once in the specified groups

    step_start_time = CORE.report_step(globs, step, step_start_time, "Success")

    return globs

#############################################################################

def writeCF(globs):
    """Write the per-branch sCF statistics table and the sCF-labeled tree.

    Two files are written inside globs['outdir']:
      * globs['scfstatsfile']: commented column descriptions, a header row,
        then one tab-delimited row per internal node. Nodes without an entry
        in globs['scf'] get "NA" in every statistic column.
      * globs['scftreefile']: the species tree string with each node in
        globs['scf'] labeled as <node>_<sCF>.

    Side effects on globs: 'scfstatsfile' and 'scftreefile' are replaced by
    their paths joined to 'outdir', 'scf-labeled-tree' is set to the written
    tree string, and 'scf-stats-written' / 'scf-tree-written' are set to True.

    Args:
        globs: the global params dict.

    Returns:
        The same globs object.
    """
    step = "Writing: " + globs['scfstatsfile']
    step_start_time = CORE.report_step(globs, step, False, "In progress...")
    # Status update

    scf_headers = ["ID", "sCF", "sCF_N", "sDF1", "sDF1_N", "sDF2", "sDF2_N", "sN", "qN"]
    cf_headers = scf_headers + ["label", "length"]

    scf_labels = {}
    # node -> sCF rendered as a string ("NA" when unavailable), used to label the tree

    globs['scfstatsfile'] = os.path.join(globs['outdir'], globs['scfstatsfile'])
    # Add the output directory to the scfstatsfile name

    with open(globs['scfstatsfile'], "w") as outfile:
        outfile.write("# Site concordance factor statistics\n")
        outfile.write("# ID: Branch ID in the species tree\n")

        outfile.write("# sCF: Site concordance factor averaged over qN quartets (=sCF_N/sN)\n")
        outfile.write("# sCF_N: sCF in absolute number of sites\n")
        outfile.write("# sDF1: Site discordance factor for alternative quartet 1 (=sDF1_N/sN)\n")
        outfile.write("# sDF1_N: sDF1 in absolute number of sites\n")
        outfile.write("# sDF2: Site discordance factor for alternative quartet 2 (=sDF2_N/sN)\n")
        outfile.write("# sDF2_N: sDF2 in absolute number of sites\n")
        outfile.write("# sN: Number of decisive sites averaged over 100 quartets\n")
        outfile.write("# qN: Number of quartets sampled on this branch\n")

        outfile.write("# label: The label in the original species tree string\n")
        outfile.write("# length: The branch length in the original species tree string\n")

        ####################

        if globs['tree-data-type'] == 'class':
            internals = globs['st'].internals
        elif globs['tree-data-type'] == 'func':
            internals = globs['internals']

        outfile.write("\t".join(cf_headers) + "\n")
        for node in internals:
            outline = []

            if node in globs['scf']:
                node_scf = globs['scf'][node]

                # Proportions may be the string "NA"; site counts are always rounded
                outline.append("NA" if node_scf['scf'] == "NA" else round(node_scf['scf'], 3))
                outline.append(round(node_scf['concordant-sites'], 2))

                outline.append("NA" if node_scf['sdf1'] == "NA" else round(node_scf['sdf1'], 3))
                outline.append(round(node_scf['disco1-sites'], 2))

                outline.append("NA" if node_scf['sdf2'] == "NA" else round(node_scf['sdf2'], 3))
                outline.append(round(node_scf['disco2-sites'], 2))

                outline.append(round(node_scf['decisive-sites'], 2))

                scf_labels[node] = "NA" if node_scf['scf'] == "NA" else str(round(node_scf['scf'], 3))

                outline.append(node_scf['total-quartets'])

            else:
                outline += ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"]
                scf_labels[node] = "NA"

            if globs['tree-data-type'] == 'class':
                tree_fields = [globs['st'].label[node], globs['st'].bl[node]]
            elif globs['tree-data-type'] == 'func':
                # NOTE(review): readSpeciesGroups compares index 3 as the node label, so
                # this order may be (length, label) under the (label, length) headers -- confirm
                tree_fields = [globs['st'][node][0], globs['st'][node][3]]
            else:
                tree_fields = []

            # Everything is stringified so a non-string label/length cannot break the join
            outline = [node] + [str(v) for v in outline] + [str(v) for v in tree_fields]
            outfile.write("\t".join(outline) + "\n")

    step_start_time = CORE.report_step(globs, step, step_start_time, "Success: sCF stats written")
    globs['scf-stats-written'] = True

    ####################

    step = "Writing: " + globs['scftreefile']
    step_start_time = CORE.report_step(globs, step, False, "In progress...")
    # Status update

    globs['scftreefile'] = os.path.join(globs['outdir'], globs['scftreefile'])
    # Add the output directory to the scftreefile name

    new_labels = {node: scf_labels[node] for node in globs['scf']}

    if globs['tree-data-type'] == 'class':
        globs['scf-labeled-tree'] = globs['st'].addLabel(new_labels, delim="_")

    elif globs['tree-data-type'] == 'func':
        globs['scf-labeled-tree'] = globs['labeled-tree']
        # Get the labeled input tree from treeParse
        globs['scf-labeled-tree'] = TREEF.addBranchLength(globs['scf-labeled-tree'], globs['st'])
        # Add the branch lengths back onto the tree
        for node in globs['scf']:
            node_scf = globs['scf'][node]['scf']
            # sCF can be the string "NA" (handled above for the table); round() on it
            # would raise TypeError, so it is written through unchanged here
            scf_str = "NA" if node_scf == "NA" else str(round(node_scf, 2))
            # NOTE(review): plain substring replacement -- assumes no node name is a
            # substring of another label in the tree string; confirm
            globs['scf-labeled-tree'] = globs['scf-labeled-tree'].replace(node, node + "_" + scf_str)
        # For every node in the tree, add the averaged scf value over all loci to the label

    with open(globs['scftreefile'], "w") as cf_tree_file:
        cf_tree_file.write(globs['scf-labeled-tree'])

    step_start_time = CORE.report_step(globs, step, step_start_time, "Success: sCF tree written")
    globs['scf-tree-written'] = True
    # Status update

    return globs

#############################################################################
36 | 37 | id_file: 38 | # ID file (-i) 39 | # A text file with locus names, one per line, corresponding to regions in bed_file. 40 | # aln_file and bed_file must also be specified. 41 | # DEFAULT: None 42 | 43 | ## OR ## 44 | 45 | aln_dir: 46 | # Alignment directory (-d) 47 | # A directory containing individual alignment files for each locus. 48 | # Expected as FASTA format. 49 | # REQUIRED conditionally: Only specify one of aln_file + bed_file OR aln_dir. 50 | 51 | #################### 52 | # Tree input 53 | #################### 54 | 55 | mod_file: 56 | # Model file (-m) 57 | # A file with a background transition rate matrix and phylogenetic tree with branch lengths as output from phyloFit. 58 | # EXAMPLE FILE: https://github.com/phyloacc/PhyloAcc-test-data/blob/main/ratite.mod 59 | # REQUIRED. 60 | 61 | targets: 62 | # Target species (-t) 63 | # Tip labels in the input tree to be used as target species. 64 | # Enter multiple labels separated by semi-colons (;), e.g. species1;species2;species3. 65 | # REQUIRED. 66 | 67 | conserved: 68 | # Conserved species (-c) 69 | # Tip labels in the input tree to be used as conserved species. 70 | # Enter multiple labels separated by semi-colons (;), e.g. species1;species2;species3. 71 | # DEFAULT: Any species not specified as targets or outgroup will be set as conserved. 72 | 73 | outgroup: 74 | # Outgroup species (-g) 75 | # Tip labels in the input tree to be used as outgroup species. 76 | # Enter multiple labels separated by semi-colons (;), e.g. species1;species2;species3. 77 | # DEFAULT: None. If blank, no species will be set as outgroup. 78 | 79 | coal_tree: 80 | # Coalescent tree (-l) 81 | # A file containing a rooted, Newick formatted tree with the same topology as the species 82 | # tree in the mod file (-m), but with branch lengths in coalescent units. 83 | # Either coal_tree or theta is REQUIRED for the gene tree and adaptive models, not needed for species tree model. 
84 | 85 | theta_flag: 86 | # Estimate population size parameter theta (--theta) 87 | # Set this to add gene tree estimation with IQ-tree and species estimation with ASTRAL for 88 | # estimation of the theta prior. Note that a species tree with branch lengths in units of 89 | # substitutions per site is still required in the mod_file. Also note that this may add substantial 90 | # runtime to the pipeline. 91 | # Either coal_tree or theta is REQUIRED for the gene tree and adaptive models, not needed for species tree model. 92 | 93 | #################### 94 | # PhyloAcc Method 95 | #################### 96 | 97 | run_mode: 98 | # Run mode (-r) 99 | # Determines which version of PhyloAcc will be used. 100 | # OPTIONS: 101 | # gt: use the gene tree model for all loci. 102 | # st: use the species tree model for all loci. 103 | # adaptive: use the gene tree model on loci with many branches with low sCF and species tree model on all other loci. 104 | # DEFAULT: st 105 | 106 | dollo_flag: 107 | # Dollo assumption (--dollo) 108 | # Set this to use the Dollo assumption in the original model (lineages descendant from an 109 | # accelerated branch cannot change state). 110 | # OPTIONS: 111 | # True: use the Dollo assumption. 112 | # False: do not use the Dollo assumption. 113 | # DEFAULT: False 114 | 115 | #################### 116 | # Other input options 117 | #################### 118 | 119 | num_procs: 120 | # Number of processes (-n) 121 | # The number of processes that this script should use. 122 | # DEFAULT: 1. 123 | 124 | #################### 125 | # Output 126 | #################### 127 | 128 | out_dir: 129 | # Output directory (-o) 130 | # Desired output directory. This will be created for you if it doesn't exist. 131 | # DEFAULT: phyloacc-[date]-[time] 132 | 133 | overwrite_flag: 134 | # Overwrite existing output directory (--overwrite) 135 | # Set this to overwrite the output directory if it already exists. 
136 | # Files within that directory may also be overwritten 137 | # OPTIONS: 138 | # True: overwrite existing output directory. 139 | # False: do not overwrite existing output directory. 140 | # DEFAULT: False 141 | 142 | labeltree: 143 | # Label tree (--labeltree) 144 | # Simply reads the tree from the input mod_file (-m), labels the internal nodes, and exits. 145 | # OPTIONS: 146 | # True: label the tree and exit. 147 | # False: do not label the tree and continue (normal run). 148 | # DEFAULT: False 149 | 150 | summarize_flag: 151 | # Summarize input (--summarize) 152 | # Only generate the input summary plots and page. Do not write or overwrite batch job files. 153 | # OPTIONS: 154 | # True: only generate the input summary. 155 | # False: continue to generate/overwrite batch job files (normal run). 156 | # DEFAULT: False 157 | 158 | #################### 159 | # Alignment filtering 160 | #################### 161 | 162 | filter_alns: 163 | # Filter low quality alignments (--filter) 164 | # By default, any locus with at least 1 informative site is retained for PhyloAcc. 165 | # Set this to filter out loci that have at least 50% of sites that are 50% or more gap characters 166 | # OR that have 50% of sequences that are made up of 50% or more gap characters. 167 | # OPTIONS: 168 | # True: filter out low quality alignments. 169 | # False: do not filter out low quality alignments. 170 | # DEFAULT: False 171 | # More alignment filtering options may be available in the future. 172 | 173 | #################### 174 | # SCF options 175 | #################### 176 | 177 | scf_branch_cutoff: 178 | # Low sCF threshold (-scf) 179 | # The value of sCF to consider as 'low' for any given branch in a locus. 180 | # When run_mode (-r) is adaptive, loci with scf_prop (-s) proportion of branches with sCF 181 | # below this threshold will be run with the gene tree model, 182 | # all other loci will be run with the species tree model.
183 | # DEFAULT: 0.5 184 | 185 | scf_prop: 186 | # Proportion of low sCF branches (-s) 187 | # When the run_mode (-r) is adaptive, by default sCFs across all branches will be averaged 188 | # and loci with average sCF below the scf_branch_cutoff (-scf) will be run with the gene 189 | # tree model. Rather than averaging, set this to be the proportion of branches 190 | # that must have sCF below scf_branch_cutoff (-scf) to be run with the gene tree model, 191 | # all other loci will be run with the species tree model. 192 | # OPTIONS: 193 | # DEFAULT: NA; sCF will be averaged across all branches for each locus 194 | # When a number between 0 and 1 is entered, this will be the proportion of branches that must have sCF below scf_branch_cutoff (-scf) 195 | # to be run with the gene tree model. 196 | 197 | #################### 198 | # MCMC options 199 | #################### 200 | 201 | mcmc: 202 | # MCMC steps (-mcmc) 203 | # The total number of steps in the Markov chain. 204 | # DEFAULT: 1000 205 | 206 | burnin: 207 | # MCMC Burnin (-burnin) 208 | # The number of steps to be discarded in the Markov chain as burnin. 209 | # DEFAULT: 500 210 | 211 | chain: 212 | # MCMC chains (-chain) 213 | # The number of MCMC chains to run. 214 | # DEFAULT: 1 215 | 216 | thin: 217 | # MCMC gene tree steps (-thin) 218 | # For the gene tree model, the number of MCMC steps between gene tree sampling. 219 | # The total number of MCMC steps specified with -mcmc will be scaled by this as mcmc*thin 220 | # DEFAULT: 1 221 | 222 | #################### 223 | # Batching 224 | #################### 225 | 226 | batch_size: 227 | # Batch size (-batch) 228 | # The number of loci to run per batch. 229 | # The total number of batches will be (total loci / batch_size). 230 | # DEFAULT: 50 231 | 232 | procs_per_batch: 233 | # Processes per batch (-p) 234 | # The number of processes (cores) to use for each batch of PhyloAcc. 235 | # DEFAULT: 1.
236 | 237 | num_jobs: 238 | # Number of jobs (-j) 239 | # The number of jobs (batches) that Snakemake will submit at once to your cluster 240 | # DEFAULT: 1. 241 | 242 | cluster_part: 243 | # Cluster partition (-part) 244 | # The partition or list of partitions (separated by commas) on which to run PhyloAcc jobs. 245 | # REQUIRED 246 | 247 | cluster_nodes: 248 | # Cluster nodes (-nodes) 249 | # The number of nodes on the specified partition to submit jobs to. 250 | # DEFAULT: 1. 251 | 252 | cluster_mem: 253 | # Cluster memory allotment (-mem) 254 | # The max memory allotted for each job (batch) in GB. 255 | # Just input the number (e.g. 4 for 4GB) 256 | # DEFAULT: 4. 257 | 258 | cluster_time: 259 | # Cluster max walltime (-time) 260 | # The time in hours to give each job. 261 | # DEFAULT: 1. 262 | 263 | local_flag: 264 | # No cluster submission (--local) 265 | # Set this to instead generate a snakemake command to run the pipeline locally. 266 | # Recommended ONLY for testing 267 | # OPTIONS: 268 | # True: generate a snakemake command that runs locally. 269 | # False: generate a snakemake command and profile for a cluster. 270 | # DEFAULT: False 271 | 272 | #################### 273 | # Executable paths 274 | #################### 275 | 276 | phyloacc_st_path: 277 | # PhyloAcc-ST path (-st-path) 278 | # The path to the PhyloAcc-ST binary. 279 | # DEFAULT: PhyloAcc-ST 280 | 281 | phyloacc_gt_path: 282 | # PhyloAcc-GT path (-gt-path) 283 | # The path to the PhyloAcc-GT binary. 284 | # DEFAULT: PhyloAcc-GT 285 | 286 | iqtree_path: 287 | # IQ-Tree path (-iqtree) 288 | # The path to the IQ-Tree executable for making gene trees with theta (--theta). 289 | # DEFAULT: iqtree 290 | 291 | coal_cmd: 292 | # Coalescent command (-coal) 293 | # The path to the program to estimate branch lengths in coalescent units with theta (--theta) 294 | # SUPPORTED PROGRAMS: ASTRAL.
295 | # DEFAULT: java -jar astral.jar 296 | 297 | #################### 298 | # PhyloAcc options 299 | #################### 300 | 301 | phyloacc_opts: 302 | # PhyloAcc options (-phyloacc) 303 | # A catch-all option for other PhyloAcc parameters. 304 | # Enter as a semi-colon delimited list of options: 'OPT1 value;OPT2 value' 305 | # DEFAULT: None 306 | 307 | options_flag: 308 | # Display all PhyloAcc options (--options) 309 | # Set this to print the full list of PhyloAcc options that can be specified with phyloacc_opts (-phyloacc) and exit. 310 | # OPTIONS: 311 | # True: print the options and exit. 312 | # False: do not print the options and continue. 313 | # DEFAULT: False 314 | # Output for this option is as follows: 315 | # OPTION DEFAULT 316 | # ------------------------------ 317 | # SIMULATE FALSE 318 | # SEED 1 319 | # INIT_CONSERVE_RATE 0.5 320 | # INIT_ACC_RATE 1 321 | # CONSERVE_PRIOR_A 5 322 | # CONSERVE_PRIOR_B 0.04 323 | # ACCE_PRIOR_A 10 324 | # ACCE_PRIOR_B 0.2 325 | # ROPT 1 326 | # CUB 1 327 | # NLB 0.6 328 | # THIN 1 329 | # INIT_LRATE 0.8 330 | # INIT_LRATE2 0.1 331 | # INIT_GRATE 0.5 332 | # HYPER_LRATE_A 1 333 | # HYPER_LRATE_B 1 334 | # HYPER_LRATE2_A 1 335 | # HYPER_LRATE2_B 1 336 | # HYPER_GRATE_A 1 337 | # HYPER_GRATE_B 1 338 | # WL FALSE 339 | # BL_WL 15 340 | # CONSERVE_PROP 0.8 341 | # CONSERVE_RATE NA 342 | # GAP_PROP 0.8 343 | # CONSTOMIS 0.5 344 | # BR_SAMPLE_THRESHOLD 10 345 | # GAPCHAR - 346 | # PRUNE_TREE FALSE 347 | # TRIM_GAP_PERCENT 0.9 348 | # MIN_LEN 50 349 | # INDEL 0 350 | # SAMPLE_HYPER 0 351 | # VERBOSE FALSE 352 | # NUM_THREAD 1 353 | # THETA_CUTOFF NA 354 | 355 | #################### 356 | # Misc. options 357 | #################### 358 | 359 | depcheck: 360 | # Dependency check (--depcheck) 361 | # Run this to check that all dependencies are installed at the provided paths. No other options necessary. 362 | # OPTIONS: 363 | # True: run the dependency check and exit. 
364 | # False: do not run the dependency check and continue (normal run). 365 | # DEFAULT: False 366 | 367 | test_cmd_flag: 368 | # Generate test command (--testcmd) 369 | # Set this to also display a command that runs PhyloAcc directly on one of the batches 370 | # OPTIONS: 371 | # True: display the test command at the end of the program. 372 | # False: do not display the test command. 373 | # DEFAULT: False 374 | 375 | 376 | append_log_flag: 377 | # Append to log file (--append) 378 | # Set this to keep the old log file even if --overwrite is specified. 379 | # New log information will instead be appended to the previous log file. 380 | # OPTIONS: 381 | # True: append to the log file. 382 | # False: overwrite the log file. 383 | # DEFAULT: False 384 | 385 | info_flag: 386 | # Print info (--info) 387 | # Print some meta information about the program and exit. No other options required. 388 | # OPTIONS: 389 | # True: print the info and exit. 390 | # False: do not print the info and continue (normal run). 391 | 392 | version_flag: 393 | # Print version information (--version) 394 | # Simply print the version and exit. Can also be called as '-version', '-v', or '--v' 395 | # OPTIONS: 396 | # True: print the version and exit. 397 | # False: do not print the version and continue (normal run). 398 | # DEFAULT: False 399 | 400 | quiet_flag: 401 | # Quiet mode (--quiet) 402 | # Set this flag to prevent PhyloAcc from reporting detailed information about each step. 403 | # OPTIONS: 404 | # True: run in quiet mode. 405 | # False: run in verbose mode. 406 | # DEFAULT: False 407 | 408 | #################### 409 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST-GBGC/SRC/newick.cpp: -------------------------------------------------------------------------------- 1 | // This code, downloaded from Internet, is used to 2 | // parse the phylogenetic tree in Newick format. 
3 | 4 | #include "newick.h" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "utils.h" 11 | 12 | #define APPEND_LEN 256 13 | 14 | using namespace std; 15 | using namespace arma; 16 | 17 | typedef struct newick_child 18 | { 19 | struct newick_node *node; 20 | struct newick_child *next; 21 | } newick_child; 22 | 23 | typedef struct newick_node 24 | { 25 | int id; 26 | char *taxon; 27 | char *seq; 28 | float dist; 29 | int childNum; 30 | struct newick_child *child; 31 | struct newick_node *parent; 32 | } newick_node; 33 | 34 | newick_node* parseTree(char *str); 35 | 36 | typedef struct seqMem 37 | { 38 | void *pos; 39 | struct seqMem *next; 40 | } seqMem; 41 | 42 | seqMem *start; 43 | seqMem *current; 44 | 45 | void seqMemInit() 46 | { 47 | start = NULL; 48 | } 49 | 50 | void* seqMalloc(int size) 51 | { 52 | if (start == NULL) 53 | { 54 | start = (seqMem*)malloc(sizeof(seqMem)); 55 | memset(start, '\0', sizeof(seqMem)); 56 | current = start; 57 | } 58 | else 59 | { 60 | current->next = (seqMem*)malloc(sizeof(seqMem)); 61 | memset(current->next, '\0', sizeof(seqMem)); 62 | current = current->next; 63 | } 64 | current->pos = malloc(size); 65 | memset(current->pos, '\0', size); 66 | return(current->pos); 67 | } 68 | 69 | void seqFreeAll() 70 | { 71 | while (start != NULL) 72 | { 73 | current = start->next; 74 | free(start->pos); 75 | free(start); 76 | start = current; 77 | } 78 | 79 | start = NULL; 80 | } 81 | 82 | void seqFree(void* pos) 83 | { 84 | seqMem *node, *prenode; 85 | node = start; 86 | prenode = start; 87 | while (node != NULL) 88 | { 89 | if (node->pos == pos) 90 | { 91 | free(node->pos); 92 | if (node == start) 93 | { 94 | start = node->next; 95 | } 96 | else if (node->next == NULL) 97 | { 98 | current = prenode; 99 | prenode->next = NULL; 100 | } 101 | else 102 | { 103 | prenode->next = node->next; 104 | } 105 | free(node); 106 | break; 107 | } 108 | 109 | prenode = node; 110 | node = node->next; 111 | } 112 | } 113 | 114 | void 
inputString(char *input, char **ppcStr, int *iLen, int *iMaxLen) 115 | { 116 | int inputLen; 117 | char *temp; 118 | inputLen = strlen(input); 119 | if (inputLen == 0) 120 | { 121 | return; 122 | } 123 | while (*iMaxLen < (*iLen + inputLen) + 1) 124 | { 125 | *iMaxLen = *iMaxLen + APPEND_LEN; 126 | } 127 | temp = (char*)seqMalloc(*iMaxLen); 128 | if (*ppcStr == NULL) 129 | { 130 | memcpy(temp, input, inputLen); 131 | } 132 | else 133 | { 134 | memcpy(temp, *ppcStr, *iLen); 135 | strcat(temp, input); 136 | } 137 | *iLen = *iLen + inputLen; 138 | if (*ppcStr != NULL) 139 | { 140 | seqFree(*ppcStr); 141 | } 142 | *ppcStr = temp; 143 | } 144 | 145 | newick_node* parseTree(char *str) 146 | { 147 | newick_node *node; 148 | newick_child *child; 149 | char *pcCurrent; 150 | char *pcStart; 151 | char *pcColon = NULL; 152 | char cTemp; 153 | int iCount; 154 | 155 | pcStart = str; 156 | 157 | if (*pcStart != '(') 158 | { 159 | // Leaf node. Separate taxon name from distance. If distance not exist then take care of taxon name only 160 | pcCurrent = str; 161 | while (*pcCurrent != '\0') 162 | { 163 | if (*pcCurrent == ':') 164 | { 165 | pcColon = pcCurrent; 166 | } 167 | pcCurrent++; 168 | } 169 | node = (newick_node*)seqMalloc(sizeof(newick_node)); 170 | if (pcColon == NULL) 171 | { 172 | // Taxon only 173 | node->taxon = (char*)seqMalloc(strlen(pcStart) + 1); 174 | memcpy(node->taxon, pcStart, strlen(pcStart)); 175 | } 176 | else 177 | { 178 | // Taxon 179 | *pcColon = '\0'; 180 | node->taxon = (char*)seqMalloc(strlen(pcStart) + 1); 181 | memcpy(node->taxon, pcStart, strlen(pcStart)); 182 | *pcColon = ':'; 183 | // Distance 184 | pcColon++; 185 | node->dist = (float)atof(pcColon); 186 | } 187 | node->childNum = 0; 188 | } 189 | else 190 | { 191 | // Create node 192 | node = (newick_node*)seqMalloc(sizeof(newick_node)); 193 | child = NULL; 194 | // Search for all child nodes 195 | // Find all ',' until corresponding ')' is encountered 196 | iCount = 0; 197 | pcStart++; 198 | 
pcCurrent = pcStart; 199 | while (iCount >= 0) 200 | { 201 | switch (*pcCurrent) 202 | { 203 | case '(': 204 | // Find corresponding ')' by counting 205 | pcStart = pcCurrent; 206 | pcCurrent++; 207 | iCount++; 208 | while (iCount > 0) 209 | { 210 | if (*pcCurrent == '(') 211 | { 212 | iCount++; 213 | } 214 | else if (*pcCurrent == ')') 215 | { 216 | iCount--; 217 | } 218 | pcCurrent++; 219 | } 220 | while (*pcCurrent != ',' && *pcCurrent != ')') 221 | { 222 | pcCurrent++; 223 | } 224 | cTemp = *pcCurrent; 225 | *pcCurrent = '\0'; 226 | // Create a child node 227 | if (child == NULL) 228 | { 229 | node->child = (newick_child*)seqMalloc(sizeof(newick_child)); 230 | node->childNum = 1; 231 | child = node->child; 232 | } 233 | else 234 | { 235 | child->next = (newick_child*)seqMalloc(sizeof(newick_child)); 236 | node->childNum++; 237 | child = child->next; 238 | } 239 | child->node = parseTree(pcStart); 240 | *pcCurrent = cTemp; 241 | if (*pcCurrent != ')') 242 | { 243 | pcCurrent++; 244 | } 245 | break; 246 | 247 | case ')': 248 | // End of tihs tree. Go to next part to retrieve distance 249 | iCount--; 250 | break; 251 | 252 | case ',': 253 | // Impossible separation since according to the algorithm, this symbol will never encountered. 
254 | // Currently don't handle this and don't create any node 255 | break; 256 | 257 | default: 258 | // leaf node encountered 259 | pcStart = pcCurrent; 260 | while (*pcCurrent != ',' && *pcCurrent != ')') 261 | { 262 | pcCurrent++; 263 | } 264 | cTemp = *pcCurrent; 265 | *pcCurrent = '\0'; 266 | // Create a child node 267 | if (child == NULL) 268 | { 269 | node->child = (newick_child*)seqMalloc(sizeof(newick_child)); 270 | node->childNum = 1; 271 | child = node->child; 272 | } 273 | else 274 | { 275 | child->next = (newick_child*)seqMalloc(sizeof(newick_child)); 276 | node->childNum++; 277 | child = child->next; 278 | } 279 | child->node = parseTree(pcStart); 280 | *pcCurrent = cTemp; 281 | if (*pcCurrent != ')') 282 | { 283 | pcCurrent++; 284 | } 285 | break; 286 | } 287 | } 288 | 289 | // If start at ':', then the internal node has no name. 290 | pcCurrent++; 291 | if (*pcCurrent == ':') 292 | { 293 | pcStart = pcCurrent + 1; 294 | while (*pcCurrent != '\0' && *pcCurrent != ';') 295 | { 296 | pcCurrent++; 297 | } 298 | cTemp = *pcCurrent; 299 | *pcCurrent = '\0'; 300 | node->dist = (float)atof(pcStart); 301 | *pcCurrent = cTemp; 302 | } 303 | else if (*pcCurrent != ';' && *pcCurrent != '\0') 304 | { 305 | // Find ':' to retrieve distance, if any. 
306 | // At this time *pcCurrent should equal to ')' 307 | pcStart = pcCurrent; 308 | while (*pcCurrent != ':' && *pcCurrent!=';') 309 | { 310 | pcCurrent++; 311 | } 312 | cTemp = *pcCurrent; 313 | *pcCurrent = '\0'; 314 | node->taxon = (char*)seqMalloc(strlen(pcStart) + 1); 315 | memcpy(node->taxon, pcStart, strlen(pcStart)); 316 | *pcCurrent = cTemp; 317 | pcCurrent++; 318 | pcStart = pcCurrent; 319 | while (*pcCurrent != '\0' && *pcCurrent != ';') 320 | { 321 | pcCurrent++; 322 | } 323 | cTemp = *pcCurrent; 324 | *pcCurrent = '\0'; 325 | node->dist = (float)atof(pcStart); 326 | *pcCurrent = cTemp; 327 | } 328 | } 329 | 330 | return node; 331 | } 332 | 333 | void printTree(newick_node *root) 334 | { 335 | newick_child *child; 336 | if (root->childNum == 0) 337 | { 338 | printf("%s:%0.6f", root->taxon, root->dist); 339 | } 340 | else 341 | { 342 | child = root->child; 343 | printf("("); 344 | while (child != NULL) 345 | { 346 | printTree(child->node); 347 | if (child->next != NULL) 348 | { 349 | printf(","); 350 | } 351 | child = child->next; 352 | } 353 | if (root->taxon != NULL) 354 | { 355 | printf(")%s:%0.6f", root->taxon, root->dist); 356 | } 357 | else 358 | { 359 | printf("):%0.6f", root->dist); 360 | } 361 | } 362 | } 363 | 364 | void TravelTree1(newick_node *root, int &S) 365 | { 366 | newick_child *child = root->child; 367 | 368 | while (child != NULL) 369 | { 370 | TravelTree1(child->node, S); 371 | child = child->next; 372 | } 373 | 374 | // if leaf 375 | if (root->child == NULL) 376 | S = S + 1; 377 | } 378 | 379 | int cur_leaf_id = -1; 380 | int cur_branch_id = -1; 381 | 382 | void TravelTree2(newick_node *root, PhyloTree &phylo_tree) 383 | { 384 | newick_child *child = root->child; 385 | 386 | while (child != NULL) 387 | { 388 | TravelTree2(child->node, phylo_tree); 389 | child = child->next; 390 | } 391 | 392 | if (root->child == NULL) 393 | { 394 | root->id = ++cur_leaf_id; 395 | } 396 | else 397 | { 398 | root->id = ++cur_branch_id + 
phylo_tree.S; 399 | } 400 | } 401 | 402 | void TravelTree3(newick_node *root, PhyloTree &phylo_tree) 403 | { 404 | newick_child *child = root->child; 405 | 406 | while (child != NULL) 407 | { 408 | phylo_tree.dag[root->id][child->node->id] = true; 409 | 410 | TravelTree3(child->node, phylo_tree); 411 | child = child->next; 412 | } 413 | 414 | phylo_tree.distances[root->id] = root->dist; 415 | phylo_tree.nodes_names[root->id] = root->taxon; 416 | if (root->id < phylo_tree.S) 417 | { 418 | phylo_tree.species_names[root->id] = root->taxon; 419 | 420 | } 421 | } 422 | 423 | // load the phylogenetic tree 424 | PhyloTree LoadPhyloTree(string phylo_tree_path) 425 | { 426 | cout << "Loading phylogenetic tree from " << phylo_tree_path << "......" << endl; 427 | 428 | 429 | ifstream in_prof(phylo_tree_path.c_str()); 430 | 431 | if (!in_prof) 432 | { 433 | cerr << "(Error. Cannot open the phylogenetic tree input file: " << phylo_tree_path << ")" << endl; 434 | exit(1); 435 | } 436 | 437 | 438 | // count the num of species, base pairs and load the profiles 439 | 440 | PhyloTree phylo_tree; 441 | phylo_tree.pi = zeros(4); 442 | phylo_tree.subs_rate = zeros(4,4); 443 | 444 | newick_node *root = NULL; 445 | string linestr; 446 | 447 | while(!in_prof.eof()) 448 | { 449 | std::getline(in_prof, linestr); 450 | if(linestr=="") break; 451 | linestr = strutils::trim(linestr); 452 | vector line_splits = strutils::split(linestr, ' '); 453 | if(!strcmp(line_splits[0].c_str(), "BACKGROUND:")){ 454 | 455 | int ind=0; 456 | for(std::size_t g=1; g tmp = strutils::split(strutils::trim(linestr),' '); 469 | int ind=0; 470 | for(std::size_t g=0; g < tmp.size(); g++) 471 | { 472 | if(tmp[g]=="") continue; 473 | phylo_tree.subs_rate(i,ind) = atof(tmp[g].c_str()); 474 | ind++; 475 | } 476 | } 477 | 478 | } 479 | else if (!strcmp(line_splits[0].c_str(), "TREE:")){ 480 | root = parseTree((char*)strutils::trim(line_splits[1]).c_str()); 481 | } 482 | } 483 | 484 | // cout << newick_string << endl; 485 | 
486 | // printTree(root); 487 | 488 | 489 | 490 | 491 | // count the total num of species and label each node and 492 | int S = 0; 493 | TravelTree1(root, S); 494 | phylo_tree.S = S; 495 | int N = 2 * phylo_tree.S - 1; 496 | TravelTree2(root, phylo_tree); 497 | 498 | // cout << "-----------" << S << endl; 499 | 500 | // load the tree structure and species names 501 | phylo_tree.species_names = vector(S); 502 | phylo_tree.nodes_names = vector(N); 503 | phylo_tree.distances = vector(N); 504 | phylo_tree.dag = vector< vector >(N, vector(N, false)); 505 | TravelTree3(root, phylo_tree); 506 | 507 | // cout << " Done." << endl; 508 | 509 | // cout << "Number of species: " << S << "." << endl << endl; 510 | 511 | return phylo_tree; 512 | } 513 | -------------------------------------------------------------------------------- /src/PhyloAcc-ST/newick.cpp: -------------------------------------------------------------------------------- 1 | // This code, downloaded from Internet, is used to 2 | // parse the phylogenetic tree in Newick format. 
3 | 4 | #include "newick.h" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "utils.h" 11 | 12 | #define APPEND_LEN 256 13 | 14 | using namespace std; 15 | using namespace arma; 16 | 17 | typedef struct newick_child 18 | { 19 | struct newick_node *node; 20 | struct newick_child *next; 21 | } newick_child; 22 | 23 | typedef struct newick_node 24 | { 25 | int id; 26 | char *taxon; 27 | char *seq; 28 | float dist; 29 | int childNum; 30 | struct newick_child *child; 31 | struct newick_node *parent; 32 | } newick_node; 33 | 34 | newick_node* parseTree(char *str); 35 | 36 | typedef struct seqMem 37 | { 38 | void *pos; 39 | struct seqMem *next; 40 | } seqMem; 41 | 42 | seqMem *start; 43 | seqMem *current; 44 | 45 | void seqMemInit() 46 | { 47 | start = NULL; 48 | } 49 | 50 | void* seqMalloc(int size) 51 | { 52 | if (start == NULL) 53 | { 54 | start = (seqMem*)malloc(sizeof(seqMem)); 55 | memset(start, '\0', sizeof(seqMem)); 56 | current = start; 57 | } 58 | else 59 | { 60 | current->next = (seqMem*)malloc(sizeof(seqMem)); 61 | memset(current->next, '\0', sizeof(seqMem)); 62 | current = current->next; 63 | } 64 | current->pos = malloc(size); 65 | memset(current->pos, '\0', size); 66 | return(current->pos); 67 | } 68 | 69 | void seqFreeAll() 70 | { 71 | while (start != NULL) 72 | { 73 | current = start->next; 74 | free(start->pos); 75 | free(start); 76 | start = current; 77 | } 78 | 79 | start = NULL; 80 | } 81 | 82 | void seqFree(void* pos) 83 | { 84 | seqMem *node, *prenode; 85 | node = start; 86 | prenode = start; 87 | while (node != NULL) 88 | { 89 | if (node->pos == pos) 90 | { 91 | free(node->pos); 92 | if (node == start) 93 | { 94 | start = node->next; 95 | } 96 | else if (node->next == NULL) 97 | { 98 | current = prenode; 99 | prenode->next = NULL; 100 | } 101 | else 102 | { 103 | prenode->next = node->next; 104 | } 105 | free(node); 106 | break; 107 | } 108 | 109 | prenode = node; 110 | node = node->next; 111 | } 112 | } 113 | 114 | void 
inputString(char *input, char **ppcStr, int *iLen, int *iMaxLen) 115 | { 116 | int inputLen; 117 | char *temp; 118 | inputLen = strlen(input); 119 | if (inputLen == 0) 120 | { 121 | return; 122 | } 123 | while (*iMaxLen < (*iLen + inputLen) + 1) 124 | { 125 | *iMaxLen = *iMaxLen + APPEND_LEN; 126 | } 127 | temp = (char*)seqMalloc(*iMaxLen); 128 | if (*ppcStr == NULL) 129 | { 130 | memcpy(temp, input, inputLen); 131 | } 132 | else 133 | { 134 | memcpy(temp, *ppcStr, *iLen); 135 | strcat(temp, input); 136 | } 137 | *iLen = *iLen + inputLen; 138 | if (*ppcStr != NULL) 139 | { 140 | seqFree(*ppcStr); 141 | } 142 | *ppcStr = temp; 143 | } 144 | 145 | newick_node* parseTree(char *str) 146 | { 147 | newick_node *node; 148 | newick_child *child; 149 | char *pcCurrent; 150 | char *pcStart; 151 | char *pcColon = NULL; 152 | char cTemp; 153 | int iCount; 154 | 155 | pcStart = str; 156 | 157 | if (*pcStart != '(') 158 | { 159 | // Leaf node. Separate taxon name from distance. If distance not exist then take care of taxon name only 160 | pcCurrent = str; 161 | while (*pcCurrent != '\0') 162 | { 163 | if (*pcCurrent == ':') 164 | { 165 | pcColon = pcCurrent; 166 | } 167 | pcCurrent++; 168 | } 169 | node = (newick_node*)seqMalloc(sizeof(newick_node)); 170 | if (pcColon == NULL) 171 | { 172 | // Taxon only 173 | node->taxon = (char*)seqMalloc(strlen(pcStart) + 1); 174 | memcpy(node->taxon, pcStart, strlen(pcStart)); 175 | } 176 | else 177 | { 178 | // Taxon 179 | *pcColon = '\0'; 180 | node->taxon = (char*)seqMalloc(strlen(pcStart) + 1); 181 | memcpy(node->taxon, pcStart, strlen(pcStart)); 182 | *pcColon = ':'; 183 | // Distance 184 | pcColon++; 185 | node->dist = (float)atof(pcColon); 186 | } 187 | node->childNum = 0; 188 | } 189 | else 190 | { 191 | // Create node 192 | node = (newick_node*)seqMalloc(sizeof(newick_node)); 193 | child = NULL; 194 | // Search for all child nodes 195 | // Find all ',' until corresponding ')' is encountered 196 | iCount = 0; 197 | pcStart++; 198 | 
pcCurrent = pcStart; 199 | while (iCount >= 0) 200 | { 201 | switch (*pcCurrent) 202 | { 203 | case '(': 204 | // Find corresponding ')' by counting 205 | pcStart = pcCurrent; 206 | pcCurrent++; 207 | iCount++; 208 | while (iCount > 0) 209 | { 210 | if (*pcCurrent == '(') 211 | { 212 | iCount++; 213 | } 214 | else if (*pcCurrent == ')') 215 | { 216 | iCount--; 217 | } 218 | pcCurrent++; 219 | } 220 | while (*pcCurrent != ',' && *pcCurrent != ')') 221 | { 222 | pcCurrent++; 223 | } 224 | cTemp = *pcCurrent; 225 | *pcCurrent = '\0'; 226 | // Create a child node 227 | if (child == NULL) 228 | { 229 | node->child = (newick_child*)seqMalloc(sizeof(newick_child)); 230 | node->childNum = 1; 231 | child = node->child; 232 | } 233 | else 234 | { 235 | child->next = (newick_child*)seqMalloc(sizeof(newick_child)); 236 | node->childNum++; 237 | child = child->next; 238 | } 239 | child->node = parseTree(pcStart); 240 | *pcCurrent = cTemp; 241 | if (*pcCurrent != ')') 242 | { 243 | pcCurrent++; 244 | } 245 | break; 246 | 247 | case ')': 248 | // End of tihs tree. Go to next part to retrieve distance 249 | iCount--; 250 | break; 251 | 252 | case ',': 253 | // Impossible separation since according to the algorithm, this symbol will never encountered. 
254 | // Currently don't handle this and don't create any node 255 | break; 256 | 257 | default: 258 | // leaf node encountered 259 | pcStart = pcCurrent; 260 | while (*pcCurrent != ',' && *pcCurrent != ')') 261 | { 262 | pcCurrent++; 263 | } 264 | cTemp = *pcCurrent; 265 | *pcCurrent = '\0'; 266 | // Create a child node 267 | if (child == NULL) 268 | { 269 | node->child = (newick_child*)seqMalloc(sizeof(newick_child)); 270 | node->childNum = 1; 271 | child = node->child; 272 | } 273 | else 274 | { 275 | child->next = (newick_child*)seqMalloc(sizeof(newick_child)); 276 | node->childNum++; 277 | child = child->next; 278 | } 279 | child->node = parseTree(pcStart); 280 | *pcCurrent = cTemp; 281 | if (*pcCurrent != ')') 282 | { 283 | pcCurrent++; 284 | } 285 | break; 286 | } 287 | } 288 | 289 | // If start at ':', then the internal node has no name. 290 | pcCurrent++; 291 | if (*pcCurrent == ':') 292 | { 293 | pcStart = pcCurrent + 1; 294 | while (*pcCurrent != '\0' && *pcCurrent != ';') 295 | { 296 | pcCurrent++; 297 | } 298 | cTemp = *pcCurrent; 299 | *pcCurrent = '\0'; 300 | node->dist = (float)atof(pcStart); 301 | *pcCurrent = cTemp; 302 | } 303 | else if (*pcCurrent != ';' && *pcCurrent != '\0') 304 | { 305 | // Find ':' to retrieve distance, if any. 
306 | // At this time *pcCurrent should equal to ')' 307 | pcStart = pcCurrent; 308 | while (*pcCurrent != ':' && *pcCurrent!=';') 309 | { 310 | pcCurrent++; 311 | } 312 | cTemp = *pcCurrent; 313 | *pcCurrent = '\0'; 314 | node->taxon = (char*)seqMalloc(strlen(pcStart) + 1); 315 | memcpy(node->taxon, pcStart, strlen(pcStart)); 316 | *pcCurrent = cTemp; 317 | pcCurrent++; 318 | pcStart = pcCurrent; 319 | while (*pcCurrent != '\0' && *pcCurrent != ';') 320 | { 321 | pcCurrent++; 322 | } 323 | cTemp = *pcCurrent; 324 | *pcCurrent = '\0'; 325 | node->dist = (float)atof(pcStart); 326 | *pcCurrent = cTemp; 327 | } 328 | } 329 | 330 | return node; 331 | } 332 | 333 | void printTree(newick_node *root) 334 | { 335 | newick_child *child; 336 | if (root->childNum == 0) 337 | { 338 | printf("%s:%0.6f", root->taxon, root->dist); 339 | } 340 | else 341 | { 342 | child = root->child; 343 | printf("("); 344 | while (child != NULL) 345 | { 346 | printTree(child->node); 347 | if (child->next != NULL) 348 | { 349 | printf(","); 350 | } 351 | child = child->next; 352 | } 353 | if (root->taxon != NULL) 354 | { 355 | printf(")%s:%0.6f", root->taxon, root->dist); 356 | } 357 | else 358 | { 359 | printf("):%0.6f", root->dist); 360 | } 361 | } 362 | } 363 | 364 | void TravelTree1(newick_node *root, int &S) 365 | { 366 | newick_child *child = root->child; 367 | 368 | while (child != NULL) 369 | { 370 | TravelTree1(child->node, S); 371 | child = child->next; 372 | } 373 | 374 | // if leaf 375 | if (root->child == NULL) 376 | S = S + 1; 377 | } 378 | 379 | int cur_leaf_id = -1; 380 | int cur_branch_id = -1; 381 | 382 | void TravelTree2(newick_node *root, PhyloTree &phylo_tree) 383 | { 384 | newick_child *child = root->child; 385 | 386 | while (child != NULL) 387 | { 388 | TravelTree2(child->node, phylo_tree); 389 | child = child->next; 390 | } 391 | 392 | if (root->child == NULL) 393 | { 394 | root->id = ++cur_leaf_id; 395 | } 396 | else 397 | { 398 | root->id = ++cur_branch_id + 
phylo_tree.S; 399 | } 400 | } 401 | 402 | void TravelTree3(newick_node *root, PhyloTree &phylo_tree) 403 | { 404 | newick_child *child = root->child; 405 | 406 | while (child != NULL) 407 | { 408 | phylo_tree.dag[root->id][child->node->id] = true; 409 | 410 | TravelTree3(child->node, phylo_tree); 411 | child = child->next; 412 | } 413 | 414 | phylo_tree.distances[root->id] = root->dist; 415 | 416 | if (root == nullptr || root->taxon == nullptr) { 417 | cout << "\n(Error. Some nodes in the tree aren't labeled. Please label all nodes and try again!)" << endl; 418 | exit(1); 419 | } 420 | // if the current node does not have a label, print an error and exit 421 | 422 | phylo_tree.nodes_names[root->id] = root->taxon; 423 | if (root->id < phylo_tree.S) 424 | { 425 | phylo_tree.species_names[root->id] = root->taxon; 426 | 427 | } 428 | } 429 | 430 | // load the phylogenetic tree 431 | PhyloTree LoadPhyloTree(string phylo_tree_path) 432 | { 433 | cout << "Loading phylogenetic tree from " << phylo_tree_path << "......" << endl; 434 | 435 | 436 | ifstream in_prof(phylo_tree_path.c_str()); 437 | 438 | if (!in_prof) 439 | { 440 | cerr << "(Error. 
Cannot open the phylogenetic tree input file: " << phylo_tree_path << ")" << endl; 441 | exit(1); 442 | } 443 | 444 | 445 | // count the num of species, base pairs and load the profiles 446 | 447 | PhyloTree phylo_tree; 448 | phylo_tree.pi = zeros(4); 449 | phylo_tree.subs_rate = zeros(4,4); 450 | 451 | newick_node *root = NULL; 452 | string linestr; 453 | 454 | while(!in_prof.eof()) 455 | { 456 | std::getline(in_prof, linestr); 457 | if(linestr=="") break; 458 | linestr = strutils::trim(linestr); 459 | vector line_splits = strutils::split(linestr, ' '); 460 | if(!strcmp(line_splits[0].c_str(), "BACKGROUND:")){ 461 | 462 | int ind=0; 463 | for(std::size_t g=1; g tmp = strutils::split(strutils::trim(linestr),' '); 476 | int ind=0; 477 | for(std::size_t g=0; g < tmp.size(); g++) 478 | { 479 | if(tmp[g]=="") continue; 480 | phylo_tree.subs_rate(i,ind) = atof(tmp[g].c_str()); 481 | ind++; 482 | } 483 | } 484 | 485 | } 486 | else if (!strcmp(line_splits[0].c_str(), "TREE:")){ 487 | root = parseTree((char*)strutils::trim(line_splits[1]).c_str()); 488 | } 489 | } 490 | 491 | // cout << newick_string << endl; 492 | 493 | printTree(root); 494 | printf("\n"); 495 | 496 | // count the total num of species and label each node and 497 | int S = 0; 498 | TravelTree1(root, S); 499 | phylo_tree.S = S; 500 | int N = 2 * phylo_tree.S - 1; 501 | TravelTree2(root, phylo_tree); 502 | 503 | // cout << "-----------" << S << endl; 504 | 505 | // load the tree structure and species names 506 | phylo_tree.species_names = vector(S); 507 | phylo_tree.nodes_names = vector(N); 508 | phylo_tree.distances = vector(N); 509 | phylo_tree.dag = vector< vector >(N, vector(N, false)); 510 | TravelTree3(root, phylo_tree); 511 | 512 | // cout << " Done." << endl; 513 | 514 | // cout << "Number of species: " << S << "." << endl << endl; 515 | 516 | return phylo_tree; 517 | } 518 | --------------------------------------------------------------------------------