├── .gitignore ├── HybPiperUtils ├── README.md ├── fasta_merge.py └── filter_by_length.py ├── LICENSE ├── README.md ├── alleles_workflow ├── README.md ├── combine_alignments.py ├── create_alleles_alignments.sh ├── extract_phase_bcftools.sh ├── intron_exon_extractor.py └── map_to_supercontigs.sh ├── brlenoutliers └── brlen_outliers.py ├── haplonerate ├── README.md ├── haplonerate.py ├── haplonerate3N.py └── img │ └── AJB_Figure_1.pdf ├── homologizer ├── convert_to_nexus.py ├── label_swap.txt ├── readme.md ├── revbayes_template.txt ├── revscript_maker.py └── swap_labels.py ├── minorityreport ├── README.md ├── img │ ├── concordant.png │ └── conflict1.png └── minority_report.py └── phypartspiecharts ├── PhyParts_PieCharts.ipynb ├── README.md ├── img ├── default_pies.jpg ├── pleuro_nodes.png └── sphag_taka.png ├── phyparts_example ├── out.concon.tre ├── out.hist ├── out.hist.alts ├── out.node.key ├── phyparts_dist.csv ├── phyparts_pies.csv ├── pies.svg └── species.tre ├── phypartspiecharts.py └── reroot_trees.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /HybPiperUtils/README.md: -------------------------------------------------------------------------------- 1 | # HybPiper Utils 2 | 3 | Assorted shell and Python scripts to work with data from HybPiper output. 4 | 5 | ### Software 6 | 7 | HybPiper: [www.github.com/mossmatters/HybPiper](www.github.com/mossmatters/HybPiper) 8 | 9 | Most of the scripts require Python 3.0 or higher and `biopython` which can be installed using `conda`. 10 | 11 | 12 | #### `fasta_merge.py` 13 | 14 | Script to merge (concatenate) alignments from different alignments for phylogenetic analysis. 15 | 16 | ``` 17 | usage: fasta_merge.py [-h] [--fastafiles FASTAFILES [FASTAFILES ...]] 18 | [--filelist FILELIST] [--raxml {DNA,WAG,JTT,CODON}] 19 | 20 | This script will take a list of FASTA files and concatenate them for use in 21 | phylogenetic inference. 
The sequence headers (up until the first space) must be identical 22 | in each individual FASTA file. 23 | 24 | Individual gene sequences should be aligned prior to running this script! 25 | 26 | This script requires BioPython to read/write FASTA sequences. 27 | 28 | optional arguments: 29 | -h, --help show this help message and exit 30 | --fastafiles FASTAFILES [FASTAFILES ...] 31 | List of Fasta Files. Can use wildcard on Linux/Mac systems 32 | --filelist FILELIST File containing list of Fasta files. Alternative to --fastalist 33 | --raxml {DNA,WAG,JTT,CODON} 34 | Create a partition file 'partitions.raxml' intended for raxml in the current directory. For amino acid sequences, select the substitution model. To specify a separate model for 1st/2nd vs. 3rd codon positions, select CODON. 35 | ``` 36 | 37 | ### `filter_by_length.py` 38 | 39 | A script for filtering gene-sample combinations based on length filters for each gene. Two filters are available: minimum length, and percentage of mean length of the targets. The script will assume you have already run `hybpiper stats` and `hybpiper retrieve_sequences` as input. 40 | 41 | As of HybPiper 2.1.6, the `hybpiper retrieve_sequences` command could only filter sequences at a "whole project" level - for example, removing a sample if fewer than 20% of genes were recovered. 42 | 43 | Suggested Workflow: 44 | 45 | 1. Run `hybpiper stats` to generate the `stats.tsv` and `lengths.tsv` files 46 | 2. Run `hybpiper retrieve_sequences` to create a folder of FASTA sequences 47 | 3. Run this script to create new FASTA files based on the per-gene filters. 48 | Also writes to standard output the `denylist` by gene, redirect this to save to a file. 
49 | 50 | The FASTA sequences will expect to have the naming scheme of HybPiper: 51 | 52 | - `geneName.FNA` for nucleotide exon files 53 | - `geneName.FAA` for amino acid files 54 | - `geneName_supercontig.fasta` for supercontig files 55 | - `geneName_intron.fasta` for intron-only files 56 | 57 | The geneNames will be taken from either the `hybpiper stats` file (`--lengthfile`) or a supplied list of gene sample combinations (`--denylist`, also produced by running this script). There are two filters, `--length_filter` for the minimum length to accept a sequence (for all genes) and `--percent_filter` for a fraction of the mean length determined from the `seq_lengths.tsv` file for each gene. For example: 58 | 59 | ``` 60 | python filter_by_length.py --lengthfile ../seq_lengths.tsv --seq_type FNA --percent_filter 0.1 > denylist.txt 61 | ``` 62 | 63 | If you wish to filter intron or supercontig sequences, run a second time with the `--denylist` flag to skip the filtering based on lengths or percentages: 64 | 65 | ``` 66 | python filter_by_length.py --lengthfile ../seq_lengths.tsv --seq_type supercontig --denylist ../denylist.txt 67 | ``` 68 | -------------------------------------------------------------------------------- /HybPiperUtils/fasta_merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | helptext ='''This script will take a list of FASTA files and concatenate them for use in 4 | phylogenetic inference. The sequence headers (up until the first space) must be identical 5 | in each individual FASTA file. 6 | 7 | Individual gene sequences should be aligned prior to running this script! 
def read_sequences(fastafiles):
    '''Parse every FASTA file into a dictionary of records keyed by sequence ID.

    Returns a dictionary that maps each file name in fastafiles to the
    dictionary SeqIO.to_dict builds for that file.
    '''
    return {filename: SeqIO.to_dict(SeqIO.parse(filename, 'fasta')) for filename in fastafiles}


def get_unique_names(gene_dict):
    '''Return the set of sequence IDs that occur in at least one gene of gene_dict.'''
    unique_names = set()
    for records in gene_dict.values():
        # Iterating a dictionary yields its keys, i.e. the sequence IDs.
        unique_names.update(records)
    return unique_names


def insert_sequences(gene_dict, unique_names):
    '''Pad each gene, in place, with an all-gap record for every name it lacks.

    The gap record is as long as the first sequence already stored for the
    gene, so the input files are expected to be alignments. The number of
    inserted records is reported on stderr.

    Returns the same gene_dict object that was passed in.
    Raises ValueError when a gene needs padding but holds no sequence to
    take the alignment length from.
    '''
    inserted_sequences = 0
    for gene, records in gene_dict.items():
        missing = [name for name in unique_names if name not in records]
        if not missing:
            continue
        if not records:
            raise ValueError(
                "No sequences found in {}; cannot determine its alignment length.".format(gene))
        # The length is a property of the gene, so look it up once per gene
        # rather than once per missing name.
        gene_length = len(next(iter(records.values())))
        for name in missing:
            records[name] = SeqRecord(Seq("-" * gene_length), id=name)
        inserted_sequences += len(missing)
    sys.stderr.write("{} Empty sequences inserted across all genes.\n".format(inserted_sequences))
    return gene_dict


def concatenate_sequences(gene_dict, fastafiles, unique_names):
    '''Write one concatenated FASTA record per name to stdout.

    Genes are joined in the order given by fastafiles; every gene must
    contain every name in unique_names (see insert_sequences). The length
    of the last record written is reported on stderr.

    Returns the list of per-gene alignment lengths, in fastafiles order.
    '''
    new_seq_dict = {}
    partition_lengths = []
    for gene in fastafiles:
        records = gene_dict[gene]
        for name in unique_names:
            if name in new_seq_dict:
                new_seq_dict[name] += records[name]
            else:
                new_seq_dict[name] = records[name]
        partition_lengths.append(len(next(iter(records.values()))))

    # Start at zero so the summary below still works when nothing was written.
    final_seq_length = 0
    for record in new_seq_dict.values():
        SeqIO.write(record, sys.stdout, 'fasta')
        final_seq_length = len(record)
    sys.stderr.write("Final concatenated sequence length: {}\n".format(final_seq_length))
    return partition_lengths


def raxml_partition(fastafiles, partition_lengths, partition_type):
    '''Write a RAxML partition file named "partition.raxml" in the current directory.

    fastafiles and partition_lengths are parallel sequences: one partition
    name and one alignment length per gene. With partition_type 'CODON' each
    gene gets two DNA partitions (codon positions 1+2, and position 3);
    any other value is written as the model name of one partition per gene.

    NOTE(review): the --raxml help text calls the file 'partitions.raxml';
    the name actually written here is kept unchanged.
    '''
    gene_start = 1
    with open("partition.raxml", 'w') as partition_file:
        for fasta_name, length in zip(fastafiles, partition_lengths):
            gene_end = gene_start + length - 1
            if partition_type == 'CODON':
                partition_file.write("{},{}{}={}-{}\\3,{}-{}\\3\n".format(
                    "DNA", fasta_name, "12", gene_start, gene_end - 2, gene_start + 1, gene_end - 1))
                partition_file.write("{},{}{}={}-{}\\3\n".format(
                    "DNA", fasta_name, "3", gene_start + 2, gene_end))
            else:
                partition_file.write("{},{}={}-{}\n".format(
                    partition_type, fasta_name, gene_start, gene_end))
            gene_start = gene_end + 1
Can use wildcard on Linux/Mac systems") 84 | parser.add_argument("--filelist",help="File containing list of Fasta files. Alternative to --fastalist") 85 | parser.add_argument("--raxml",help="Create a partition file 'partitions.raxml' intended for raxml in the current directory. For amino acid sequences, select the substitution model. To specify a separate model for 1st/2nd vs. 3rd codon positions, select CODON.", 86 | choices = ['DNA','WAG','JTT','CODON' 87 | ],default=None) 88 | 89 | if len(sys.argv) < 2: 90 | parser.print_help() 91 | sys.exit(1) 92 | 93 | args = parser.parse_args() 94 | 95 | if args.fastafiles: 96 | #print args.fastafiles 97 | if args.filelist: 98 | sys.stderr.write("Specify either a list of FASTA files or a file containing names, not both!\n") 99 | sys.exit(1) 100 | else: 101 | fastafiles = args.fastafiles 102 | 103 | elif args.filelist: 104 | #print args.filelist 105 | if os.path.isfile(args.filelist): 106 | fastafiles = [x.rstrip() for x in open(args.filelist)] 107 | else: 108 | sys.stderr.write("File containing list of FASTA files not found!") 109 | sys.exit(1) 110 | 111 | else: 112 | sys.stderr.write("You must specify the FASTA files as a list or in a file.\n") 113 | sys.exit(1) 114 | 115 | sys.stderr.write("{} FASTA files found.\n".format(len(fastafiles))) 116 | gene_dict = read_sequences(fastafiles) 117 | 118 | sys.stderr.write("All sequences read successfully.\n") 119 | unique_names = get_unique_names(gene_dict) 120 | sys.stderr.write("{} Unique names found. 
If you were expecting fewer sequences, check your IDs!\n".format(len(unique_names))) 121 | gaps_inserted = insert_sequences(gene_dict,unique_names) 122 | 123 | partition_lengths = concatenate_sequences(gaps_inserted,fastafiles,unique_names) 124 | 125 | if args.raxml: 126 | raxml_partition(fastafiles,partition_lengths,args.raxml) 127 | 128 | if __name__ == "__main__":main() -------------------------------------------------------------------------------- /HybPiperUtils/filter_by_length.py: -------------------------------------------------------------------------------- 1 | import os,sys,argparse 2 | 3 | from Bio import SeqIO 4 | 5 | helptext ='''This script will filter output from HybPiper based on the output of hybpiper retrieve_sequences 6 | 7 | As of HybPiper version 2.1.6, hybpiper retrieve_sequences only supports filtering based 8 | on project-wide thresholds (i.e. number of total genes recovered). This script will allow 9 | filtering based on individual genes and the mean length or minimum length threshold. 10 | 11 | 1. Run hybpiper stats to generate the stats.tsv and lengths.tsv files 12 | 2. Run hybpiper retrieve_sequences to create a folder of FASTA sequences 13 | 3. Run this script to create new FASTA files based on the per-gene filters. 14 | Also writes to standard output the denylist by gene, redirect this to save to a file. 15 | 16 | The FASTA sequences will expect to have the naming scheme of HybPiper: 17 | geneName.FNA for nucleotide exon files 18 | geneName.FAA for amino acid files 19 | geneName_supercontig.fasta for supercontig files 20 | geneName_intron.fasta for intron-only files 21 | 22 | The geneNames will be taken from either the hybpiper stats file (--lengthfile) or a supplied 23 | list of gene sample combinations (--denylist, also produced by running this script) 24 | 25 | If you wish to filter intron or supercontig sequences, run again with the --denylist flag 26 | to skip the filtering based on lengths. 
def filter_fastas(deny_dict, seq_type):
    '''Write a filtered copy of every matching FASTA file in the current directory.

    Files are matched by suffix: "_supercontig.fasta" or "_intron.fasta" for
    those two seq_type values, otherwise "." + seq_type. For each input
    "<gene><suffix>" a file "<gene>.filtered<suffix>" is written that omits
    every record whose ID is listed for that gene in deny_dict.

    Genes without a deny_dict entry are copied unchanged, and files produced
    by an earlier run (gene part ending in ".filtered") are skipped so the
    script can be re-run in the same directory.
    '''
    if seq_type in ("supercontig", "intron"):
        seqend = f"_{seq_type}.fasta"
    else:
        seqend = "." + seq_type

    for fasta_name in os.listdir():
        if not fasta_name.endswith(seqend):
            continue
        # Strip only the trailing suffix; str.replace would also remove
        # occurrences in the middle of the name.
        gene_name = fasta_name[:-len(seqend)]
        if gene_name.endswith(".filtered"):
            continue
        denied = set(deny_dict.get(gene_name, ()))
        with open(f"{gene_name}.filtered{seqend}", 'w') as outfile:
            for seq in SeqIO.parse(fasta_name, 'fasta'):
                if seq.id not in denied:
                    SeqIO.write(seq, outfile, 'fasta')
    return


def write_denylist(deny_dict):
    '''Write the denylist to stdout, one "gene<TAB>sample,sample,..." line per gene.'''
    for gene, samples in deny_dict.items():
        sys.stdout.write(f"{gene}\t{','.join(samples)}\n")
    return


def filter_seqs(gene_lengths, minLength, minPercent):
    '''Return a dictionary, by gene, of the samples to put on the denylist.

    A sample is denied for a gene when its length is below minLength or
    below minPercent times the gene's "mean_length". Every gene in
    gene_lengths gets an entry, possibly an empty list. A one-line summary
    is written to stderr.
    '''
    deny_dict = {}
    total_deny = 0
    for gene, info in gene_lengths.items():
        percent_thresh = info["mean_length"] * minPercent
        denied = [
            sample_name
            for sample_name, sample_length in info["sample_lengths"].items()
            if sample_length < minLength or sample_length < percent_thresh
        ]
        deny_dict[gene] = denied
        total_deny += len(denied)
    sys.stderr.write(f"Filtered {total_deny} total sequences at {len(deny_dict)} genes based on parameters.\n")
    return deny_dict


def parse_seqlens(seqlens_fn):
    '''Parse the tab-separated sequence-lengths table at seqlens_fn.

    The first row holds the gene names, the second row the mean target
    lengths, and every later row a sample name followed by one length per
    gene. The first column of the two header rows is ignored; blank lines
    are skipped.

    Returns a tuple of:
        - the list of sample names, in file order
        - a dictionary keyed by gene name whose values are
            {"mean_length": float, "sample_lengths": {sampleName: float}}

    Raises ValueError when a row has fewer columns than there are genes or
    a length cannot be converted to a number.
    '''
    sample_names = []
    gene_lengths = {}

    with open(seqlens_fn) as seqlens:
        genenames = seqlens.readline().rstrip().split("\t")[1:]
        meanlens = seqlens.readline().rstrip().split("\t")[1:]
        if len(meanlens) < len(genenames):
            raise ValueError(f"{seqlens_fn}: the mean-length row is shorter than the gene-name row")
        for gene_name, mean_length in zip(genenames, meanlens):
            gene_lengths[gene_name] = {"mean_length": float(mean_length), "sample_lengths": {}}

        for line in seqlens:
            if not line.strip():
                continue
            fields = line.rstrip().split("\t")
            sample_name = fields.pop(0)
            if len(fields) < len(genenames):
                raise ValueError(f"{seqlens_fn}: too few columns for sample {sample_name}")
            sample_names.append(sample_name)
            for gene_name, sample_length in zip(genenames, fields):
                gene_lengths[gene_name]["sample_lengths"][sample_name] = float(sample_length)

    return sample_names, gene_lengths


def parse_denylist(denylist_fn):
    '''Parse the denylist at denylist_fn into a {geneName: [samples]} dictionary.

    Each line is "gene<TAB>sample,sample,..."; a gene with nothing after its
    name maps to an empty list. Blank lines are skipped. A one-line summary
    is written to stderr.
    '''
    deny_dict = {}
    total_deny = 0
    with open(denylist_fn) as denylist:
        for line in denylist:
            if not line.strip():
                continue
            fields = line.rstrip().split("\t")
            samples = fields[1].split(",") if len(fields) > 1 else []
            total_deny += len(samples)
            deny_dict[fields[0]] = samples
    sys.stderr.write(f"Found {total_deny} total samples at {len(deny_dict)} genes in the denylist {denylist_fn}\n")
    return deny_dict
\n The format of the file should be one gene per line, a tab, \n and then a comma-delimited list of samples to disallow: \n gene[tab]sample,sample,sample ",default=None) 126 | parser.add_argument("--lengthfile",help="Output of hybpiper stats, with list of genes in first row, \n mean target lengths in second row, and sample recovery in other rows.") 127 | parser.add_argument("--seq_type",help="File seq_type for all FASTA files to filter in current directory. \n For example, the amino acid output of HybPiper would be: FAA",choices=["FNA","FAA","supercontig","intron"]) 128 | parser.add_argument("--length_filter",help="Minimum length to allow a sequence \n in nucleotides for DNA or amino acids for protein sequences",default=0,type=int) 129 | parser.add_argument("--percent_filter",help="Minimum fraction (between 0 and 1) of the mean target length to allow a sequence for a gene. \n Lengths taken from HybPiper stats file.",default=0,type=float) 130 | 131 | if len(sys.argv) < 2: 132 | parser.print_help() 133 | sys.exit(1) 134 | 135 | args = parser.parse_args() 136 | 137 | if args.denylist: 138 | deny_dict = parse_denylist(args.denylist) 139 | else: 140 | sample_names,gene_lengths = parse_seqlens(args.lengthfile) 141 | deny_dict = filter_seqs(gene_lengths,args.length_filter,args.percent_filter) 142 | write_denylist(deny_dict) 143 | 144 | filter_fastas(deny_dict,args.seq_type) 145 | 146 | 147 | if __name__ == "__main__":main() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Matt Johnson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | 
copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # phyloscripts by mossmatters 2 | ---- 3 | Helper scripts for processing and visualization of phylogenetics datasets, mostly written in Python. For more information about each of the scripts, visit the appropriate subdirectory. 4 | 5 | 6 | # [Alleles Workflow](https://github.com/mossmatters/phyloscripts/tree/master/alleles_workflow) 7 | 8 | Python and shell scripts for phasing alleles from HybSeq data, as done in [Kates et al. AJB 2018](https://www.ncbi.nlm.nih.gov/pubmed/29729187). 9 | 10 | # [PhypartsPieCharts](https://github.com/mossmatters/phyloscripts/tree/master/phypartspiecharts) 11 | 12 | Script for plotting pie charts from bipartition analysis on a species phylogeny. 13 | 14 | 15 | # [HybPiperUtils](https://github.com/mossmatters/phyloscripts/tree/master/HybPiperUtils) 16 | 17 | Scripts for working with target capture data recovered or assembled using [HybPiper](https://github.com/mossmatters/HybPiper), including `fasta_merge.py` from version 1.3.1. 
18 | 19 | # [Minority Report](https://github.com/mossmatters/phyloscripts/tree/master/minorityreport) 20 | 21 | Display minority bipartitions on a species tree. 22 | 23 | # [Haplonerate](https://github.com/mossmatters/phyloscripts/tree/master/haplonerate) 24 | 25 | Adjust phased allelic sequences to have phased alleles only in the largest phase block, with ambiguity codes elsewhere. Used in [Kates et al. AJB 2018](https://www.ncbi.nlm.nih.gov/pubmed/29729187) 26 | 27 | # [Branch Length Outlier](https://github.com/mossmatters/phyloscripts/tree/master/brlenoutliers) 28 | 29 | Identify extreme branch lengths in a set of gene trees. Generates images of offending trees so that manual curation of every gene tree is not necessary. 30 | -------------------------------------------------------------------------------- /alleles_workflow/README.md: -------------------------------------------------------------------------------- 1 | # Alleles from HybSeq Data 2 | 3 | Assorted shell and Python scripts to generate allele data from HybPiper output. 4 | 5 | ### Software 6 | 7 | HybPiper (to generate supercontigs): www.github.com/mossmatters/HybPiper 8 | 9 | Picard: http://broadinstitute.github.io/picard/ 10 | 11 | GATK: https://software.broadinstitute.org/gatk/download/ 12 | 13 | WhatsHap: http://whatshap.readthedocs.io 14 | 15 | 16 | ### Prerequisites 17 | 18 | Run the main HybPiper script `reads_first.py` followed by `intronerate.py` to generate supercontigs for each recovered gene. These supercontigs will serve as the reference sequences for identifying variants. 19 | 20 | 21 | Concatenate all the supercontigs into a single file. Given a directory generated by HybPiper called `prefix`: 22 | 23 | ```cat prefix/*/prefix/sequences/intron/*_supercontig.fasta > prefix.supercontigs.fasta``` 24 | 25 | Concatenate all of the GFF intron/exon boundary annotations: 26 | 27 | ```cat prefix/*/prefix/sequences/intronerate.gff > prefix.intronerate.gff``` 28 | 29 | ### `map_to_supercontigs.sh` 30 | 31 | This script will: 32 | 33 | 1. 
Map paired-end reads to the supercontigs using `bwa mem`. 34 | 2. Remove duplicate reads using `picard`. 35 | 3. Identify variant sites using `gatk HaplotypeCaller` 36 | 4. Filter variant sites to identify only SNPs. 37 | 5. Generate a new set of reference sequences with SNPs replaced by IUPAC ambiguity codes, using `gatk FastaAlternateReferenceMaker` 38 | 39 | **IMPORTANT: Modify the lines indicating paths to the picard and gatk jarfiles!** 40 | 41 | 42 | Command line: 43 | `bash map_to_supercontigs.sh HybPiperPrefix read1.fq read2.fq` 44 | 45 | The output will be a directory containing several BAM files, a VCF file containing all variants and another with only SNPs, and an alternate reference file with SNPs replaced with IUPAC ambiguity bases. 46 | 47 | ### Phase alleles with WhatsHap 48 | 49 | **Note**: WhatsHap requires Python 3. I used a conda environment specifically for whatshap to avoid conflicts with other Python environments. 50 | 51 | Running the above script on each sample will create a set of directories containing the BAM and VCF files. To use the read data to phase these variants, first create a text file containing the names of these directories, one per line, called `namelist.txt`. To loop over these and generate the phased output: 52 | 53 | ``` 54 | while read i; do cd $i; whatshap phase -o \ 55 | $i.supercontigs.fasta.snps.whatshap.vcf \ 56 | $i.supercontigs.fasta.snps.vcf \ 57 | $i.supercontigs.fasta.marked.bam; 58 | whatshap stats \ 59 | --gtf $i.whatshap.gtf \ 60 | --tsv $i.whatshap.stats.tsv \ 61 | $i.supercontigs.fasta.snps.whatshap.vcf; 62 | cd ..; 63 | done < ../namelist.txt 64 | ``` 65 | This will create a new VCF with phase information, and the stats command will also generate tables summarizing the phasing results. 66 | 67 | ### `extract_phase_bcftools.sh` 68 | 69 | Run this script to generate separate files for each of the alleles generated by WhatsHap using `bcftools`. 
70 | 71 | Command: `extract_phase_bcftools.sh prefix` 72 | 73 | 74 | `haplonerate.py` can be found in this same GitHub Repository. 75 | 76 | The files are now ready to run `haplonerate.py` which can be found here: https://github.com/mossmatters/phyloscripts/tree/master/haplonerate 77 | 78 | Name the output file from `haplonerate.py` as `prefix.supercontigs.alleles.fasta` 79 | 80 | ### `intron_exon_extractor.py` 81 | 82 | Generates separate files for the intron, exon, and supercontig sequences for one sample. Will generate a separate file for each sequence. Can handle default HybPiper, IUPAC-coded, and phased-allele datasets where `_h1` and `_h2` are appended to the sample name. 83 | 84 | Command: `python intron_exon_extractor.py prefix` 85 | 86 | ### `create_alleles_alignments.sh` 87 | 88 | Generate combined intron and exon alignments from phased sequences: 89 | 90 | 1. Align exon sequences using MACSE 91 | 2. Align intron sequences using mafft 92 | 3. Trim both alignments using TrimAl 93 | 4. Combine intron and exon alignments and generate a RAXML partition file using `combine_alignments.py` 94 | 95 | **NOTE: This script is provided only to show parameter settings! The PATH to files and executables is specific to our computer!** 96 | 97 | ### `combine_alignments.py` 98 | 99 | Given an exon alignment and an intron alignment for the same gene, combine them and also output a partition file for RAXML based on codon position and intron location. 100 | 101 | Command `python combine_alignments.py exon.fasta intron.fasta geneName` 102 | 103 | *Requires BioPython* -------------------------------------------------------------------------------- /alleles_workflow/combine_alignments.py: -------------------------------------------------------------------------------- 1 | 2 | #Script to combine exon and intron alignments for a gene and generate a RAxML partition file. 
import os
import sys


def build_partition(exon_length, intron_length=None):
    """Return the RAxML partition text for one combined alignment.

    The exon occupies columns 1..exon_length and is split into two
    partitions (codon positions 1+2, and position 3). When intron_length
    is given, a third partition covers the intron columns that follow
    the exon.
    """
    lines = [
        "DNA, codons1-2 = 1-{0}\\3, 2-{0}\\3".format(exon_length),
        "DNA, codon3 = 3-{0}\\3".format(exon_length),
    ]
    if intron_length is not None:
        lines.append("DNA, intron = {}-{}".format(exon_length + 1, exon_length + intron_length))
        # Trailing blank line kept so the file matches earlier output byte for byte.
        lines.append("")
    return "\n".join(lines) + "\n"


def main(argv=None):
    """Combine one gene's exon and intron alignments and write its partition file.

    Expects argv to be [script, exon.fasta, intron.fasta, geneName] and
    writes "<geneName>.combined.fasta" and "<geneName>.combined.partition"
    in the current directory. If the intron file is missing or holds no
    sequences, only the exon alignment is written.

    Returns the process exit status (0 on success, 1 on a usage error or
    an empty exon alignment).
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        print("Usage: python combine_alignments.py exon.fasta intron.fasta geneName")
        return 1

    # Imported here so the usage message is available without Biopython.
    from Bio import SeqIO

    exon_fn, intron_fn, gene_name = argv[1:4]

    exon_dict = SeqIO.to_dict(SeqIO.parse(exon_fn, 'fasta'))
    if not exon_dict:
        sys.stderr.write("No sequences found in {}\n".format(exon_fn))
        return 1
    # dict.itervalues() is Python 2 only; this form runs on both 2 and 3.
    exon_length = len(next(iter(exon_dict.values())))

    intron_records = []
    if os.path.isfile(intron_fn):
        intron_records = list(SeqIO.parse(intron_fn, 'fasta'))

    with open("{}.combined.fasta".format(gene_name), 'w') as outfile:
        if intron_records:
            for seq in intron_records:
                # Intron IDs carry the sample name before the first "-".
                sample_id = seq.id.split("-")[0]
                combined = exon_dict[sample_id].seq + seq.seq
                outfile.write(">{}\n{}\n".format(sample_id, combined))
            # As before, the intron length is taken from the last record read.
            partition = build_partition(exon_length, len(intron_records[-1]))
        else:
            for sample_id in exon_dict:
                outfile.write(">{}\n{}\n".format(sample_id, exon_dict[sample_id].seq))
            partition = build_partition(exon_length)

    with open("{}.combined.partition".format(gene_name), 'w') as partitionfile:
        partitionfile.write(partition)
    return 0


if __name__ == "__main__":
    sys.exit(main())
# NOTE: every path below is specific to the authors' machine; adjust before reuse.
cd ~/Projects/artocarpus/alleles_paper/haplotype_sequences

genelist=/home/mjohnson/Projects/artocarpus/alleles_paper/singlecopy_genelist.txt
namelist=/home/mjohnson/Projects/artocarpus/alleles_paper/namelist_ajb.txt

##########EXONS############

###### Exon sequences generated from HybPiper output:

# rm -f: a plain "rm dir/*" fails when the directory is empty (as it is on
# the first run), and under "set -e" that failure would abort the script.
mkdir -p exon
rm -f exon/*
# NOTE(review): one job per sample appends to the same per-gene file, so the
# order of samples within each file depends on job scheduling.
parallel "cat /home/mjohnson/Projects/artocarpus/alleles_paper/haplotype_sequences/{1}/exon/{2}.alleles.FNA >> exon/{2}.alleles.FNA" :::: $namelist :::: $genelist

##### Alignments with MACSE

parallel --eta macse -prog alignSequences -seq exon/{}.alleles.FNA :::: $genelist

##### Replace frame shifts ! with gaps -

mkdir -p macse
rm -f macse/*
mv exon/*_macse* macse

parallel sed -i "s/\!/-/g" macse/{}.alleles_macse_NT.fasta :::: $genelist

##### Trim alignments to retain only sites present in 75% of taxa

mkdir -p exon_trimmed
rm -f exon_trimmed/*

parallel "trimal -gt 0.75 -in macse/{}.alleles_macse_NT.fasta -out exon_trimmed/{}.alleles.macse.trimmed.FNA" :::: $genelist

#Fix McBryde-MV2
# The file name must match the trimal output written just above.

parallel sed -i -E 's/McBryde-MV2/McBryde/g' exon_trimmed/{}.alleles.macse.trimmed.FNA :::: $genelist

###########INTRONS#########

##### Intron sequences generated from HybPiper (intronerate.py):

mkdir -p intron
rm -f intron/*
parallel "cat /home/mjohnson/Projects/artocarpus/alleles_paper/haplotype_sequences/{1}/intron/{2}.intron.alleles.fasta >> intron/{2}.intron.alleles.fasta" :::: $namelist :::: $genelist

# Remove gene name from intron sequence files

#parallel sed -i -E 's/-.+$//g' intron/{}.intron.alleles.fasta :::: $genelist

# Align intron sequences with MAFFT. Timeout because of known huge sequence.
#!/bin/bash
set -eo pipefail
# Prepare phased haplotype sequences for one sample.
#
# Usage: extract_phase_bcftools.sh prefix
#
# Works inside the directory "prefix", which must already contain
#   prefix.supercontigs.fasta.snps.whatshap.vcf
# and writes one FASTA file per gene to prefix/phased_bcftools/, holding
# haplotype 1 followed by haplotype 2.
#
# genelist.txt (one gene name per line) is read from inside "prefix".
# The reference FASTA is read from $iupac_dir, which may be exported by the
# caller; it defaults to the sample directory itself.

if [ -z "$1" ]; then
    echo "Usage: $0 prefix" >&2
    exit 1
fi

prefix=$1
genelist=genelist.txt
iupac_dir=${iupac_dir:-.}
vcf=$prefix.supercontigs.fasta.snps.whatshap.vcf

if [ ! -f "$prefix/$vcf" ]; then
    echo "Cannot find $prefix/$vcf" >&2
    exit 1
fi

cd "$prefix"

#Run bcftools to extract sequences

# Only this script's own outputs are replaced. The sample directory is not
# emptied, because it holds the phased VCF read below.
bgzip -c "$vcf" > "$vcf.gz"
tabix -f "$vcf.gz"
rm -rf phased_bcftools
mkdir -p phased_bcftools

parallel "samtools faidx $iupac_dir/$prefix.supercontigs.fasta $prefix-{1} | bcftools consensus -H 1 $vcf.gz > phased_bcftools/$prefix-{1}.phased.fasta" :::: "$genelist"
parallel "samtools faidx $iupac_dir/$prefix.supercontigs.fasta $prefix-{1} | bcftools consensus -H 2 $vcf.gz >> phased_bcftools/$prefix-{1}.phased.fasta" :::: "$genelist"

cd ..
#!/usr/bin/env python
# Use the GFF file from intronerate and the per-sample supercontig FASTA file
# to write separate exon, intron and supercontig FASTA files for each gene.
#
# Usage: intron_exon_extractor.py prefix
#
# Reads  prefix.intronerate.gff and prefix.supercontigs.<type>.fasta
# Writes exon/, intron/ and supercontig/ directories (recreated on every run).

import re,sys,os,errno,shutil
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def mkdir_p(path):
    '''Create directory `path`; an already existing directory is not an error.'''
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def get_intron_ranges(exon_ranges, gene_length):
    '''Return the half-open (start, stop) slices of a gene not covered by exons.

    exon_ranges is a list of 0-based half-open (start, stop) exon slices that
    must already be sorted by position. One interval is returned before the
    first exon, one between each pair of neighbouring exons and one after the
    last exon, so len(result) == len(exon_ranges) + 1. Intervals may be empty.
    '''
    intron_ranges = []
    previous_stop = 0
    for exon_start, exon_stop in exon_ranges:
        # The exon start is already a 0-based slice index, so it is used as-is
        # for the exclusive end of the preceding intron. Subtracting one more
        # would lose the last intron base and yield -1 for an exon at position 1.
        intron_ranges.append((previous_stop, exon_start))
        previous_stop = exon_stop
    intron_ranges.append((previous_stop, gene_length))
    return intron_ranges


# Matches the "<number> <name>:1" description lines found in the IUPAC FASTA.
my_re = re.compile(r"([0-9]+ )(.+):1")

prefix=sys.argv[1]

if os.path.isdir(prefix):
    os.chdir(prefix)

# Fix the names in the IUPAC file: keep only the sequence name as the record ID.
raw_iupac_fn = "{}.supercontigs.fasta.iupac".format(prefix)
if os.path.isfile(raw_iupac_fn):
    with open("{}.supercontigs.iupac.fasta".format(prefix),'w') as outfile:
        for seq in SeqIO.parse(raw_iupac_fn,'fasta'):
            seq.id = my_re.sub(r"\g<2>",seq.description)
            seq.description = ''
            SeqIO.write(seq,outfile,'fasta')


# Parse the GFF into one list of exon slices per gene.
# ASSUMES THE GFF IS SORTED WITHIN EACH GENE!!!!
# GFF coordinates are 1-based and inclusive, hence start-1 for a Python slice.

intron_dict = {}
exon_dict = {}

gff_fn = prefix+".intronerate.gff"
with open(gff_fn) as gff_file:
    for line in gff_file:
        fields = line.split()
        # Skip blank lines, "#" header/comment lines and truncated records.
        if len(fields) < 5 or fields[0].startswith("#"):
            continue
        if fields[2] == "exon":
            exon_dict.setdefault(fields[0],[]).append((int(fields[3])-1,int(fields[4])))

# Use the first supercontig file that exists, in order of preference. If none
# exists the last candidate is parsed anyway so the usual file error is raised.
for dataType in ("iupac","alleles","svdq","default"):
    supercontig_fn = "{}.supercontigs.{}.fasta".format(prefix,dataType)
    if os.path.isfile(supercontig_fn):
        break
supercontig_dict = SeqIO.to_dict(SeqIO.parse(supercontig_fn,'fasta'))


for gene in exon_dict:
    try:
        geneLength = len(supercontig_dict[gene])
    except KeyError:
        # Phased files name their records <prefix>_h1-<gene> / <prefix>_h2-<gene>.
        haploGeneName = "{}_h1-{}".format(prefix,gene.split("-")[-1])
        geneLength = len(supercontig_dict[haploGeneName])
    intron_dict[gene] = get_intron_ranges(exon_dict[gene],geneLength)


for seqType in ["exon","intron","supercontig"]:
    if os.path.exists(seqType):
        shutil.rmtree(seqType)
    os.makedirs(seqType)

for gene in supercontig_dict:
    # Record IDs look like <sample>-<gene>; McBryde samples carry one extra
    # dash-separated field in front.
    if gene.startswith("McBryde"):
        geneName = gene.split("-")[2]
        sampleName = gene.split("-")[1]
    else:
        geneName = gene.split("-")[1]
        sampleName = gene.split("-")[0]

    record = supercontig_dict[gene]
    with open("exon/{}.{}.FNA".format(geneName,dataType),'a') as exonout:
        # The GFF is keyed by the unphased name, so drop the haplotype suffix.
        exonLookupName = record.id.replace("_h1",'').replace("_h2",'')
        if exonLookupName not in exon_dict:
            continue
        newseq = "".join(str(record.seq[start:stop]) for start,stop in exon_dict[exonLookupName])
        exonout.write(">{}\n{}\n".format(sampleName,newseq))
    with open("intron/{}.intron.{}.fasta".format(geneName,dataType),'a') as intronout:
        newseq = "".join(str(record.seq[start:stop]) for start,stop in intron_dict[exonLookupName])
        intronout.write(">{}\n{}\n".format(sampleName,newseq))

    with open("supercontig/{}.supercontig.{}.fasta".format(geneName,dataType),'a') as supercontigout:
        supercontigout.write(">{}\n{}\n".format(sampleName,record.seq))
ambiguity bases. Uses paired reads. 14 | 15 | #The script should be run on a FASTA file containing all the supercontigs of interest. 16 | 17 | 18 | if [[ $# -eq 0 ]] ; then 19 | echo 'usage: hybpiper_ambiguity.sh supercontig.fasta readfile1.fq readfile2.fq' 20 | exit 1 21 | fi 22 | 23 | #########CHANGE THESE PATHS AS NEEDED########### 24 | 25 | gatkpath=/opt/Software/GenomeAnalysisTK.jar 26 | picardpath=/opt/Software/picard/build/libs/picard.jar 27 | 28 | #############COMMAND LINE ARGUMENTS############ 29 | 30 | prefix=$1 31 | read1fq=$2 32 | read2fq=$3 33 | 34 | mkdir $prefix 35 | cd $prefix 36 | 37 | #while read i 38 | #do 39 | #cat ~/Projects/artocarpus/alleles_paper/hybpiper/$prefix/$i/$prefix/sequences/intron/"$i"_supercontig.fasta 40 | #done < ~/Projects/artocarpus/alleles_paper/newtargets_genelist.txt >> $prefix.supercontigs.fasta 41 | 42 | supercontig=$prefix.supercontigs.fasta 43 | 44 | #read1fq=~/Projects/artocarpus/alleles_paper/reads/"$prefix".R1.paired.fastq 45 | #read2fq=~/Projects/artocarpus/alleles_paper/reads/"$prefix".R2.paired.fastq 46 | 47 | #####STEP ZERO: Make Reference Databases 48 | 49 | java -jar $picardpath CreateSequenceDictionary \ 50 | R=$supercontig 51 | bwa index $supercontig 52 | samtools faidx $supercontig 53 | 54 | #####STEP ONE: Map reads 55 | 56 | echo "Mapping Reads" 57 | 58 | bwa mem $supercontig $read1fq $read2fq | samtools view -bS - | samtools sort - -o $supercontig.sorted.bam 59 | 60 | java -jar $picardpath FastqToSam \ 61 | F1=$read1fq \ 62 | F2=$read2fq \ 63 | O=$supercontig.unmapped.bam \ 64 | SM=$supercontig 65 | 66 | java -jar $picardpath MergeBamAlignment \ 67 | ALIGNED=$supercontig.sorted.bam \ 68 | UNMAPPED=$supercontig.unmapped.bam \ 69 | O=$supercontig.merged.bam \ 70 | R=$supercontig 71 | 72 | #####STEP TWO: Mark duplicates 73 | 74 | echo "Marking Duplicates" 75 | java -jar $picardpath MarkDuplicates \ 76 | I=$supercontig.merged.bam \ 77 | O=$supercontig.marked.bam \ 78 | M=$supercontig.metrics.txt 79 | 80 | 
#######STEP THREE: Identify variants, select only SNPs 81 | 82 | echo "Identifying variants" 83 | 84 | samtools index $supercontig.marked.bam 85 | #samtools mpileup -B -f $supercontig $supercontig.marked.bam -v -u > $supercontig.vcf 86 | 87 | java -jar $gatkpath \ 88 | -R $supercontig \ 89 | -T HaplotypeCaller \ 90 | -I $supercontig.marked.bam \ 91 | -o $supercontig.vcf 92 | 93 | 94 | 95 | time java -jar $gatkpath \ 96 | -T SelectVariants \ 97 | -R $supercontig \ 98 | -V $supercontig.vcf \ 99 | -selectType SNP \ 100 | -o $supercontig.snps.vcf 101 | 102 | 103 | ######STEP FOUR: Output new supercontig FASTA with ambiguity codes 104 | 105 | echo "Generating IUPAC FASTA file" 106 | 107 | java -jar $gatkpath \ 108 | -T FastaAlternateReferenceMaker \ 109 | -R $supercontig \ 110 | -o $supercontig.iupac \ 111 | -V $supercontig.snps.vcf \ 112 | -IUPAC $supercontig 113 | 114 | cd .. 115 | cp -r $prefix /home/mjohnson/Projects/artocarpus/alleles_paper/iupac_sequences/$prefix 116 | 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /brlenoutliers/brlen_outliers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | helptext='''This script identfies branch length outliers on a phylogeny. An outlier is a 4 | branch with a length that exceeds a percentage of the maximum depth of the tree. 5 | 6 | The input is a file containing one tree in newick format, and an optional file containing 7 | a list of outgroup taxa (one per line). 8 | 9 | The output will be an ASCII depiction of each branch with a length exceeding the threshold 10 | (default 25% for ingroups, 75% for outgroups). A PNG file will also be generated for the 11 | tree, with outgroup taxa in blue and branch length outliers in red. 
import argparse
import os
import sys


def get_bad_nodes(t,inlen,outlen,leaflen,outgroups=None):
    '''Return the nodes of tree `t` whose branch length is an outlier.

    A branch is an outlier when its length exceeds a fraction of the tree
    depth (the distance reported by t.get_farthest_node()). The fraction is
    `outlen` when every leaf below the node is listed in `outgroups`,
    otherwise `leaflen` for leaves and `inlen` for internal nodes.

    Each outlier node is printed as it is found; the nodes are returned in
    traversal order. `outgroups` is a collection of leaf names or None.
    '''
    bad_nodes = []
    tree_depth = t.get_farthest_node()[1]
    for node in t.traverse():
        # Without an outgroup list nothing is treated as an outgroup clade.
        is_outgroup = bool(outgroups) and all(
            leaf.name in outgroups for leaf in node.get_leaves())
        if is_outgroup:
            threshold = outlen
        elif node.is_leaf():
            threshold = leaflen
        else:
            threshold = inlen
        if node.dist > tree_depth * threshold:
            print(node)
            bad_nodes.append(node)
    return bad_nodes


def make_png(t,bad_nodes,png_name,outgroups=None):
    '''Render tree `t` to the file `png_name`, highlighting outliers.

    Branches leading to nodes in `bad_nodes` are drawn in red, outgroup leaf
    names in blue and all other leaf names in black. Nothing is rendered when
    `bad_nodes` is empty.
    '''
    # Imported here so that get_bad_nodes can be used without the graphical
    # dependencies of ete3.
    from ete3 import NodeStyle, TextFace, TreeStyle

    for n in t.traverse():
        n.img_style["size"]=0
        if n in bad_nodes:
            nstyle = NodeStyle()
            nstyle["hz_line_color"] = "red"
            nstyle["hz_line_width"] = 3
            n.set_style(nstyle)
        if n.is_leaf():
            is_outgroup = bool(outgroups) and n.name in outgroups
            name_face = TextFace(n.name, fgcolor="blue" if is_outgroup else "black")
            n.add_face(name_face,0,"branch-right")

    if len(bad_nodes) > 0:
        ts = TreeStyle()
        # Leaf names are drawn by the coloured faces added above.
        ts.show_leaf_name = False
        ts.title.add_face(TextFace(png_name,fsize=15,bold=True),0)
        t.render(png_name,tree_style=ts)


def main():
    '''Command-line entry point: report outlier branches and draw the tree.'''
    from ete3 import Tree

    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("treefile",help="File containing one tree in newick format")
    parser.add_argument("--outgroups",help="File containing list of outgroup taxa, one per line")
    parser.add_argument("--png",help="Name of png file, default is same as tree file name",default=None)
    parser.add_argument("--inlen",help="Percentage of max tree depth for ingroup outliers default = %(default)s",default=0.25,type=float)
    parser.add_argument("--outlen",help="Percentage of max tree depth for outgroup outliers default = %(default)s",default=0.75,type=float)
    parser.add_argument("--leaflen",help="Percentage of max tree depth for leaf outliers default = %(default)s",default=0.25,type=float)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    if os.path.isfile(args.treefile):
        t = Tree(args.treefile)
    else:
        print("Treefile {} not found!\n".format(args.treefile))
        sys.exit(1)

    if args.png:
        if args.png.endswith(".png"):
            png_name = args.png
        else:
            png_name = args.png + ".png"
    else:
        png_name = os.path.basename(args.treefile).split(".")[0] + ".png"

    if args.outgroups:
        # Close the file promptly and ignore blank lines, which would
        # otherwise add an empty taxon name to the outgroup set.
        with open(args.outgroups) as outgroup_file:
            outgroups = set(line.rstrip() for line in outgroup_file if line.strip())
    else:
        outgroups = None

    bad_nodes = get_bad_nodes(t,args.inlen,args.outlen,args.leaflen,outgroups=outgroups)
    if len(bad_nodes) > 0:
        make_png(t,bad_nodes,png_name,outgroups=outgroups)


if __name__ == "__main__":main()
122 | if args.png: 123 | if args.png.endswith(".png"): 124 | png_name = args.png 125 | else: 126 | png_name = args.png + ".png" 127 | else: 128 | png_name = os.path.basename(args.treefile).split(".")[0] + ".png" 129 | 130 | if args.outgroups: 131 | outgroups = set([x.rstrip() for x in open(args.outgroups)]) 132 | else: 133 | outgroups = None 134 | 135 | bad_nodes = get_bad_nodes(t,args.inlen,args.outlen,args.leaflen,outgroups=outgroups) 136 | if len(bad_nodes) > 0: 137 | make_png(t,bad_nodes,png_name,outgroups=outgroups) 138 | #else: 139 | #print("No outliers found for {}\n".format(os.path.basename(args.treefile))) 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | if __name__ == "__main__":main() 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /haplonerate/README.md: -------------------------------------------------------------------------------- 1 | # Haplonerate 2 | 3 | After using a read-backed phasing algorithm, such as GATK or WhatsHap, allelic sequences generally contain phased alleles throughout the entire reference sequence. However, there is frequently more than one phase block, especially if the data was generated with targeted sequencing (HybSeq) where multiple exons were recovered from a single locus. 4 | 5 | In the example below, variant sites in the blue (left) block cannot be phased with alleles in the yellow (right) block. 6 | 7 | ![](img/AJB_Figure_1.pdf) 8 | 9 | One solution is to reduce the sequence to the longest phase block, deleting other sites. This is not ideal for phylogenetic analysis, as the deleted sites may retain informative sites among species. It will also result in sequences that are of variable lengths across individuals, which may affect alignment. 10 | 11 | Another solution would be to (hard) mask the sequence outside the longest phase block with Ns. This retains the sequence length but intra-individual informative sites are still lost. 
12 | 13 | Finally, phased alleles can be retained in the longest phase block, while variant sites in other regions are replaced with ambiguity codes. 14 | 15 | This script takes two files, containing phased haplotype sequences for one 16 | or more genes and edits the sequences to retain only variable sites in the largest phase 17 | block. The two haplotype sequences can be generated by bcftools, for example. 18 | 19 | 20 | 21 | 22 | ## Setup 23 | 24 | 1. Following variant calling, generate a new reference sequence that contains ambiguity codes. In GATK, use `FastaAlternateReferenceMaker`. 25 | 2. Run a read-backed phasing algorithm, such as WhatsHap (http://whatshap.readthedocs.io/), which generates a phased VCF file and a GTF file containing the locations of phase blocks. 26 | 3. Generate separate phased sequences for each gene in FASTA format, for example using `bcftools consensus` in the samtools package. 27 | 4. Run `haplonerate.py` to adjust the sequences. 28 | 29 | 30 | ## Input 31 | 32 | **Required**: 33 | 34 | * GTF file annotating the locations of phase blocks. 35 | * Two FASTA files containing sequences for one or more genes. The script assumes that the sequences are paired: the first sequence in the first file corresponds to the first sequence in the second file. 36 | 37 | **Options** 38 | 39 | `--output` Specifies an output file for the edited sequences (default is `stdout`). Both alleles are written to the same file with `_h1` or `_h2` appended to the name. 40 | 41 | `--block` Specifies a file for printing the phase block information for each gene (number of blocks, length of gene, and start and end of the longest block). 42 | 43 | `--edit` Which editing method is preferred.
There are three options for editing the output sequences: 44 | 45 | * **delete**: retain only the longest phase block, delete the rest of the sequence 46 | * **ref**: use reference sequences to fill the rest of the sequence outside the longest block (default) 47 | * **mask**: fill sequence not in the longest phase block with N 48 | 49 | If `--edit ref` is used, the reference sequence must be supplied with `--reference`. 50 | 51 | ## Example Usages 52 | 53 | **Default usage (`--edit ref`)** 54 | 55 | `haplonerate.py whatshap.gtf haplotype_h1.fasta haplotype_h2.fasta --reference ambiguity_ref.fasta` 56 | 57 | ### Triploid Data 58 | 59 | Use the `haplonerate3N.py` script instead: 60 | 61 | `haplonerate.py whatshap.gtf haplotype_h1.fasta haplotype_h2.fasta haplotype_h3.fasta --reference ambiguity_ref.fasta --edit delete` 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /haplonerate/haplonerate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import sys,os,argparse 5 | from Bio import SeqIO 6 | from Bio.Seq import Seq 7 | from Bio.SeqRecord import SeqRecord 8 | 9 | helptext = '''This script takes two files, containing phased haplotype sequences for one 10 | or more genes and edits the sequences to retain only variable sites in the largest phase 11 | block. The two haplotype sequences can be generated by bcftools, for example. 
12 | 13 | There are three options for editing the sequences: 14 | 15 | delete: retain only the longest phase block, delete the rest of the sequence 16 | ref: use reference sequences to fill the rest of the sequence outside the longest block 17 | mask: fill sequence not in the longest phase block with N 18 | 19 | haplonerate reads the phase blocks from a GTF file, such as the one produced by the stats 20 | function in whatshap 21 | 22 | ''' 23 | 24 | def get_gtf_dict(gtf_fn): 25 | gtf_dict = {} 26 | for line in open(gtf_fn): 27 | line = line.split() 28 | geneName = line[0]#.split("-")[-1] 29 | phase_range = (int(line[3]),int(line[4])) 30 | try: 31 | gtf_dict[geneName].append(phase_range) 32 | except KeyError: 33 | gtf_dict[geneName] = [phase_range] 34 | return gtf_dict 35 | 36 | 37 | 38 | #prefix = sys.argv[1] 39 | 40 | #Use the bcftools method to extract haplotypes from each gene 41 | 42 | #seqdirectory = "/home/mjohnson/Projects/artocarpus/alleles_paper/iupac_sequences/{}".format(prefix) 43 | #geneList = set([x.rstrip() for x in open("/home/mjohnson/Projects/artocarpus/alleles_paper/newtargets_genelist.txt")]) 44 | #os.chdir(prefix) 45 | 46 | #Read in the ambiguity coded sequences into a dictionary 47 | 48 | #iupac_dict = SeqIO.to_dict(SeqIO.parse("{}/{}.supercontigs.iupac.fasta".format(seqdirectory,prefix),'fasta')) 49 | 50 | 51 | 52 | #Use the GTF from Whatshap to determine the longest phase block for the sequence. 
def getLargestPhaseBlock(ranges,seqLength):
    '''Given the phase blocks for a sequence, return the most inclusive range.

    `ranges` is a list of (start, end) phase blocks in 1-based inclusive
    coordinates. Each block is widened to run from just after the previous
    block to just before the next one (or to the sequence ends), and the
    widest such window is returned as a 1-based inclusive (start, end) tuple.
    The first window wins a tie.

    With no phase blocks at all the whole sequence, (1, seqLength), is
    returned, which is also the result for a single block.
    '''
    if not ranges:
        return (1,seqLength)
    # Neighbouring blocks are looked up by index, so they must be in
    # positional order.
    ranges = sorted(ranges)
    last = len(ranges) - 1
    most_inclusive_range = None
    longestblock = None
    for r in range(len(ranges)):
        start = 1 if r == 0 else ranges[r-1][1] + 1
        end = seqLength if r == last else ranges[r+1][0] - 1
        # Always accept the first window so that a result exists even when
        # every window has zero width.
        if longestblock is None or end - start > longestblock:
            most_inclusive_range = (start,end)
            longestblock = end - start
    return most_inclusive_range

def insertPhase(iupacSeq,haploSeq,phaseBlock,newSeqID):
    '''Given an IUPAC sequence, the phased haplotype sequence, and the longest phaseBlock,
    Return one sequence with phased characters in the block, IUPAC sequences outside it.

    `phaseBlock` is a 1-based inclusive (start, end) tuple. The result has the
    length of `iupacSeq`. Raises ValueError if the haplotype sequence is too
    short to cover the phase block.
    '''
    iupac = str(iupacSeq.seq)
    haplo = str(haploSeq.seq)
    # Convert to a 0-based half-open slice, clamped to the IUPAC sequence.
    block_start = max(phaseBlock[0] - 1, 0)
    block_end = max(min(phaseBlock[1], len(iupac)), block_start)
    if len(haplo) < block_end:
        raise ValueError("Haplotype sequence {} is shorter than its phase block".format(haploSeq.id))
    newSeq = iupac[:block_start] + haplo[block_start:block_end] + iupac[block_end:]
    return SeqRecord(Seq(newSeq),id=newSeqID,description='')


def replace_with_ref(seq1,seq2,ref,phaseBlock):
    '''Return the two haplotypes with everything outside `phaseBlock` taken from `ref`.

    When both haplotypes are identical a single copy of the reference record
    is returned instead, without a haplotype suffix on its ID.
    '''
    if seq1.seq == seq2.seq:
        return [SeqRecord(ref.seq,id=ref.id,description='')]
    haplo1 = insertPhase(ref,seq1,phaseBlock,"{}_h1".format(seq1.id))
    haplo2 = insertPhase(ref,seq2,phaseBlock,"{}_h2".format(seq2.id))
    return [haplo1,haplo2]

def delete_extra(seq1,seq2,ref,phaseBlock):
    '''Return the two haplotypes trimmed to `phaseBlock`.

    `phaseBlock` is a 1-based inclusive (start, end) tuple, the same
    convention insertPhase uses. When both haplotypes are identical a single
    copy of the full reference record is returned instead.
    '''
    if seq1.seq == seq2.seq:
        return [SeqRecord(ref.seq,id=ref.id,description='')]
    # 1-based inclusive -> 0-based half-open, so the first base of the block
    # is kept.
    block_start = max(phaseBlock[0] - 1, 0)
    block_end = phaseBlock[1]
    haplo1 = SeqRecord(seq1.seq[block_start:block_end],id="{}_h1".format(seq1.id),description='')
    haplo2 = SeqRecord(seq2.seq[block_start:block_end],id="{}_h2".format(seq2.id),description='')
    return [haplo1,haplo2]

def main():
    '''Command-line entry point: edit phased haplotypes and write them as FASTA.'''
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("gtf",help="gtf file annotating the positions of phase blocks for each gene")
    parser.add_argument("haplotype_files",help="Two FASTA files containing sequences for one or more genes",nargs="+")
    parser.add_argument("--reference","-r",help="FASTA file of reference sequences, required with --edit ref")
    parser.add_argument("--edit",help="How to deal with sites outside longest phase block. Default: ref",default="ref",choices=["ref","delete","mask"])
    parser.add_argument("--output",'-o',help="Output FASTA containing haplotype sequences for each gene. default = stdout",default=sys.stdout)
    parser.add_argument("--block",help="file to write phase block information")
    args = parser.parse_args()

    if len(args.haplotype_files) != 2:
        print("Please supply exactly two haplotype FASTA files!\n")
        sys.exit(1)
    # The reference supplies the gene lengths and the sequences of unphased
    # genes, so every editing mode needs it.
    if args.reference is None:
        parser.error("a reference FASTA file must be supplied with --reference")
    # Fail loudly instead of silently leaving every phased gene out of the output.
    if args.edit == "mask":
        parser.error("--edit mask is not implemented; use ref or delete")

    gtf_dict = get_gtf_dict(args.gtf)
    ref_dict = SeqIO.to_dict(SeqIO.parse(args.reference,'fasta'))

    haplotype1_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[0],'fasta'))
    haplotype2_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[1],'fasta'))
    seqs_to_write = []
    phase_report = []
    # Iterate in file order so the output order is reproducible between runs.
    for gene in haplotype1_dict:
        # Genes missing from the reference cannot be processed and are skipped.
        if gene not in ref_dict:
            continue
        if gene in gtf_dict:
            phaseBlock = getLargestPhaseBlock(gtf_dict[gene], len(ref_dict[gene]))
            phase_report.append("{}\t{}\t{}\t{}\t{}".format(gene,len(gtf_dict[gene]),len(ref_dict[gene]),phaseBlock[0],phaseBlock[1]))
            if args.edit == 'ref':
                seqs_to_write += replace_with_ref(haplotype1_dict[gene],haplotype2_dict[gene],ref_dict[gene],phaseBlock)
            elif args.edit == "delete":
                seqs_to_write += delete_extra(haplotype1_dict[gene],haplotype2_dict[gene],ref_dict[gene],phaseBlock)
        else:
            # No phase block for this gene: write the reference sequence as-is.
            seqs_to_write += [SeqRecord(ref_dict[gene].seq,id=gene,description='')]
    SeqIO.write(seqs_to_write,args.output,'fasta')
    if args.block:
        with open(args.block,'w') as outfile:
            outfile.write("\n".join(phase_report))
ref_dict: 145 | seqs_to_write += [SeqRecord(ref_dict[gene].seq,id=gene,description='')] 146 | SeqIO.write(seqs_to_write,args.output,'fasta') 147 | if args.block: 148 | with open(args.block,'w') as outfile: 149 | outfile.write("\n".join(phase_report)) 150 | 151 | 152 | if __name__ == "__main__":main() 153 | 154 | 155 | 156 | 157 | 158 | #For sites in the longest block, replace sequences in iupac sequence with phased sequence. 159 | 160 | #Use the intron/exon extractor as before 161 | 162 | -------------------------------------------------------------------------------- /haplonerate/haplonerate3N.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import sys,os,argparse 5 | from Bio import SeqIO 6 | from Bio.Seq import Seq 7 | from Bio.SeqRecord import SeqRecord 8 | 9 | helptext = '''This script takes two files, containing phased haplotype sequences for one 10 | or more genes and edits the sequences to retain only variable sites in the largest phase 11 | block. The two haplotype sequences can be generated by bcftools, for example. 
12 | 13 | There are three options for editing the sequences: 14 | 15 | delete: retain only the longest phase block, delete the rest of the sequence 16 | ref: use reference sequences to fill the rest of the sequence outside the longest block 17 | mask: fill sequence not in the longest phase block with N 18 | 19 | haplonerate reads the phase blocks from a GTF file, such as the one produced by the stats 20 | function in whatshap 21 | 22 | ''' 23 | 24 | def get_gtf_dict(gtf_fn): 25 | gtf_dict = {} 26 | for line in open(gtf_fn): 27 | line = line.split() 28 | geneName = line[0]#.split("-")[-1] 29 | phase_range = (int(line[3]),int(line[4])) 30 | try: 31 | gtf_dict[geneName].append(phase_range) 32 | except KeyError: 33 | gtf_dict[geneName] = [phase_range] 34 | return gtf_dict 35 | 36 | 37 | 38 | #prefix = sys.argv[1] 39 | 40 | #Use the bcftools method to extract haplotypes from each gene 41 | 42 | #seqdirectory = "/home/mjohnson/Projects/artocarpus/alleles_paper/iupac_sequences/{}".format(prefix) 43 | #geneList = set([x.rstrip() for x in open("/home/mjohnson/Projects/artocarpus/alleles_paper/newtargets_genelist.txt")]) 44 | #os.chdir(prefix) 45 | 46 | #Read in the ambiguity coded sequences into a dictionary 47 | 48 | #iupac_dict = SeqIO.to_dict(SeqIO.parse("{}/{}.supercontigs.iupac.fasta".format(seqdirectory,prefix),'fasta')) 49 | 50 | 51 | 52 | #Use the GTF from Whatshap to determine the longest phase block for the sequence. 
def getLargestPhaseBlock(ranges,seqLength):
    '''Given the phase blocks for a sequence, return the most inclusive range.

    `ranges` is a list of (start, end) phase blocks in 1-based inclusive
    coordinates. Each block is widened to run from just after the previous
    block to just before the next one (or to the sequence ends), and the
    widest such window is returned as a 1-based inclusive (start, end) tuple.
    The first window wins a tie.

    With no phase blocks at all the whole sequence, (1, seqLength), is
    returned, which is also the result for a single block.
    '''
    if not ranges:
        return (1,seqLength)
    # Neighbouring blocks are looked up by index, so they must be in
    # positional order.
    ranges = sorted(ranges)
    last = len(ranges) - 1
    most_inclusive_range = None
    longestblock = None
    for r in range(len(ranges)):
        start = 1 if r == 0 else ranges[r-1][1] + 1
        end = seqLength if r == last else ranges[r+1][0] - 1
        # Always accept the first window so that a result exists even when
        # every window has zero width.
        if longestblock is None or end - start > longestblock:
            most_inclusive_range = (start,end)
            longestblock = end - start
    return most_inclusive_range

def insertPhase(iupacSeq,haploSeq,phaseBlock,newSeqID):
    '''Given an IUPAC sequence, the phased haplotype sequence, and the longest phaseBlock,
    Return one sequence with phased characters in the block, IUPAC sequences outside it.

    `phaseBlock` is a 1-based inclusive (start, end) tuple. The result has the
    length of `iupacSeq`. Raises ValueError if the haplotype sequence is too
    short to cover the phase block.
    '''
    iupac = str(iupacSeq.seq)
    haplo = str(haploSeq.seq)
    # Convert to a 0-based half-open slice, clamped to the IUPAC sequence.
    block_start = max(phaseBlock[0] - 1, 0)
    block_end = max(min(phaseBlock[1], len(iupac)), block_start)
    if len(haplo) < block_end:
        raise ValueError("Haplotype sequence {} is shorter than its phase block".format(haploSeq.id))
    newSeq = iupac[:block_start] + haplo[block_start:block_end] + iupac[block_end:]
    return SeqRecord(Seq(newSeq),id=newSeqID,description='')


def replace_with_ref(seq1,seq2,ref,phaseBlock):
    '''Return two haplotypes with everything outside `phaseBlock` taken from `ref`.

    When both haplotypes are identical a single copy of the reference record
    is returned instead, without a haplotype suffix on its ID.
    '''
    if seq1.seq == seq2.seq:
        return [SeqRecord(ref.seq,id=ref.id,description='')]
    haplo1 = insertPhase(ref,seq1,phaseBlock,"{}_h1".format(seq1.id))
    haplo2 = insertPhase(ref,seq2,phaseBlock,"{}_h2".format(seq2.id))
    return [haplo1,haplo2]

def delete_extra(seq1,seq2,seq3,ref,phaseBlock):
    '''Return the three haplotypes trimmed to `phaseBlock`.

    `phaseBlock` is a 1-based inclusive (start, end) tuple, the same
    convention insertPhase uses. `ref` is accepted for symmetry with the
    diploid script but is not used: all three haplotypes are always returned.
    '''
    # 1-based inclusive -> 0-based half-open, so the first base of the block
    # is kept.
    block_start = max(phaseBlock[0] - 1, 0)
    block_end = phaseBlock[1]
    haplo1 = SeqRecord(seq1.seq[block_start:block_end],id="{}_h1".format(seq1.id),description='')
    haplo2 = SeqRecord(seq2.seq[block_start:block_end],id="{}_h2".format(seq2.id),description='')
    # Label the third haplotype from its own record rather than from seq1.
    haplo3 = SeqRecord(seq3.seq[block_start:block_end],id="{}_h3".format(seq3.id),description='')
    return [haplo1,haplo2,haplo3]

def main():
    '''Command-line entry point: edit phased triploid haplotypes and write them as FASTA.'''
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("gtf",help="gtf file annotating the positions of phase blocks for each gene")
    parser.add_argument("haplotype_files",help="Three FASTA files containing sequences for one or more genes",nargs="+")
    parser.add_argument("--reference","-r",help="FASTA file of reference sequences, required with --edit ref")
    parser.add_argument("--edit",help="How to deal with sites outside longest phase block. Default: ref",default="ref",choices=["ref","delete","mask"])
    parser.add_argument("--output",'-o',help="Output FASTA containing haplotype sequences for each gene. default = stdout",default=sys.stdout)
    parser.add_argument("--block",help="file to write phase block information")
    args = parser.parse_args()

    if len(args.haplotype_files) != 3:
        print("Please supply exactly three haplotype FASTA files!\n")
        sys.exit(1)
    # The reference supplies the gene lengths and the sequences of unphased
    # genes, so every editing mode needs it.
    if args.reference is None:
        parser.error("a reference FASTA file must be supplied with --reference")
    # Fail loudly instead of silently leaving every phased gene out of the output.
    if args.edit == "mask":
        parser.error("--edit mask is not implemented; use ref or delete")

    gtf_dict = get_gtf_dict(args.gtf)
    ref_dict = SeqIO.to_dict(SeqIO.parse(args.reference,'fasta'))

    haplotype1_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[0],'fasta'))
    haplotype2_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[1],'fasta'))
    haplotype3_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[2],'fasta'))
    seqs_to_write = []
    phase_report = []
    # Iterate in file order so the output order is reproducible between runs.
    for gene in haplotype1_dict:
        # Genes missing from the reference cannot be processed and are skipped.
        if gene not in ref_dict:
            continue
        if gene in gtf_dict:
            phaseBlock = getLargestPhaseBlock(gtf_dict[gene], len(ref_dict[gene]))
            phase_report.append("{}\t{}\t{}\t{}\t{}".format(gene,len(gtf_dict[gene]),len(ref_dict[gene]),phaseBlock[0],phaseBlock[1]))
            if args.edit == 'ref':
                # NOTE(review): only the first two haplotypes are written in
                # this mode; the third is ignored - confirm this is intended.
                seqs_to_write += replace_with_ref(haplotype1_dict[gene],haplotype2_dict[gene],ref_dict[gene],phaseBlock)
            elif args.edit == "delete":
                seqs_to_write += delete_extra(haplotype1_dict[gene],haplotype2_dict[gene],haplotype3_dict[gene],ref_dict[gene],phaseBlock)
        else:
            # No phase block for this gene: write the reference sequence as-is.
            seqs_to_write += [SeqRecord(ref_dict[gene].seq,id=gene,description='')]
    SeqIO.write(seqs_to_write,args.output,'fasta')
    if args.block:
        with open(args.block,'w') as outfile:
            outfile.write("\n".join(phase_report))
replace_with_ref(haplotype1_dict[gene],haplotype2_dict[gene],ref_dict[gene],phaseBlock) 144 | elif args.edit == "delete": 145 | seqs_to_write += delete_extra(haplotype1_dict[gene],haplotype2_dict[gene],haplotype3_dict[gene],ref_dict[gene],phaseBlock) 146 | else: 147 | if gene in ref_dict: 148 | seqs_to_write += [SeqRecord(ref_dict[gene].seq,id=gene,description='')] 149 | SeqIO.write(seqs_to_write,args.output,'fasta') 150 | if args.block: 151 | with open(args.block,'w') as outfile: 152 | outfile.write("\n".join(phase_report)) 153 | 154 | 155 | if __name__ == "__main__":main() 156 | 157 | 158 | 159 | 160 | 161 | #For sites in the longest block, replace sequences in iupac sequence with phased sequence. 162 | 163 | #Use the intron/exon extractor as before 164 | 165 | -------------------------------------------------------------------------------- /haplonerate/img/AJB_Figure_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/haplonerate/img/AJB_Figure_1.pdf -------------------------------------------------------------------------------- /homologizer/convert_to_nexus.py: -------------------------------------------------------------------------------- 1 | 2 | # Script to convert Physcomitrium/Entosthodon phased FASTA gene files into NEXUS format 3 | # Only samples in the list will be retained 4 | # The gene name will also be stripped when writing out 5 | import sys 6 | from Bio import AlignIO,SeqIO 7 | from Bio.Alphabet import generic_dna 8 | from Bio.Nexus import Nexus 9 | from io import StringIO 10 | 11 | 12 | 13 | samples_to_keep = ["Physcomitrium-immersum-3176_h1", 14 | "Physcomitrium-immersum-3176_h2", 15 | "Entosthodon-hungaricus-3838_h1", 16 | "Entosthodon-hungaricus-3838_h2", 17 | "Physcomitrium-pyriforme-3410_h1", 18 | "Physcomitrium-pyriforme-3410_h2"] 19 | 20 | alignment_fn = sys.argv[1] 21 | geneID = 
alignment_fn.split(".")[0] 22 | 23 | reduced_alignment_fn = "{}.revbayes.nexus".format(geneID) 24 | 25 | seqs_to_write = [] 26 | with open(reduced_alignment_fn,'w') as outfile: 27 | for seq in SeqIO.parse(alignment_fn,'fasta', alphabet=generic_dna): 28 | seq.id = seq.id.replace("-{}".format(geneID),"") 29 | 30 | if "_h" in seq.id: 31 | if seq.id in samples_to_keep: 32 | seqs_to_write.append(seq) 33 | else: 34 | seqs_to_write.append(seq) 35 | #SeqIO.write(seqs_to_write,reduced_alignment_fn,'nexus') 36 | 37 | output = StringIO() 38 | SeqIO.write(seqs_to_write, output, 'nexus') 39 | p = Nexus.Nexus() 40 | p.read(output.getvalue()) 41 | p.write_nexus_data(reduced_alignment_fn, interleave=False) 42 | 43 | #AlignIO.convert(reduced_alignment_fn,'fasta',reduced_alignment_fn.replace("fasta","nexus") ,'nexus',generic_dna,interleave=False) 44 | 45 | -------------------------------------------------------------------------------- /homologizer/label_swap.txt: -------------------------------------------------------------------------------- 1 | Entosthodon-hungaricus-3838_h1 Entosthodon-hungaricus-3838_h2 2 | Physcomitrium-immersum-3176_h1 Physcomitrium-immersum-3176_h2 3 | Physcomitrium-pyriforme-3410_h1 Physcomitrium-pyriforme-3410_h2 -------------------------------------------------------------------------------- /homologizer/readme.md: -------------------------------------------------------------------------------- 1 | # Homologizer 2 | 3 | > "It's too late to homologize..." -Not One Republic 4 | 5 | It's not too late! Using the setHomeologPhase function in [RevBayes](revbayes.github.io), gene alignments with pairs of sequences can be adjusted so that labels across genes are homologized. Once adjusted, the data from many genes can be analyzed together, for example using a concatenated supermatrix or summary species tree analysis. 6 | 7 | ### Step 0: Input files 8 | 9 | Gene alignments that include samples that have two homeolog sequences. 
10 | The sequences within each gene should be phased. 11 | For target capture data, see the `alleles_workflow` and `haplonerate` methods in this same [Phyloscripts](https://github.com/mossmatters/phyloscripts) repository for tips on generating phased haplotypes within a gene sequence. The gene alignments can be in FASTA or NEXUS format, but must all end with a consistent file suffix. For example, all fasta files must end with `.fasta` or `.fa`. Place all of the alignment files in a directory. **Do not place any other files in that directory**. 12 | 13 | Users must also prepare a text file containing the naming scheme for sequences, separated by commas. For example: 14 | 15 | Entosthodon-hungaricus-3838_h1,Entosthodon-hungaricus-3838_h2 16 | Physcomitrium-immersum-3176_h1,Physcomitrium-immersum-3176_h2 17 | Physcomitrium-pyriforme-3410_h1,Physcomitrium-pyriforme-3410_h2 18 | 19 | These labels will be used by the RevBayes script to swap labels for the polyploid individuals. 20 | 21 | **In this version, only two sequences per individual are supported** 22 | 23 | This repository also contains `revbayes_template.txt` which is adapted from Will Freyman's [repository](https://github.com/wf8/homeolog_phasing). This file contains all the options for running RevBayes MCMC using the combined alignment. Adjust the parameters as needed (for example, number of generations or substitution model). 24 | 25 | ### Step 1: Making RevBayes scripts 26 | 27 | Given a set of gene alignments, we will need to prepare RevBayes files that have the appropriate alleles to switch. The paired labels are taken from the label swap file and are added to a basic template for RevBayes (taken from Will Freyman's version). 28 | 29 | **For now, only genes with no missing labels are accepted** 30 | 31 | For ease of analysis, all the genes are split up into chunks of genes (by default, max 25 genes). This means that for 250 loci, there will be 10 RevBayes scripts generated.
32 | 33 | Sequences are converted to NEXUS for RevBayes. 34 | 35 | ### Step 2: Run RevBayes on sets of genes 36 | 37 | Issues: 38 | 39 | - MPIrun version of RevBayes crashes, probably a memory error 40 | - Single-thread version of RevBayes grabs all RAM on the machine (256 GB on one machine!) 41 | 42 | ### Step 3: Summarize RevBayes output 43 | 44 | 45 | The script `swap_labels.py` reads the RevBayes log files and calculates the posterior probability of label swapping. A threshold can be picked (set to 95% by default); if the swapping PP is below this threshold, both sequences from that sample are deleted. The output is a directory containing the alignments with the labels swapped. 46 | 47 | # TODO: 48 | 49 | - Get RevBayes running more efficiently 50 | - Accommodate missing labels 51 | - How to summarize across different chunks of genes. Each chunk will be homologized, but no guarantee of this across chunks. -------------------------------------------------------------------------------- /homologizer/revbayes_template.txt: -------------------------------------------------------------------------------- 1 | 2 | num_loci = alignments.size() 3 | num_swap = label_swap.size() 4 | for (i in 1:num_loci) { 5 | data[i] = readDiscreteCharacterData(alignments[i][1]) 6 | } 7 | 8 | 9 | 10 | # add missing taxa 11 | for (i in 1:num_loci) { 12 | for (j in 1:num_loci) { 13 | data[i].addMissingTaxa(data[j].taxa()) 14 | } 15 | } 16 | num_tips = data[1].ntaxa() 17 | 18 | # set initial phase 19 | for (i in 1:num_loci) { 20 | for (j in 1:num_swap){ 21 | data[i].setHomeologPhase(label_swap[j][1],label_swap[j][1]) 22 | } 23 | } 24 | 25 | mvi = 0 26 | 27 | n_branches = 2 * num_tips - 3 28 | for (i in 1:n_branches) { 29 | branch_lengths[i] ~ dnExponential(10) 30 | moves[++mvi] = mvScale(branch_lengths[i], lambda=1.0, weight=2) 31 | } 32 | topology ~ dnUniformTopology(data[1].taxa()) 33 | moves[++mvi] = mvNNI(topology, weight=20.0) 34 | moves[++mvi] = mvSPR(topology, weight=20.0) 35 | tree := treeAssembly(topology, branch_lengths) 36 | 37 | for (i in 1:num_loci) { 38 |
39 | # gtr for each locus 40 | er_prior <- v(1,1,1,1,1,1) 41 | er[i] ~ dnDirichlet(er_prior) 42 | moves[++mvi] = mvSimplexElementScale(er[i], weight=3) 43 | 44 | pi_prior <- v(1,1,1,1) 45 | pi[i] ~ dnDirichlet(pi_prior) 46 | moves[++mvi] = mvSimplexElementScale(pi[i], weight=3) 47 | 48 | Q[i] := fnGTR(er[i], pi[i]) 49 | 50 | ctmc[i] ~ dnPhyloCTMC(tree=tree, Q=Q[i], type="DNA") 51 | ctmc[i].clamp(data[i]) 52 | } 53 | 54 | w = 1/8 55 | for (i in 1:num_loci) { 56 | # switch phasing proposals 57 | for (j in 1:num_swap){ 58 | moves[++mvi] = mvHomeologPhase(ctmc[i],label_swap[j][1],label_swap[j][2],weight=w) 59 | } 60 | } 61 | 62 | mymodel = model(Q) 63 | 64 | monitors[1] = mnModel(filename=output_file + ".log", printgen=10) 65 | monitors[2] = mnFile(filename=output_file + ".trees", printgen=10, tree) 66 | monitors[3] = mnScreen(printgen=10) 67 | for (i in 1:num_loci){ 68 | monitors[i+3] = mnHomeologPhase(filename=output_file + "_" + alignments[i][1] + "_phase.log", printgen=10, ctmc[i]) 69 | } 70 | mymcmc = mcmc(mymodel, monitors, moves) 71 | mymcmc.run(generations=30000) 72 | 73 | treetrace = readTreeTrace(output_file + ".trees", treetype="clock", burnin=0.25) 74 | map_tree = mapTree(treetrace, output_file + ".tree") 75 | 76 | mymcmc.operatorSummary() 77 | -------------------------------------------------------------------------------- /homologizer/revscript_maker.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # This script will generate the RevBayes scripts necessary to run Homologizer on sets of genes. 4 | # 5 | # Matt Johnson, Texas Tech University 6 | 7 | 8 | ######### TO DO / CONSIDERATIONS 9 | # 10 | # RevBayes can read the label-swap from a txt file 11 | 12 | # What will RevBayes do when a taxon that has homeologs is missing? 
13 | # There's code to fill in missing taxa but I suppose it will fill in both alleles 14 | # This SHOULD result in a 50/50 probability for that gene 15 | # Will be accommodated by the script that does the label switching, need to remember that 16 | # maybe the taxon isn't present 17 | # I'm also not sure how many genes will only have one homeolog (and therefore no _h1 or _h2) 18 | # This way, should be able to use one taxon-swap file for all genes 19 | # First version: just use genes with complete sampling. 20 | 21 | 22 | # Also need to get it to read the correct alignments 23 | # Is it possible to pass command line arguments to revbayes scripts? 24 | # Guessing not, so Python can write a header for the script call the appropriate 25 | # 26 | # When doing subsets, how do we know that the phase is the same from subset to subset? 27 | # Could do one final analysis with one gene picked from each of the subsets 28 | 29 | import argparse,pathlib,os 30 | from Bio import AlignIO,SeqIO 31 | from Bio.Alphabet import generic_dna,generic_protein 32 | from Bio.Nexus import Nexus 33 | from io import StringIO 34 | 35 | 36 | helptext = ''' 37 | ###### REQUIREMENTS 38 | # 39 | # Python > 3.5 40 | # Biopython 41 | 42 | ###### INPUTS 43 | # 44 | # Aligned sequence files in a format readable by BioPython 45 | # Max number of genes to include per RevBayes run 46 | 47 | ##### OUTPUTS 48 | # For each run of RevBayes: 49 | # a) a text file containing the locations of the gene alignments in the subset 50 | # b) a RevBayes script to run a subset of genes 51 | # A text file containing the names of the RevBayes scripts (could be used for SGE array jobs) 52 | ''' 53 | 54 | def convert_to_nexus(alignment_fn,genedir,input_type="fasta",seqtype = 'dna'): 55 | if seqtype == 'dna': 56 | seqs_to_write = [seq for seq in SeqIO.parse(os.path.join(genedir,alignment_fn),input_type, alphabet=generic_dna)] 57 | else: 58 | seqs_to_write = [seq for seq in 
SeqIO.parse(os.path.join(genedir,alignment_fn),input_type, alphabet=generic_protein)] 59 | new_fn = ".".join(alignment_fn.split(".")[:-1]) + ".nexus" 60 | output = StringIO() 61 | SeqIO.write(seqs_to_write, output, 'nexus') 62 | p = Nexus.Nexus() 63 | p.read(output.getvalue()) 64 | p.write_nexus_data(os.path.join("nexusfiles",new_fn), interleave=False) 65 | 66 | 67 | 68 | parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter) 69 | parser.add_argument("genedir",help="directory containing all of the gene alignments. No other files should be in this directory.") 70 | parser.add_argument("swaplist",help="file containing labels to swap, two per sample, separated by a comma") 71 | parser.add_argument("--numgenes","-n",help="number of genes to include in each RevBayes job",type=int,default=25) 72 | parser.add_argument("--alignfiletype","-a",help="Alignment file type. Must be one used by BioPython",default="fasta") 73 | parser.add_argument("revbayestemplate",help="Template for RevBayes. 
This script will prepend with alignment and swap info") 74 | args = parser.parse_args() 75 | 76 | 77 | 78 | alignments = os.listdir(args.genedir) 79 | 80 | ######## CONVERT TO NEXUS 81 | 82 | pathlib.Path("nexusfiles").mkdir(parents=True, exist_ok=True) 83 | 84 | if args.alignfiletype != "nexus": 85 | for fn in alignments: 86 | convert_to_nexus(fn,args.genedir) 87 | 88 | split_gene_list = [alignments[x:x+args.numgenes] for x in range(0, len(alignments), args.numgenes)] 89 | print("Will generate {} RevBayes scripts with a max of {} genes".format(len(split_gene_list),args.numgenes)) 90 | 91 | ######## WRITE REVBAYES SCRIPTS 92 | revbayes_header = ''' 93 | ####ADD THESE FROM PYTHON SCRIPT 94 | output_file = "alleles.{}" 95 | label_swap = readTable("{}",delimiter=",") 96 | alignments = readTable("{}") 97 | ####### 98 | ''' 99 | 100 | pathlib.Path("revbayes_scripts").mkdir(parents=True, exist_ok=True) 101 | revbayes_text = open(args.revbayestemplate).read() 102 | 103 | for gs in range(len(split_gene_list)): 104 | genelist_file = "revbayes_scripts/genelist.{}.txt".format(gs) 105 | with open(genelist_file,'w') as outfile: 106 | for gene in split_gene_list[gs]: 107 | outfile.write(args.genedir + "/" + gene+"\n") 108 | 109 | with open("revbayes_scripts/homeolog_phase.{}.Rev".format(gs),'w') as revbayes_out: 110 | revbayes_out.write(revbayes_header.format(gs,args.swaplist,genelist_file)) 111 | revbayes_out.write(revbayes_text) 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /homologizer/swap_labels.py: -------------------------------------------------------------------------------- 1 | #Script to swap labels in alignments based on the results of RevBayes 2 | 3 | 4 | helptext = ''' 5 | ###### Input 6 | # 7 | # List of genes (from first script) 8 | # RevBayes output log files 9 | 10 | ###### Options 11 | # 12 | # Swap threshold- based on posterior distribution, below threshold sample is deleted (default 95%) 
13 | # Burnin percentage (default 10%) 14 | # Output alignment file type (default FASTA) 15 | 16 | ##### Output 17 | # 18 | # Directory containing alignments of specified type with the labels swapped. 19 | 20 | ''' 21 | 22 | import argparse,pathlib,os 23 | import pandas as pd 24 | from Bio import SeqIO, AlignIO 25 | 26 | parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter) 27 | parser.add_argument('genelist',help = "File containing list of gene alignments.") 28 | parser.add_argument('initalAlignmentDir',help="Directory containing RevBayes logs for each alignment") 29 | parser.add_argument('--swappct','-s',help="Posterior probability to do swapping. Below this, both alleles from the sample are deleted.",default=95) 30 | parser.add_argument('--burnin','-b',help="Percentage of RevBayes to discard as burnin",default=10) 31 | parser.add_argument('--outfiletype','-f',help="Alignment file type to output",default='fasta') 32 | parser.add_argument('--infiletype','-i',help="Alignment file type to output",default='fasta') 33 | 34 | args = parser.parse_args() 35 | 36 | 37 | ###### READ IN GENELIST 38 | 39 | genelist = [x.rstrip() for x in open(args.genelist)] 40 | subset_num = args.genelist.split(".")[1] 41 | outputdir = args.initalAlignmentDir+"_swapped" 42 | pathlib.Path(outputdir).mkdir(parents=True, exist_ok=True) 43 | 44 | for gene in genelist: 45 | ###### READ IN REVBAYES LOGS 46 | logfilepath = "alleles.{}_{}_phase.log".format(subset_num,gene) 47 | gene_logfile = pd.read_csv(logfilepath,header=0,index_col=0,sep="\t") 48 | dim_logfile = gene_logfile.shape 49 | burnin = int(dim_logfile[0]/args.burnin) 50 | gene_swap_dict = {} 51 | for h in range(dim_logfile[1]): 52 | pp = gene_logfile.iloc[burnin:,h].value_counts()/dim_logfile[0] * 100 53 | if pp[0] < args.swappct: 54 | #None will indicate the sequence should be skipped when re-writing 55 | gene_swap_dict[pp.name] = None 56 | print("PP for {} in gene {} was 
{}".format(pp.name,gene,pp[0])) 57 | else: 58 | gene_swap_dict[pp.name] = pp.index[0] 59 | with open(os.path.join(outputdir,os.path.split(gene)[1]),'w') as outfile: 60 | for seq in SeqIO.parse(gene,args.infiletype): 61 | if seq.id in gene_swap_dict: 62 | if gene_swap_dict[seq.id]: 63 | seq.id = gene_swap_dict[seq.id] 64 | SeqIO.write(seq,outfile,args.outfiletype) 65 | else: 66 | SeqIO.write(seq,outfile,args.outfiletype) 67 | 68 | ##### READ IN ALIGNMENT AND SWAP LABELS 69 | 70 | -------------------------------------------------------------------------------- /minorityreport/README.md: -------------------------------------------------------------------------------- 1 | # Minority Report 2 | 3 | This script will summarize the concordant and conflicting bipartitions found in a Phyparts analysis by plotting them on the species tree. 4 | 5 | One specific node must be specified. You can get the numerical identifier for the target node by running `phypartspiecharts.py` with the `--show_nodes` flag. 6 | 7 | ## Dependencies 8 | 9 | [ETE3](etetoolkit.org) with Python > 2.7 10 | 11 | [Linux convert](https://www.imagemagick.org/script/convert.php) (part of ImageMagick) 12 | 13 | ## Usage 14 | 15 | Run this from the same directory as the Phyparts ouptut. 16 | 17 | ``` 18 | python minority_report.py species.tre phyparts_root 31 3 19 | ``` 20 | 21 | This command will display the concordant and discordant bipartitions on `species.tre` using the `phyparts_root.alts` and `phyparts_root.hist` files for node number 31. Only bipartitions occurring in at least 3 gene trees will be displayed. 22 | 23 | ## Output 24 | 25 | A PDF will be generated in the current directory using the Linux tool `convert`. The first page of the PDF is the species phylogeny with the selected bipartition highlighted in blue text. In this example, a clade is highlighted on the species tree. 
The number of concordant gene trees (25, in this example) is indicated at the top: 26 | 27 | ![](img/concordant.png) 28 | 29 | All subsequent pages show the *same species tree topology* but alternative, conflicting bipartitions are highlighted. For example, this image shows an alternative bipartition for the same node as above: 30 | 31 | ![](img/conflict1.png) 32 | 33 | 34 | Species GW1701 and NZ609 are found in a bipartition with the top clade in 20 gene trees (almost as many as the concordant bipartition!). 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /minorityreport/img/concordant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/minorityreport/img/concordant.png -------------------------------------------------------------------------------- /minorityreport/img/conflict1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/minorityreport/img/conflict1.png -------------------------------------------------------------------------------- /minorityreport/minority_report.py: -------------------------------------------------------------------------------- 1 | 2 | import os,sys,argparse,subprocess 3 | from ete3 import Tree,TreeStyle,TextFace,NodeStyle 4 | 5 | helptext = '''This script will print tree figures representing the minority bipartitions 6 | found by PhyParts. Given the species tree used for PhyParts, the PhyParts output file 'root' 7 | and the node of interest, one PNG file will be generated per minority bipartition, and 8 | the decendants from the bipartition will be color-coded on the species tree. 
'''  # end of the helptext string literal that is opened just before this block '''
# NOTE: the line above terminates the module-level ``helptext`` literal whose
# opening quotes and text precede this block; on its own it is a harmless
# string expression.

#color_it = ["Entosthodon-hungaricus-3177","Entosthodon-attenuatus-3479","Physcomitrium-spathulatum-3549","Physcomitrium-pyriforme-3727","Physcomitrium-pyriforme-3728","Physcomitrium-immersum-3176","Physcomitrium-pyriforme-3118","Physcomitrella-magdalenae-3844","Physcomitrium-hookeri-3412","Physcomitrium-sp-3842","Entosthodon-attenuatus-3835","Physcomitrium-hookeri-3409","Physcomitrium-pyriforme-3798","Physcomitrium-sp-3115","Physcomitrium-pyriforme-3387","Entosthodon-obtusus-3347","Physcomitrium-pyriforme-3404","Aphanorrhegma-serratum-3305","Physcomitridium-readeri-3892","Entosthodon-americanus-3894","Entosthodon-lindigii-3546","Physcomitrium-sp-3508","Physcomitrium-sp-3672","Entosthodon-attenuatus-3543","Physcomitrium-pyriforme-3555","Physcomitrella-patens-3403","Physcomitrium-collenchymatum-3480","Entosthodon-obtusus-3395","Physcomitrium-eurystomum-3841","Physcomitrium-sp-3551","Physcomitrium-subsphaericum-3556","Physcomitrium-collenchymatum-3178","Physcomitrium-sp-3496","Physcomitrella-patens-3139","Physcomitrium-japonicum-3413","Physcomitrium-japonicum-3411","Physcomitrium-pyriforme-3787","Physcomitrium-pyriforme-3886","Entosthodon-sp-3837","Entosthodon-subintegrus-3840","Physcomitrium-eurystomum-3392","Physcomitrium-sp-3539","Physcomitrium-pyriforme-3883","Physcomitrium-sp-3816","Entosthodon-bergianus-3509","Physcomitrium-sp-3817","Physcomitrium-sp-3814"]

def get_alternative_bipartitions(node,phyparts_root,min_alt):
    """Collect the alternative bipartitions PhyParts recorded for one node.

    Reads ``<phyparts_root>.hist.alts``. Each whitespace-separated record is
    parsed positionally: field 3 is the node number, field 4 has the form
    ``(count):taxon,taxon,...`` and field 5 is a second comma-separated
    taxon list.

    Args:
        node: Integer node number to select.
        phyparts_root: Root name of the PhyParts output files.
        min_alt: Keep a bipartition only if its count is >= this value.

    Returns:
        ``(alt_biparts, alt_counts)`` -- a list of ``(bipart1, bipart2)``
        tuples of taxon-name lists, and the parallel list of integer counts,
        both in file order.
    """
    alt_biparts = []
    alt_counts = []
    with open(phyparts_root + ".hist.alts") as alts_file:
        for line in alts_file:
            fields = line.split()
            if not fields:
                # Blank (e.g. trailing) lines carry no record.
                continue
            if int(fields[2]) != node:
                continue
            count_field, taxa_field = fields[3].split(":")[:2]
            numalt = int(count_field.replace(")","").replace("(",""))
            if numalt >= min_alt:
                alt_biparts.append((taxa_field.split(","),fields[4].split(",")))
                alt_counts.append(numalt)
    return alt_biparts,alt_counts

def render_tree(species_tree,bipart1,num_alt,png_fn,replace_taxon=None):
    """Render ``species_tree`` to ``png_fn`` with one bipartition highlighted.

    Every node gets a text label; nodes whose name is in ``bipart1`` are drawn
    in blue, all others in black. The tree is modified in place (faces added,
    converted to ultrametric, leaves optionally renamed), so callers should
    pass a fresh tree for each rendering.

    Args:
        species_tree: ete3 tree to draw.
        bipart1: Collection of taxon names to highlight.
        num_alt: Number of gene trees, shown in the figure title.
        png_fn: Output image path (also shown in the title).
        replace_taxon: Optional dict mapping tip names to display names.
            Membership in ``bipart1`` is always tested with the original
            name; names missing from the dict are left unchanged.
    """
    highlight_color = "blue"
    default_color = "black"
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.show_scale = False
    nstyle = NodeStyle()
    nstyle["size"] = 0

    ts.title.add_face(TextFace("{} bipartition in {} gene trees".format(png_fn,num_alt),fsize=15,bold=True),0)
    plot_tree = species_tree
    for node in plot_tree.traverse():
        node.set_style(nstyle)
        color = highlight_color if node.name in bipart1 else default_color
        label = node.name
        if replace_taxon and node.is_leaf():
            # The face text is fixed when the face is created, so the
            # substituted name has to be applied here to appear in the figure.
            label = replace_taxon.get(node.name,node.name)
        node.add_face(TextFace(label,fgcolor=color),0,'branch-right')
    if replace_taxon:
        # Keep the tree's own leaf names in step with what is drawn.
        for leaf in plot_tree.get_leaves():
            leaf.name = replace_taxon.get(leaf.name,leaf.name)
    plot_tree.convert_to_ultrametric()
    plot_tree.render(png_fn,tree_style=ts,w=600)

def majority_tree(species_tree,node_num,phyparts_root,replace_taxon=None):
    """Render the species tree with the concordant bipartition of a node.

    The number of concordant gene trees is the line count of
    ``<phyparts_root>.concord.node.<node_num>``; the clade is read from the
    matching record of ``<phyparts_root>.node.key``. The image is written to
    ``node_<node_num>_speciestree.png``.

    Args:
        species_tree: ete3 tree to draw (modified in place by rendering).
        node_num: Integer PhyParts node number.
        phyparts_root: Root name of the PhyParts output files.
        replace_taxon: Optional tip-name translation passed to ``render_tree``.

    Raises:
        ValueError: If ``node_num`` does not occur in the ``.node.key`` file.
    """
    with open("{}.concord.node.{}".format(phyparts_root,node_num)) as concord_file:
        num_concord = sum(1 for _ in concord_file)
    png_fn = "node_{}_speciestree.png".format(node_num)
    subtree_bipart = None
    with open(phyparts_root+".node.key") as key_file:
        for line in key_file:
            fields = line.split()
            if fields and int(fields[0]) == node_num:
                subtree_bipart = Tree(fields[1]+";").get_leaf_names()
    if subtree_bipart is None:
        raise ValueError("node {} not found in {}.node.key".format(node_num,phyparts_root))
    render_tree(species_tree,subtree_bipart,num_concord,png_fn,replace_taxon=replace_taxon)


def main():
    """Command-line entry point.

    Renders one PNG for the concordant bipartition and one per alternative
    bipartition of the requested node. If ImageMagick ``convert`` is on the
    PATH, the PNGs are combined into ``node_<N>.pdf`` and then removed.
    """
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('species_tree',help="Newick formatted species tree topology.")
    parser.add_argument('phyparts_root',help="File root name used for Phyparts.")
    parser.add_argument('node_num',type=int,default=0,help="Node number from Phyparts. To see a tree with numbered nodes, run phypartspiecharts.py with --show_nodes.")
    parser.add_argument('min_alt',type=int,default=0,help="Only print alternative bipartitions if they occur in this many gene trees")
    parser.add_argument('--taxon_subst',help="Comma-delimted file to translate tip names.")

    args = parser.parse_args()

    try:
        subprocess.check_output('which convert',shell=True)
        convert = True
    except (subprocess.CalledProcessError,OSError):
        # `which` exits non-zero (or cannot be run) when convert is missing.
        convert = False

    taxon_subst = None
    if args.taxon_subst:
        taxon_subst = {}
        with open(args.taxon_subst) as subst_file:
            for line in subst_file:
                fields = line.rstrip("\r\n").split(",")
                if len(fields) < 2:
                    # Skip blank or malformed rows.
                    continue
                taxon_subst[fields[0]] = fields[1]

    alt_bipart,alt_counts = get_alternative_bipartitions(args.node_num,args.phyparts_root,args.min_alt)
    print("{} alternative bipartitions occurred in at least {} gene trees\n".format(len(alt_counts),args.min_alt))

    species_tree = Tree(args.species_tree)
    species_tree.ladderize(direction=1)
    majority_tree(species_tree,args.node_num,args.phyparts_root,replace_taxon=taxon_subst)

    alt_pngs = []
    for alt,(bipart,count) in enumerate(zip(alt_bipart,alt_counts)):
        png_fn = "node_{}_alt_{}.png".format(args.node_num,alt)
        # Rendering mutates the tree, so reload it for every figure.
        species_tree = Tree(args.species_tree)
        species_tree.ladderize(direction=1)
        render_tree(species_tree,bipart[0],count,png_fn,replace_taxon=taxon_subst)
        alt_pngs.append(png_fn)

    if convert:
        # Use the exact file names produced above: a shell glob such as
        # node_3*.png would also match (and delete) the images of node 31.
        pngs = ["node_{}_speciestree.png".format(args.node_num)] + alt_pngs
        status = subprocess.call(["convert"] + pngs + ["node_{}.pdf".format(args.node_num)])
        if status == 0:
            for png in pngs:
                os.remove(png)
        else:
            sys.stderr.write("convert exited with status {}; PNG files were kept\n".format(status))

#t =
Tree("((((Discelium-nudum-3746:1,Encalypta-intermedia-3219:1)1:1.02738,((Timmia-austriaca-3619:1,Entosthodon-pulchellus-3120:1)1:0.678047,(Chamaebryum-pottioides-3630:1,Chamaebryum-pottioides-3573:1)1:5.61798)1:0.843377)1:3.51197,(((Funaria-flavicans-4092:1,Funaria-hygrometrica-3891:1)1:3.87102,((Funaria-sp-3541:1,(Funaria-hygcalvescens-3633:1,Funaria-sp-3514:1)1:0.397782)1:2.28115,(Funaria-microstoma-3834:1,(Funaria-arctica-3544:1,(Funaria-polaris-3542:1,(Funaria-arctica-3833:1,((Funaria-hygrometrica-3476:1,(Funaria-hygrometrica-3388:1,Funaria-hygrometrica-3179:1)0.38:0.00555864)1:0.739336,((Funaria-sp-3882:1,Funaria-sp-3393:1)1:0.648175,(Funaria-hygrometrica-3515:1,Funaria-hygrometrica-3632:1)0.64:0.0737125)1:0.445965)1:0.943796)1:0.575318)1:0.240042)1:0.139316)1:0.88789)1:2.00293)1:5.75785,(((Physcomitrellopsis-africana-3142:1,Entosthodon-smithhurstii-3465:1)1:2.40187,(Entosthodon-sp-3726:1,(Entosthodon-sp-3545:1,(Entosthodon-clavatus-3896:1,Entosthodon-clavatus-3895:1)1:0.669355)1:3.49935)1:0.193954)1:2.82861,(((Entosthodon-hungaricus-3177:1,Entosthodon-americanus-3894:1)1:1.55658,(Entosthodon-lindigii-3546:1,((Entosthodon-muhlenbergii-3893:1,(Entosthodon-planoconvexus-3114:1,Entosthodon-duriaei-3843:1)1:2.39883)1:3.99232,((Entosthodon-attenuatus-3835:1,(Entosthodon-attenuatus-3479:1,Entosthodon-attenuatus-3543:1)1:0.133086)1:3.23622,((Entosthodon-sp-3837:1,(Physcomitrium-sp-3842:1,Entosthodon-subintegrus-3840:1)1:2.61412)1:2.65073,(Entosthodon-bergianus-3509:1,(Entosthodon-obtusus-3395:1,Entosthodon-obtusus-3347:1)1:3.84659)0.91:0.0632156)1:0.36051)1:3.21301)0.92:0.0664078)0.81:0.0496285)1:0.110983,((Physcomitrium-hookeri-3412:1,(Physcomitrium-hookeri-3409:1,Physcomitrium-pyriforme-3404:1)1:0.375291)1:3.74685,((Physcomitridium-readeri-3892:1,(((Physcomitrium-eurystomum-3392:1,Physcomitrium-eurystomum-3841:1)0.46:0.0189278,(Physcomitrium-pyriforme-3555:1,Physcomitrium-pyriforme-3387:1)0.55:0.0260365)1:0.212244,(((Physcomitrium-sp-3816:1,Physcomitrium-sp-3508:1)1
:2.44605,((Physcomitrium-sp-3551:1,Physcomitrium-sp-3539:1)1:1.2462,(Physcomitrium-japonicum-3413:1,Physcomitrium-japonicum-3411:1)0.78:0.0484692)1:1.21849)1:3.30359,((Physcomitrium-pyriforme-3118:1,Physcomitrium-pyriforme-3883:1)1:3.0447,(Physcomitrium-pyriforme-3787:1,((Physcomitrella-magdalenae-3844:1,(Physcomitrium-spathulatum-3549:1,(Physcomitrium-sp-3814:1,Physcomitrium-subsphaericum-3556:1)1:0.625116)1:0.522302)1:1.95296,(Physcomitrium-sp-3496:1,(Physcomitrium-pyriforme-3798:1,(Physcomitrium-pyriforme-3727:1,(Physcomitrium-pyriforme-3886:1,Physcomitrium-pyriforme-3728:1)1:0.129654)1:0.441593)1:1.5649)1:1.80883)0.59:0.0382)1:1.16855)1:2.09587)1:0.55998)1:0.590922)1:0.23246,(Physcomitrium-sp-3817:1,((Physcomitrella-patens-3403:1,Physcomitrella-patens-3139:1)1:4.05812,((Physcomitrium-sp-3672:1,(Aphanorrhegma-serratum-3305:1,(Physcomitrium-collenchymatum-3480:1,(Physcomitrium-sp-3115:1,Physcomitrium-collenchymatum-3178:1)0.49:0.035722)1:2.5141)1:0.389341)0.88:0.152428,Physcomitrium-immersum-3176:1)1:0.479012)1:1.1982)1:2.30257)1:0.817867)1:1.25824)1:4.29516)1:0.799009)1:5.4761)1:1,Goniomitrium-africanum-4081:1);") 105 | #t.ladderize(direction=1) 106 | #ts = TreeStyle() 107 | #ts.show_leaf_name = False 108 | 109 | if __name__ == "__main__":main() 110 | -------------------------------------------------------------------------------- /phypartspiecharts/README.md: -------------------------------------------------------------------------------- 1 | # PhypartsPieCharts 2 | 3 | Using the output of PhyParts (https://bitbucket.org/blackrim/phyparts), plot pie charts on the species phylogeny showing the percentage of concordant gene trees, percentage in the top alternative bipartition, other conflicting topologies, and uninformative genes. 4 | 5 | For more information about PhyParts, consult the [original paper by Smith et al.](https://bmcevolbiol.biomedcentral.com/articles/10.1186/s12862-015-0423-0) and the Python notebook [here](PhyParts_PieCharts.ipynb). 
6 | 7 | **Dependencies** 8 | 9 | Requires [ETE3](http://etetoolkit.org/) and Python > 2.7 10 | 11 | 12 | **Sample Usage** 13 | 14 | ``` 15 | python phypartspiecharts.py species.tre phyparts_root 158 16 | ``` 17 | Run the script from the directory containing the Phyparts output files. 18 | 19 | `species.tre` is the rooted species phylogeny used in Phyparts. 20 | 21 | `phyparts_root` is the basename of the phyparts output. The default is `out`. The important output files are `out.concon.tre`, `out.hist`, `out.hist.alts`, and `out.node.key`. 22 | 23 | Finally, indicate the number of gene trees used in Phyparts. This is used to properly calculate the pie chart percentages. 24 | 25 | The default output will be `pies.svg` containing a ladderized version of your species tree and pie charts on each node. The default color scheme is the percentage of gene trees that are: 26 | 27 | * Blue: concordant 28 | * Green: the top alternative bipartition 29 | * Red: all other alternative bipartitions 30 | * Black: uninformative for that node 31 | 32 | Numbers above and below the branch also indicate the number of concordant and conflicting gene trees, respectively. 33 | 34 | **Example Output** 35 | 36 | ![](img/default_pies.jpg) 37 | 38 | **Other options** 39 | 40 | * `--svg_name` Change the name of the svg image file. Default: `pies.svg` 41 | * `--show_nodes` Display a tree with nodes labeled by the Phyparts numbering scheme. Useful for further inspection of alternative bipartitions (e.g. with `minority_report.py`). The tree is opened in a new window, so this may not work for remote logins (use `ssh -Y`). 42 | * `--taxon_subst` Provide a comma delimited file that replaces tip labels in `species.tre` with a new label. Useful for converting accession numbers to species names, for example. 43 | * `--colors` Provide custom colors for the pie chart wedges. RGB triplets (comma-separated), hexadecimal, and named colors can be used and should be separated by a space.
The colors will correspond to the same order as above. 44 | * `--to_csv` Generates CSV files for the categories and nodes, for input into ggTree in R (code provided by [Ben Cooper](https://github.com/benjamin-j-cooper)). 45 | 46 | **Run with Example Data** 47 | 48 | Data from [Medina et al. JSE 2019](https://onlinelibrary.wiley.com/doi/full/10.1111/jse.12516) 49 | 50 | ``` 51 | cd phyparts_example 52 | python ../phypartspiecharts.py species.tre out 648 53 | ``` 54 | 55 | ### Reroot Script 56 | 57 | Also includes a script for rerooting gene and species trees. Requires ETE3. 58 | 59 | Usage: 60 | 61 | `python reroot_trees.py my.tree outgroup.list > rerooted.tre` 62 | 63 | Where `outgroup.list` is a text file containing a list of outgroup names found on the tree (one per line). Script will attempt to find the LCA of the outgroup names, set that LCA as the root branch, and print the rooted tree. 64 | 65 | To run on a set of unrooted gene trees with GNU `parallel`: Assuming each gene is in the format `geneName.tre` and there is a list of `geneName` in a file called `genelist.txt`: 66 | 67 | `parallel "python reroot_trees.py {}.tre outgroup.list > {}.rerooted.tre" :::: genelist.txt` 68 | 69 | **Warning**: If no branch can be found, the tree file will be empty! You can remove empty tree files with: 70 | 71 | `find . -size 0 -delete` 72 | 73 | Remember to re-root the species tree as well before running PhyParts. 
74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /phypartspiecharts/img/default_pies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/phypartspiecharts/img/default_pies.jpg -------------------------------------------------------------------------------- /phypartspiecharts/img/pleuro_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/phypartspiecharts/img/pleuro_nodes.png -------------------------------------------------------------------------------- /phypartspiecharts/img/sphag_taka.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/phypartspiecharts/img/sphag_taka.png -------------------------------------------------------------------------------- /phypartspiecharts/phyparts_example/out.concon.tre: -------------------------------------------------------------------------------- 1 | 
(((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)478,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)231)626,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)638)132,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)602)482)250)634)76)50)59,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)342)584,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)646,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)261)600)635)640)641,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)495,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)384,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)143)556)638,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)636,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)390)308)549,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)246)615)618)194)560)639)638)439)518)315)645,((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)463)644)263,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)575)629); 2 | 
(((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)195)2,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)0)252,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)7)11)185)6)216)226)197,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)146)1,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)0,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)196)0)1)4)3,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)5,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)93,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)273)52)3,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)115)153)7,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)242)12)10)185)39)5)7)90)33)79)2,((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)35)2)193,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)2)1); 3 | 
(((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)0.9784027768416858,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)0.79076569047233)0.9689967641335626,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1.0)0.34872974537672113,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)0.9479351387882053)0.9049856997476616)0.5858268861374235)0.9371291522677113)0.25882761838229973)0.16382415706238607)0.18651191047865784,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)0.7500526160703238)0.9824436644806955,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1.0,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)0.6447504894707947)1.0)0.9831630546463646)0.9624808797655535)0.970304429867128,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)0.910410615398184,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)0.795980949498425,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.4428717628115614)0.8033376969061944)0.9318395066925902,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)0.9747441895741165,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)0.7243414002705952)0.6132456034656285)0.9506041771913188,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)0.7523018242875253)0.9274210646341674)0.9093078758550853)0.6420089852288237)0.8305051571490991)0.9520143595012353)0.9200661096764626)0.8018909077533978)0.8479602575782158)0.6423148907671082)0.9750481474391287,((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)0.8859424320701869)0.9711731195998496)0.70
98496912720081,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)0.9767909279348379)0.983124949874508); 4 | (((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)0.9784027768416858,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)0.79076569047233)0.9689967641335626,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1.0)0.35059059605189336,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)0.9518845860911305)0.9121982699563427)0.5870778339468724)0.9371708799789142)0.25882761838229973)0.1665969369584339)0.1895506573461794,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)0.7628637350241585)0.983021142874864,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1.0,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)0.6671950717062091)1.0)0.9833173867152819)0.9625055731351766)0.970304429867128,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)0.910410615398184,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)0.796639738720315,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.4428717628115614)0.8044187220999806)0.9321209146933542,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)0.9747441895741165,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)0.7246592227996926)0.6315598024116463)0.9523422941390248,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)0.7557833731708657)0.9279206152360917)0.9143761929210199)0.6752825223370501)0.8305051571490991)0.9520143595012353)0.9200661096764626)0.8018909077533978)0.8483042656155186)0.6433538869820215)0.9750481474391287,((Entosthodon-sp-37
26,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)0.8859424320701869)0.9713464955049456)0.7173285514839135,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)0.9767909279348379)0.9833173867152819); 5 | -------------------------------------------------------------------------------- /phypartspiecharts/phyparts_example/out.hist: -------------------------------------------------------------------------------- 1 | Node0,645.0,1.0,1.0,1.0,647 2 | Node1,61.0,4.0,19.0,16.0,1.0,1.0,6.0,1.0,3.0,1.0,4.0,2.0,4.0,1.0,1.0,1.0,10.0,1.0,1.0,1.0,11.0,1.0,2.0,1.0,1.0,1.0,1.0,14.0,1.0,1.0,1.0,16.0,5.0,1.0,2.0,4.0,9.0,2.0,10.0,1.0,1.0,4.0,10.0,4.0,2.0,5.0,1.0,3.0,1.0,9.0,4.0,1.0,11.0,1.0,2.0,1.0,256 3 | Node2,48.0,4.0,19.0,1.0,1.0,6.0,1.0,3.0,1.0,4.0,2.0,4.0,1.0,1.0,9.0,1.0,10.0,19.0,1.0,1.0,1.0,11.0,5.0,1.0,2.0,1.0,1.0,1.0,12.0,1.0,14.0,1.0,1.0,1.0,1.0,16.0,5.0,1.0,2.0,4.0,2.0,10.0,1.0,7.0,1.0,4.0,2.0,5.0,1.0,9.0,8.0,1.0,11.0,1.0,2.0,1.0,276 4 | Node3,76.0,4.0,19.0,1.0,6.0,3.0,1.0,4.0,2.0,1.0,1.0,9.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,2.0,1.0,8.0,1.0,15.0,1.0,14.0,1.0,50.0,1.0,1.0,1.0,5.0,1.0,2.0,4.0,2.0,10.0,1.0,2.0,1.0,12.0,9.0,8.0,1.0,1.0,292 5 | Node4,478.0,1.0,479 6 | Node5,634.5,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,640 7 | Node6,626.0,2.0,628 8 | Node7,231.0,7.0,6.0,2.0,426 9 | Node8,250.5,1.0,1.0,33.0,3.0,1.0,6.0,1.0,8.0,1.0,1.0,4.0,1.0,1.0,1.0,13.0,1.0,2.0,3.0,1.0,23.0,435 10 | Node9,131.5,33.0,21.0,15.0,1.0,14.0,42.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,23.0,384 11 | Node10,637.5,638 12 | Node11,536.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,3.0,1.0,1.0,1.0,493 13 | Node12,548.0,2.0,3.0,1.0,1.0,609 14 | Node13,316.5,1.0,1.0,5.0,1.0,1.0,1.0,2.0,1.0,4.0,2.0,4.0,9.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,2.0,1.0,1.0,5.0,1.0,1.0,1.0,2.0,7.0,3.0,1.0,1.0,4.0,1.0,1.0,1.0,6.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,394 15 | Node14,608.0,1.0,585 16 | Node15,318.0,9.0,12.0,488 17 | 
Node16,516.5,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,551 18 | Node17,641.0,1.0,2.0,644 19 | Node18,640.5,3.0,1.0,644 20 | Node19,646.0,646 21 | Node20,638.1666666666667,1.0,636 22 | Node21,620.6666666666667,600 23 | Node22,236.6666666666667,15.0,11.0,457 24 | Node23,439.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,9.0,1.0,1.0,1.0,1.0,1.0,7.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,529 25 | Node24,638.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,645 26 | Node25,640.8333333333333,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,641 27 | Node26,495.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,500 28 | Node27,554.8333333333333,7.0,1.0,4.0,1.0,6.0,2.0,12.0,1.0,1.0,1.0,3.0,5.0,10.0,4.0,608 29 | Node28,382.3333333333333,3.0,6.0,7.0,6.0,3.0,6.0,477 30 | Node29,143.0,9.0,7.0,7.0,6.0,10.0,5.0,3.0,12.0,2.0,3.0,6.0,416 31 | Node30,639.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,644 32 | Node31,636.0,1.0,1.0,1.0,637 33 | Node32,560.0,1.0,1.0,1.0,7.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,3.0,8.0,1.0,1.0,1.0,8.0,599 34 | Node33,573.3333333333333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,556 35 | Node34,284.33333333333337,9.0,12.0,1.0,18.0,12.0,19.0,1.0,1.0,1.0,1.0,6.0,461 36 | Node35,389.3333333333333,14.0,12.0,1.0,12.0,6.0,1.0,1.0,7.0,1.0,6.0,505 37 | Node36,235.0,3.0,1.0,1.0,1.0,1.0,1.0,14.0,1.0,8.0,1.0,1.0,1.0,10.0,1.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,8.0,379 38 | Node37,577.3333333333334,1.0,1.0,1.0,3.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,628 39 | Node38,619.8333333333334,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,3.0,2.0,1.0,1.0,627 40 | Node39,240.83333333333334,20.0,3.0,1.0,4.0,1.0,3.0,1.0,1.0,1.0,488 41 | Node40,636.5,1.0,630 42 | Node41,260.0,1.0,1.0,1.0,1.0,5.0,25.0,11.0,456 43 | Node42,639.5,1.0,1.0,1.0,1.0,646 44 | 
Node43,463.0,1.0,9.0,1.0,1.0,6.0,498 45 | Node44,575.0,1.0,1.0,577 46 | -------------------------------------------------------------------------------- /phypartspiecharts/phyparts_example/out.node.key: -------------------------------------------------------------------------------- 1 | 0 ((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1)1)0.99)1,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)1)1,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1)1,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1)1)1)1)1 2 | 1 (Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1)1)0.99)1 3 | 2 
(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1)1)0.99 4 | 3 ((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1)1 5 | 4 (Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1 6 | 5 ((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1 7 | 6 (Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1 8 | 7 (Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1 9 | 8 ((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1 10 | 9 (Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91 11 | 10 (Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1 12 | 11 (Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1 13 | 12 (Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1 14 | 13 
((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)1)1,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1)1,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1)1)1)1 15 | 14 (Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)1)1 16 | 15 (Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)1 17 | 16 ((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1)1,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1)1)1 18 | 17 (Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1)1 
19 | 18 ((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1 20 | 19 (Physcomitrella-patens-3403,Physcomitrella-patens-3139)1 21 | 20 (Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1 22 | 21 (Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1 23 | 22 (Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1 24 | 23 (Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1)1 25 | 24 (((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1 26 | 25 ((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1 27 | 26 (Physcomitrium-sp-3508,Physcomitrium-sp-3816)1 28 | 27 ((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1 29 | 28 
(Physcomitrium-sp-3539,Physcomitrium-sp-3551)1 30 | 29 (Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94 31 | 30 ((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1 32 | 31 (Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1 33 | 32 ((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1 34 | 33 (Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1 35 | 34 (Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1 36 | 35 (Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1 37 | 36 (Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1 38 | 37 (Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1 39 | 38 (Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1 40 | 39 (Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1 41 | 40 ((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)1)1)1,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)1)1 42 | 41 (Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)1)1)1 43 | 42 (Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)1)1 44 | 43 
(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)1 45 | 44 (Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)1 46 | -------------------------------------------------------------------------------- /phypartspiecharts/phyparts_example/phyparts_dist.csv: -------------------------------------------------------------------------------- 1 | node,concord,genes-concord 2 | 0,645,2 3 | 1,59,197 4 | 2,50,226 5 | 3,76,216 6 | 4,478,1 7 | 5,634,6 8 | 6,626,2 9 | 7,231,195 10 | 8,250,185 11 | 9,132,252 12 | 10,638,0 13 | 11,482,11 14 | 12,602,7 15 | 13,315,79 16 | 14,584,1 17 | 15,342,146 18 | 16,518,33 19 | 17,641,3 20 | 18,640,4 21 | 19,646,0 22 | 20,635,1 23 | 21,600,0 24 | 22,261,196 25 | 23,439,90 26 | 24,638,7 27 | 25,638,3 28 | 26,495,5 29 | 27,556,52 30 | 28,384,93 31 | 29,143,273 32 | 30,639,5 33 | 31,636,1 34 | 32,560,39 35 | 33,549,7 36 | 34,308,153 37 | 35,390,115 38 | 36,194,185 39 | 37,618,10 40 | 38,615,12 41 | 39,246,242 42 | 40,629,1 43 | 41,263,193 44 | 42,644,2 45 | 43,463,35 46 | 44,575,2 -------------------------------------------------------------------------------- /phypartspiecharts/phyparts_example/phyparts_pies.csv: -------------------------------------------------------------------------------- 1 | node,adj_concord,adj_most_conflict,other_conflict,the_rest 2 | 0,99.53703703703704,0.15432098765432098,0.15432098765432098,0.15432098765432098 3 | 1,9.104938271604938,2.9320987654320985,27.469135802469136,60.49382716049383 4 | 2,7.716049382716049,2.9320987654320985,31.944444444444443,57.407407407407405 5 | 3,11.728395061728394,7.716049382716049,25.617283950617285,54.93827160493827 6 | 4,73.76543209876543,0.15432098765432098,0.0,26.080246913580247 7 | 5,97.8395061728395,0.30864197530864196,0.6172839506172839,1.2345679012345678 8 | 6,96.60493827160494,0.30864197530864196,0.0,3.0864197530864197 9 | 7,35.648148148148145,1.0802469135802468,29.01234567901235,34.25925925925926 10 | 
8,38.58024691358025,5.092592592592593,23.456790123456788,32.870370370370374 11 | 9,20.37037037037037,6.481481481481481,32.407407407407405,40.74074074074074 12 | 10,98.4567901234568,0.0,0.0,1.5432098765432098 13 | 11,74.38271604938271,0.4629629629629629,1.2345679012345678,23.919753086419753 14 | 12,92.90123456790124,0.4629629629629629,0.6172839506172839,6.018518518518518 15 | 13,48.61111111111111,1.3888888888888888,10.802469135802468,39.19753086419753 16 | 14,90.12345679012346,0.15432098765432098,0.0,9.722222222222223 17 | 15,52.77777777777778,1.8518518518518516,20.679012345679013,24.691358024691358 18 | 16,79.93827160493827,0.7716049382716049,4.320987654320987,14.969135802469136 19 | 17,98.91975308641975,0.30864197530864196,0.15432098765432098,0.6172839506172839 20 | 18,98.76543209876543,0.4629629629629629,0.15432098765432098,0.6172839506172839 21 | 19,99.69135802469135,0.0,0.0,0.30864197530864196 22 | 20,97.99382716049382,0.15432098765432098,0.0,1.8518518518518516 23 | 21,92.5925925925926,0.0,0.0,7.4074074074074066 24 | 22,40.27777777777778,2.314814814814815,27.9320987654321,29.475308641975307 25 | 23,67.74691358024691,1.3888888888888888,12.5,18.3641975308642 26 | 24,98.4567901234568,0.30864197530864196,0.7716049382716049,0.4629629629629629 27 | 25,98.4567901234568,0.30864197530864196,0.15432098765432098,1.0802469135802468 28 | 26,76.38888888888889,0.30864197530864196,0.4629629629629629,22.839506172839506 29 | 27,85.80246913580247,1.8518518518518516,6.172839506172839,6.172839506172839 30 | 28,59.25925925925925,1.0802469135802468,13.271604938271606,26.38888888888889 31 | 29,22.067901234567902,1.8518518518518516,40.27777777777778,35.80246913580247 32 | 30,98.61111111111111,0.15432098765432098,0.6172839506172839,0.6172839506172839 33 | 31,98.14814814814815,0.15432098765432098,0.0,1.6975308641975309 34 | 32,86.41975308641975,1.2345679012345678,4.78395061728395,7.561728395061729 35 | 33,84.72222222222221,0.15432098765432098,0.9259259259259258,14.19753086419753 36 | 
34,47.53086419753087,2.9320987654320985,20.679012345679013,28.858024691358025 37 | 35,60.18518518518518,2.1604938271604937,15.58641975308642,22.067901234567902 38 | 36,29.938271604938272,2.1604938271604937,26.38888888888889,41.51234567901235 39 | 37,95.37037037037037,0.4629629629629629,1.0802469135802468,3.0864197530864197 40 | 38,94.9074074074074,0.4629629629629629,1.3888888888888888,3.2407407407407405 41 | 39,37.96296296296296,3.0864197530864197,34.25925925925926,24.691358024691358 42 | 40,97.0679012345679,0.15432098765432098,0.0,2.7777777777777777 43 | 41,40.586419753086425,3.8580246913580245,25.925925925925924,29.629629629629626 44 | 42,99.38271604938271,0.15432098765432098,0.15432098765432098,0.30864197530864196 45 | 43,71.4506172839506,1.3888888888888888,4.012345679012346,23.14814814814815 46 | 44,88.73456790123457,0.15432098765432098,0.15432098765432098,10.95679012345679 -------------------------------------------------------------------------------- /phypartspiecharts/phyparts_example/species.tre: -------------------------------------------------------------------------------- 1 | 
(((Entosthodon-lindigii-3546:1,(Entosthodon-americanus-3894:1,((Entosthodon-planoconvexus-3114:1,Entosthodon-duriaei-3843:1)1:5.64578,((Entosthodon-attenuatus-3835:1,(Entosthodon-attenuatus-3479:1,Entosthodon-attenuatus-3543:1)1:0.300375)1:5.34711,((Entosthodon-bergianus-3509:1,(Entosthodon-obtusus-3347:1,Entosthodon-obtusus-3395:1)1:6.05287)0.91:0.0659619,(Entosthodon-sp-3837:1,(Physcomitrium-sp-3842:1,Entosthodon-subintegrus-3840:1)1:3.88757)1:3.21069)1:0.515246)1:4.56576)1:0.100426)0.99:0.0869556)1:0.129257,((Physcomitrium-hookeri-3412:1,(Physcomitrium-hookeri-3409:1,Physcomitrium-pyriforme-3404:1)1:0.546544)1:5.88447,((Physcomitrium-sp-3817:1,((Physcomitrella-patens-3403:1,Physcomitrella-patens-3139:1)1:6.06688,(Aphanorrhegma-serratum-3305:1,(Physcomitrium-collenchymatum-3480:1,(Physcomitrium-sp-3115:1,Physcomitrium-collenchymatum-3178:1)1:0.187039)1:5.9898)1:5.35659)1:4.45078)1:4.49357,(Physcomitridium-readeri-3892:1,(((Physcomitrium-sp-3508:1,Physcomitrium-sp-3816:1)1:4.0411,((Physcomitrium-sp-3539:1,Physcomitrium-sp-3551:1)1:1.24296,(Physcomitrium-japonicum-3413:1,Physcomitrium-japonicum-3411:1)0.94:0.0601077)1:2.19271)1:5.14283,((Physcomitrium-pyriforme-3118:1,Physcomitrium-pyriforme-3883:1)1:5.11929,((Physcomitrella-magdalenae-3844:1,(Physcomitrium-spathulatum-3549:1,(Physcomitrium-subsphaericum-3556:1,Physcomitrium-sp-3814:1)1:0.907898)1:0.492126)1:3.68456,(Physcomitrium-pyriforme-3787:1,(Physcomitrium-sp-3496:1,(Physcomitrium-pyriforme-3727:1,(Physcomitrium-pyriforme-3728:1,Physcomitrium-pyriforme-3886:1)1:0.193664)1:3.56346)1:3.46239)1:0.209688)1:2.35801)1:5.02628)1:5.25997)1:1.11675)1:2.2877)1:1.21442)1:2.98856,((Entosthodon-sp-3726:1,(Entosthodon-sp-3545:1,(Entosthodon-clavatus-3895:1,Entosthodon-clavatus-3896:1)1:1.56827)1:5.15235)1:0.329887,(Physcomitrellopsis-africana-3142:1,Entosthodon-smithhurstii-3465:2)1:4.30449)1:2.98856); 2 | -------------------------------------------------------------------------------- 
/phypartspiecharts/phypartspiecharts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | helptext= ''' 4 | Generate the "Pie Chart" representation of gene tree conflict from Smith et al. 2015 from 5 | the output of phyparts, the bipartition summary software described in the same paper. 6 | 7 | The input files include three files produced by PhyParts, and a file containing a species 8 | tree in Newick format (likely, the tree used for PhyParts). The output is an SVG containing 9 | the phylogeny along with pie charts at each node. 10 | 11 | Requirements: 12 | 13 | Python 3 14 | ete3 15 | matplotlib 16 | 17 | ''' 18 | 19 | import matplotlib,sys,argparse,re,json 20 | from ete3 import Tree, TreeStyle, TextFace,NodeStyle,faces, COLOR_SCHEMES 21 | 22 | 23 | #Read in species tree and convert to ultrametric 24 | 25 | #Match phyparts nodes to ete3 nodes 26 | def get_phyparts_nodes(sptree_fn,phyparts_root): 27 | sptree = Tree(sptree_fn) 28 | sptree.convert_to_ultrametric() 29 | 30 | phyparts_node_key = [line for line in open(phyparts_root+".node.key")] 31 | subtrees_dict = {n.split()[0]:Tree(n.split()[1]+";") for n in phyparts_node_key} 32 | subtrees_topids = {} 33 | for x in subtrees_dict: 34 | subtrees_topids[x] = subtrees_dict[x].get_topology_id() 35 | #print(subtrees_topids['1']) 36 | #print() 37 | for node in sptree.traverse(): 38 | node_topid = node.get_topology_id() 39 | if "Takakia_4343a" in node.get_leaf_names(): 40 | print(node_topid) 41 | print(node) 42 | for subtree in subtrees_dict: 43 | if node_topid == subtrees_topids[subtree]: 44 | node.name = subtree 45 | return sptree,subtrees_dict,subtrees_topids 46 | 47 | #Summarize concordance and conflict from Phyparts 48 | def get_concord_and_conflict(phyparts_root,subtrees_dict,subtrees_topids): 49 | 50 | with open(phyparts_root + ".concon.tre") as phyparts_trees: 51 | concon_tree = Tree(phyparts_trees.readline()) 52 | conflict_tree = 
Tree(phyparts_trees.readline()) 53 | 54 | concord_dict = {} 55 | conflict_dict = {} 56 | 57 | 58 | for node in concon_tree.traverse(): 59 | node_topid = node.get_topology_id() 60 | for subtree in subtrees_dict: 61 | if node_topid == subtrees_topids[subtree]: 62 | concord_dict[subtree] = node.support 63 | 64 | for node in conflict_tree.traverse(): 65 | node_topid = node.get_topology_id() 66 | for subtree in subtrees_dict: 67 | if node_topid == subtrees_topids[subtree]: 68 | conflict_dict[subtree] = node.support 69 | return concord_dict, conflict_dict 70 | 71 | #Generate Pie Chart data 72 | def get_pie_chart_data(phyparts_root,total_genes,concord_dict,conflict_dict): 73 | 74 | phyparts_hist = [line for line in open(phyparts_root + ".hist")] 75 | phyparts_pies = {} 76 | phyparts_dict = {} 77 | 78 | for n in phyparts_hist: 79 | n = n.split(",") 80 | tot_genes = float(n.pop(-1)) 81 | node_name = n.pop(0)[4:] 82 | concord = float(n.pop(0)) 83 | concord = concord_dict[node_name] 84 | all_conflict = conflict_dict[node_name] 85 | 86 | if len(n) > 0: 87 | most_conflict = max([float(x) for x in n]) 88 | else: 89 | most_conflict = 0.0 90 | 91 | adj_concord = (concord/total_genes) * 100 92 | adj_most_conflict = (most_conflict/total_genes) * 100 93 | other_conflict = (all_conflict - most_conflict) / total_genes * 100 94 | the_rest = (total_genes - concord - all_conflict) / total_genes * 100 95 | 96 | pie_list = [adj_concord,adj_most_conflict,other_conflict,the_rest] 97 | 98 | phyparts_pies[node_name] = pie_list 99 | 100 | phyparts_dict[node_name] = [int(round(concord,0)),int(round(tot_genes-concord,0))] 101 | 102 | return phyparts_dict, phyparts_pies 103 | 104 | 105 | def node_text_layout(mynode): 106 | F = faces.TextFace(mynode.name,fsize=20) 107 | faces.add_face_to_node(F,mynode,0,position="branch-right") 108 | 109 | #convert internal phypartspiechart.py data files to csv and export to current directory (for use as ggtree tree data in R) 110 | def 
pie_data_to_csv(phyparts_dict, phyparts_pies): 111 | phyparts_dist_bin = {} 112 | phyparts_pies_bin = {} 113 | dist_replaced = {} 114 | pies_replaced = {} 115 | 116 | phyparts_dist_bin = json.dumps(phyparts_dist) 117 | phyparts_pies_bin = json.dumps(phyparts_pies) 118 | 119 | 120 | dist_replaced = re.sub(r'{',r'node,concord,genes-concord\n',phyparts_dist_bin) 121 | dist_replaced = re.sub(r'"(\d*)":\s\[(\d*),\s(\d*)\],\s', r'\1,\2,\3\n', dist_replaced) 122 | dist_replaced = re.sub(r'"(\d*)":\s\[(\d*),\s(\d*)\]}', r'\1,\2,\3', dist_replaced) 123 | 124 | pies_replaced = re.sub(r'{',r'node,adj_concord,adj_most_conflict,other_conflict,the_rest\n',phyparts_pies_bin) 125 | pies_replaced = re.sub(r'"(\d*)":\s\[(\d*.\d*),\s(\d*.\d*),\s(\d*.\d*),\s(\d*.\d*)\],\s', r'\1,\2,\3,\4,\5\n', pies_replaced) 126 | pies_replaced = re.sub(r'"(\d*)":\s\[(\d*.\d*),\s(\d*.\d*),\s(\d*.\d*),\s(\d*.\d*)\]}', r'\1,\2,\3,\4,\5', pies_replaced) 127 | 128 | with open('phyparts_dist.csv','w') as file: 129 | for line in dist_replaced: 130 | file.write(line) 131 | with open('phyparts_pies.csv','w') as file: 132 | for line in pies_replaced: 133 | file.write(line) 134 | 135 | 136 | parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter) 137 | parser.add_argument('species_tree',help="Newick formatted species tree topology.") 138 | parser.add_argument('phyparts_root',help="File root name used for Phyparts.") 139 | parser.add_argument('num_genes',type=int,default=0,help="Number of total gene trees. 
# ---------------------------------------------------------------------------
# Command-line driver: parse arguments, load the PhyParts results, decorate
# the species tree with pie charts and render it.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('species_tree',help="Newick formatted species tree topology.")
parser.add_argument('phyparts_root',help="File root name used for Phyparts.")
parser.add_argument('num_genes',type=int,default=0,help="Number of total gene trees. Used to properly scale pie charts.")
parser.add_argument('--taxon_subst',help="Comma-delimted file to translate tip names.")
parser.add_argument("--svg_name",help="File name for SVG generated by script",default="pies.svg")
parser.add_argument("--show_nodes",help="Also show tree with nodes labeled same as PhyParts",action="store_true",default=False)
parser.add_argument("--colors",help="Four colors of the pie chart: concordance (blue) top conflict (green), other conflict (red), no signal (gray)",nargs="+",default=["blue","green","red","dark gray"])
parser.add_argument("--no_ladderize",help="Do not ladderize the input species tree.",action="store_true",default=False)
parser.add_argument("--to_csv",help="Output data files to csv for import into ggtree in R",action="store_true",default=False)

args = parser.parse_args()

# Every pie slice is scaled by num_genes; reject values that would divide by
# zero (or flip signs) with a normal argparse usage error.
if args.num_genes <= 0:
    parser.error("num_genes must be a positive integer")

ladderize = not args.no_ladderize

plot_tree,subtrees_dict,subtrees_topids = get_phyparts_nodes(args.species_tree, args.phyparts_root)
concord_dict, conflict_dict = get_concord_and_conflict(args.phyparts_root,subtrees_dict,subtrees_topids)
# NOTE: these stay module-level names because the layout function below (and
# other helpers in this file) read them as globals.
phyparts_dist, phyparts_pies = get_pie_chart_data(args.phyparts_root,args.num_genes,concord_dict,conflict_dict)

if args.taxon_subst:
    # Map "old_name,new_name" pairs. Plain text mode already gives universal
    # newline handling on Python 3; the legacy 'U' flag was removed in 3.11.
    taxon_subst = {}
    with open(args.taxon_subst) as subst_file:
        for subst_line in subst_file:
            subst_fields = subst_line.rstrip().split(",")
            if len(subst_fields) < 2:
                # Skip blank lines and lines without a comma.
                continue
            taxon_subst[subst_fields[0]] = subst_fields[1]
    for leaf in plot_tree.get_leaves():
        try:
            leaf.name = taxon_subst[leaf.name]
        except KeyError:
            # Report tips that have no translation and keep their name.
            print(leaf.name)


def phyparts_pie_layout(mynode):
    """Layout function: pie chart plus counts on scored nodes, name otherwise.

    Nodes whose name is a key of ``phyparts_pies`` get a pie chart to the
    right of the branch, the concordant count above the branch and the
    conflicting count below it. Any other node gets its name drawn in the
    aligned column.
    """
    if mynode.name in phyparts_pies:
        pie = faces.PieChartFace(phyparts_pies[mynode.name],
                                 colors=args.colors,
                                 width=50, height=50)
        pie.border.width = None
        pie.opacity = 1
        faces.add_face_to_node(pie,mynode, 0, position="branch-right")

        concord_text = faces.TextFace(str(int(concord_dict[mynode.name]))+'   ',fsize=20)
        conflict_text = faces.TextFace(str(int(conflict_dict[mynode.name]))+'   ',fsize=20)

        faces.add_face_to_node(concord_text,mynode,0,position = "branch-top")
        faces.add_face_to_node(conflict_text,mynode,0,position="branch-bottom")
    else:
        F = faces.TextFace(mynode.name,fsize=20)
        faces.add_face_to_node(F,mynode,0,position="aligned")


#Plot Pie Chart
ts = TreeStyle()
ts.show_leaf_name = False
ts.layout_fn = phyparts_pie_layout

# Zero-size node markers on every node so only the faces are visible.
nstyle = NodeStyle()
nstyle["size"] = 0
for n in plot_tree.traverse():
    n.set_style(nstyle)
    n.img_style["vt_line_width"] = 0

ts.draw_guiding_lines = True
ts.guiding_lines_color = "black"
ts.guiding_lines_type = 0
ts.scale = 30
ts.branch_vertical_margin = 10
plot_tree.convert_to_ultrametric()

if args.to_csv:
    pie_data_to_csv(phyparts_dist, phyparts_pies)

if ladderize:
    plot_tree.ladderize(direction=1)
my_svg = plot_tree.render(args.svg_name,tree_style=ts,w=595,dpi=300)

if args.show_nodes:
    # Second rendering with every node labelled by name, to match pies to
    # PhyParts node numbers.
    node_style = TreeStyle()
    node_style.show_leaf_name=False
    node_style.layout_fn = node_text_layout
    plot_tree.render("tree_nodes.pdf",tree_style=node_style)
# Reroot every tree in a file on the given outgroup taxa and print the
# rerooted trees (one Newick string per line) to standard output.
if len(sys.argv) < 3:
    # sys.exit with a string writes it to stderr and exits with status 1, so
    # the usage text never ends up in the redirected output tree file and the
    # script does not go on to fail with an IndexError.
    sys.exit("Usage: python reroot_trees.py treefile.tre outgroup.list > rerooted.tre")

# outgroup.list should be one sample name per line
with open(sys.argv[2]) as outgroup_file:
    outgroup_names = {name.rstrip() for name in outgroup_file}

with open(sys.argv[1]) as tree_file:
    for line in tree_file:
        newick = line.rstrip()
        if not newick:
            # Ignore blank lines (e.g. a trailing newline at end of file).
            continue
        t = Tree(newick)
        leaf_names = set(t.get_leaf_names())
        outgroups_in_tree = list(leaf_names & outgroup_names)

        if not outgroups_in_tree:
            # Trees containing none of the outgroup taxa are left out of the
            # output.
            continue

        if len(outgroups_in_tree) == 1:
            new_outgroup = outgroups_in_tree[0]
        else:
            new_outgroup = t.get_common_ancestor(outgroups_in_tree)
            if new_outgroup == t:
                # The outgroup taxa already span the root, so root on the
                # common ancestor of the remaining (ingroup) taxa instead.
                ingroups_in_tree = list(leaf_names.difference(outgroups_in_tree))
                new_outgroup = t.get_common_ancestor(ingroups_in_tree)

        t.set_outgroup(new_outgroup)
        print(t.write())