├── .gitignore
├── HybPiperUtils
    ├── README.md
    ├── fasta_merge.py
    └── filter_by_length.py
├── LICENSE
├── README.md
├── alleles_workflow
    ├── README.md
    ├── combine_alignments.py
    ├── create_alleles_alignments.sh
    ├── extract_phase_bcftools.sh
    ├── intron_exon_extractor.py
    └── map_to_supercontigs.sh
├── brlenoutliers
    └── brlen_outliers.py
├── haplonerate
    ├── README.md
    ├── haplonerate.py
    ├── haplonerate3N.py
    └── img
    │   └── AJB_Figure_1.pdf
├── homologizer
    ├── convert_to_nexus.py
    ├── label_swap.txt
    ├── readme.md
    ├── revbayes_template.txt
    ├── revscript_maker.py
    └── swap_labels.py
├── minorityreport
    ├── README.md
    ├── img
    │   ├── concordant.png
    │   └── conflict1.png
    └── minority_report.py
└── phypartspiecharts
    ├── PhyParts_PieCharts.ipynb
    ├── README.md
    ├── img
        ├── default_pies.jpg
        ├── pleuro_nodes.png
        └── sphag_taka.png
    ├── phyparts_example
        ├── out.concon.tre
        ├── out.hist
        ├── out.hist.alts
        ├── out.node.key
        ├── phyparts_dist.csv
        ├── phyparts_pies.csv
        ├── pies.svg
        └── species.tre
    ├── phypartspiecharts.py
    └── reroot_trees.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/HybPiperUtils/README.md:
--------------------------------------------------------------------------------
 1 | # HybPiper Utils
 2 | 
 3 | Assorted shell and Python scripts to work with data from HybPiper output.
 4 | 
 5 | ### Software
 6 | 
 7 | HybPiper: [www.github.com/mossmatters/HybPiper](https://www.github.com/mossmatters/HybPiper)
 8 | 
 9 | Most of the scripts require Python 3.0 or higher and `biopython` which can be installed using `conda`.
10 | 
11 | 
12 | #### `fasta_merge.py`
13 | 
14 | Script to merge (concatenate) alignments of different genes into a single alignment for phylogenetic analysis.
15 | 
16 | ```
17 | usage: fasta_merge.py [-h] [--fastafiles FASTAFILES [FASTAFILES ...]]
18 |                       [--filelist FILELIST] [--raxml {DNA,WAG,JTT,CODON}]
19 | 
20 | This script will take a list of FASTA files and concatenate them for use in
21 | phylogenetic inference. The sequence headers (up until the first space) must be identical
22 | in each individual FASTA file.
23 | 
24 | Individual gene sequences should be aligned prior to running this script!
25 | 
26 | This script requires BioPython to read/write FASTA sequences.
27 | 
28 | optional arguments:
29 |   -h, --help            show this help message and exit
30 |   --fastafiles FASTAFILES [FASTAFILES ...]
31 |                         List of Fasta Files. Can use wildcard on Linux/Mac systems
32 |   --filelist FILELIST   File containing list of Fasta files. Alternative to --fastafiles
33 |   --raxml {DNA,WAG,JTT,CODON}
34 |                         Create a partition file 'partition.raxml' intended for raxml in the current directory. For amino acid sequences, select the substitution model. To specify a separate model for 1st/2nd vs. 3rd codon positions, select CODON.
35 | ```
36 | 
37 | ### `filter_by_length.py`
38 | 
39 | A script for filtering gene-sample combinations based on length filters for each gene. Two filters are available: minimum length, and percentage of mean length of the targets. The script assumes you have already run `hybpiper stats` and `hybpiper retrieve_sequences`, and uses their output as input.
40 | 
41 | As of HybPiper 2.1.6, the `hybpiper retrieve_sequences` command could only filter sequences at a "whole project" level - for example, removing a sample if fewer than 20% of genes were recovered.
42 | 
43 | Suggested Workflow:
44 | 
45 | 1. Run `hybpiper stats` to generate the `stats.tsv` and `lengths.tsv` files
46 | 2. Run `hybpiper retrieve_sequences` to create a folder of FASTA sequences
47 | 3. Run this script to create new FASTA files based on the per-gene filters. 
48 |     Also writes to standard output the `denylist` by gene, redirect this to save to a file.
49 |     
50 | The FASTA sequences will expect to have the naming scheme of HybPiper:
51 | 
52 | -    `geneName.FNA` for nucleotide exon files
53 | -    `geneName.FAA` for amino acid files
54 | -    `geneName_supercontig.fasta` for supercontig files
55 | -    `geneName_intron.fasta` for intron-only files
56 |     
57 | The geneNames will be taken from either the `hybpiper stats` file (`--lengthfile`) or a supplied list of gene sample combinations (`--denylist`, also produced by running this script). There are two filters, `--length_filter` for the minimum length to accept a sequence (for all genes) and `--percent_filter` for a fraction of the mean length determined from the `seq_lengths.tsv` file for each gene. For example:
58 | 
59 | ```
60 | python filter_by_length.py --lengthfile ../seq_lengths.tsv --seq_type FNA --percent_filter 0.1 > denylist.txt  
61 | ```
62 |     
63 | If you wish to filter intron or supercontig sequences, run a second time with the `--denylist` flag to skip the filtering based on lengths or percentages:
64 | 
65 | ```
66 | python filter_by_length.py --lengthfile ../seq_lengths.tsv --seq_type supercontig --denylist ../denylist.txt 
67 | ```
68 | 


--------------------------------------------------------------------------------
/HybPiperUtils/fasta_merge.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | helptext ='''This script will take a list of FASTA files and concatenate them for use in 
  4 | phylogenetic inference. The sequence headers (up until the first space) must be identical
  5 | in each individual FASTA file.
  6 | 
  7 | Individual gene sequences should be aligned prior to running this script!
  8 | 
  9 | This script requires BioPython to read/write FASTA sequences.'''
 10 | 
 11 | import os,sys,argparse
 12 | from Bio import SeqIO
 13 | from Bio.SeqRecord import SeqRecord
 14 | from Bio.Seq import Seq
 15 | 
 16 | def read_sequences(fastafiles):
 17 |     '''Given a list of FASTA file names, read in each sequence to a dictionary of dictionaries, one per file'''
 18 |     return {filename:SeqIO.to_dict(SeqIO.parse(filename,'fasta')) for filename in fastafiles}
 19 | 
 20 | def get_unique_names(gene_dict):
 21 |     '''Given the dictionary of SeqRecord dictionaries, return a list of the unique sequence headers'''
 22 |     all_names = []
 23 |     for gene in gene_dict:
 24 |         all_names += list(gene_dict[gene].keys())
 25 |     return set(all_names)
 26 | 
 27 | def insert_sequences(gene_dict,unique_names):
 28 |     '''Given the dictionary of dictionaries, insert blank sequences if any are missing for a gene'''
 29 |     inserted_sequences = 0
 30 |     for gene in gene_dict:
 31 |         for name in unique_names:
 32 |             if name not in gene_dict[gene]:
 33 |                 gene_length = len(next(iter(gene_dict[gene].values())))
 34 |                 gene_dict[gene][name] = SeqRecord(Seq("-"*gene_length),id=name)
 35 |                 inserted_sequences += 1
 36 |     sys.stderr.write("{} Empty sequences inserted across all genes.\n".format(inserted_sequences))            
 37 |     return gene_dict
 38 | 
 39 | def concatenate_sequences(gene_dict,fastafiles,unique_names):
 40 |     '''Given a dictionary of dictionaries with complete sampling in each gene, write out concatenated sequences to stdout. Returns a list of partition lengths.'''    
 41 |     new_seq_dict = {}
 42 |     partition_lengths = []
 43 |     for gene in fastafiles:
 44 |         for name in unique_names:
 45 |             try:
 46 |                 new_seq_dict[name] += gene_dict[gene][name]
 47 |             except KeyError:
 48 |                 new_seq_dict[name] = gene_dict[gene][name]
 49 |         partition_lengths.append(len(next(iter(gene_dict[gene].values()))))
 50 |     for final_seq in new_seq_dict:
 51 |         SeqIO.write(new_seq_dict[final_seq],sys.stdout,'fasta')            
 52 |     final_seq_length = len(new_seq_dict[final_seq])
 53 |     sys.stderr.write("Final conatenated sequence length: {}\n".format(final_seq_length))
 54 |     return partition_lengths
 55 | 
 56 | def raxml_partition(fastafiles,partition_lengths,partition_type):
 57 |     '''Generate a raxml partition file for the given fastafiles. User specifies the partition type'''
 58 |     gene_start = 1
 59 |     partition_file = open("partition.raxml",'w')
 60 |     
 61 |     if partition_type == 'CODON':
 62 |         for g in range(len(fastafiles)):
 63 |             codon3_start = gene_start + 2
 64 |             codon3_end = gene_start + partition_lengths[g] - 1
 65 |             codon1_end = codon3_end - 2
 66 |             codon2_start = gene_start + 1
 67 |             codon2_end = codon3_end - 1
 68 |             partition_file.write("{},{}{}={}-{}\\3,{}-{}\\3\n".format("DNA",fastafiles[g],"12",gene_start,codon1_end,codon2_start,codon2_end))
 69 |             partition_file.write("{},{}{}={}-{}\\3\n".format("DNA",fastafiles[g],"3",codon3_start,codon3_end))
 70 |             gene_start = codon3_end + 1
 71 |     else:
 72 |         for g in range(len(fastafiles)):
 73 |             gene_end = gene_start + partition_lengths[g] - 1
 74 |             partition_file.write("{},{}={}-{}\n".format(partition_type,fastafiles[g],gene_start,gene_end))
 75 |             gene_start = gene_end + 1
 76 |         partition_file.close()    
 77 | 
 78 | 
 79 | 
 80 | 
 81 | def main():
 82 |     parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
 83 |     parser.add_argument("--fastafiles",nargs='+',help="List of Fasta Files. Can use wildcard on Linux/Mac systems")
 84 |     parser.add_argument("--filelist",help="File containing list of Fasta files. Alternative to --fastalist")
 85 |     parser.add_argument("--raxml",help="Create a partition file 'partitions.raxml' intended for raxml in the current directory. For amino acid sequences, select the substitution model. To specify a separate model for 1st/2nd vs. 3rd codon positions, select CODON.",
 86 |         choices = ['DNA','WAG','JTT','CODON'
 87 |                     ],default=None)
 88 |         
 89 |     if len(sys.argv) < 2:
 90 |         parser.print_help()
 91 |         sys.exit(1)
 92 |     
 93 |     args = parser.parse_args()
 94 |     
 95 |     if args.fastafiles:
 96 |         #print args.fastafiles
 97 |         if args.filelist:
 98 |             sys.stderr.write("Specify either a list of FASTA files or a file containing names, not both!\n")
 99 |             sys.exit(1)
100 |         else:
101 |             fastafiles = args.fastafiles    
102 |         
103 |     elif args.filelist:
104 |         #print args.filelist
105 |         if os.path.isfile(args.filelist):
106 |             fastafiles = [x.rstrip() for x in open(args.filelist)]
107 |         else:
108 |             sys.stderr.write("File containing list of FASTA files not found!")
109 |             sys.exit(1)    
110 |     
111 |     else:
112 |          sys.stderr.write("You must specify the FASTA files as a list or in a file.\n")
113 |          sys.exit(1)
114 | 
115 |     sys.stderr.write("{} FASTA files found.\n".format(len(fastafiles)))
116 |     gene_dict = read_sequences(fastafiles)
117 |     
118 |     sys.stderr.write("All sequences read successfully.\n")    
119 |     unique_names = get_unique_names(gene_dict)
120 |     sys.stderr.write("{} Unique names found. If you were expecting fewer sequences, check your IDs!\n".format(len(unique_names)))
121 |     gaps_inserted = insert_sequences(gene_dict,unique_names)
122 | 
123 |     partition_lengths = concatenate_sequences(gaps_inserted,fastafiles,unique_names)
124 |     
125 |     if args.raxml:
126 |         raxml_partition(fastafiles,partition_lengths,args.raxml)
127 | 
128 | if __name__ == "__main__":main()


--------------------------------------------------------------------------------
/HybPiperUtils/filter_by_length.py:
--------------------------------------------------------------------------------
  1 | import os,sys,argparse
  2 | 
  3 | from Bio import SeqIO
  4 | 
  5 | helptext ='''This script will filter output from HybPiper based on the output of hybpiper retrieve_sequences
  6 | 
  7 | As of HybPiper version 2.1.6, hybpiper retrieve_sequences only supports filtering based
  8 | on project-wide thresholds (i.e. number of total genes recovered). This script will allow 
  9 | filtering based on individual genes and the mean length or minimum length threshold.
 10 | 
 11 | 1. Run hybpiper stats to generate the stats.tsv and lengths.tsv files
 12 | 2. Run hybpiper retrieve_sequences to create a folder of FASTA sequences
 13 | 3. Run this script to create new FASTA files based on the per-gene filters. 
 14 |     Also writes to standard output the denylist by gene, redirect this to save to a file.
 15 |     
 16 | The FASTA sequences will expect to have the naming scheme of HybPiper:
 17 |     geneName.FNA for nucleotide exon files
 18 |     geneName.FAA for amino acid files
 19 |     geneName_supercontig.fasta for supercontig files
 20 |     geneName_intron.fasta for intron-only files
 21 |     
 22 | The geneNames will be taken from either the hybpiper stats file (--lengthfile) or a supplied
 23 |     list of gene sample combinations (--denylist, also produced by running this script)
 24 |     
 25 | If you wish to filter intron or supercontig sequences, run again with the --denylist flag
 26 |     to skip the filtering based on lengths.
 27 | '''
 28 | 
 29 | def filter_fastas(deny_dict,seq_type):
 30 |     if seq_type == "supercontig":
 31 |         seqend = "_supercontig.fasta"
 32 |     elif seq_type == "intron":
 33 |         seqend = "_intron.fasta"
 34 |     else:
 35 |         seqend = "." + seq_type
 36 |     
 37 |     fastafiles = [x for x in os.listdir() if x.endswith(seqend)]
 38 |     for f in fastafiles:
 39 |         geneName = f.replace(seqend,'')
 40 |         genedenylist = set(deny_dict[geneName])
 41 |         if seq_type == "supercontig" or  seq_type == "intron":
 42 |             newFn = f"{geneName}.filtered{seqend}"
 43 |         else:
 44 |             newFn = f"{geneName}.filtered.{seq_type}"
 45 |         with open(newFn,'w') as outfile:
 46 |             for seq in SeqIO.parse(f,'fasta'):
 47 |                 if seq.id in genedenylist:
 48 |                     continue
 49 |                 else:
 50 |                     SeqIO.write(seq,outfile,'fasta')
 51 |     return
 52 | 
 53 | def write_denylist(deny_dict):
 54 |     #with open(denylistfn,'w') as outfile:
 55 |     for gene in deny_dict:
 56 |         samples = ",".join(deny_dict[gene])
 57 |         sys.stdout.write(f"{gene}\t{samples}\n")
 58 |     return
 59 | 
 60 | def filter_seqs(gene_lengths,minLength,minPercent):
 61 |     '''Takes the sample-gene lengths and filters and returns a dictionary by gene of samples to be on the denylist'''
 62 |     
 63 |     deny_dict = {}
 64 |     total_deny = 0
 65 |     for gene in gene_lengths:
 66 |         deny_dict[gene] = []
 67 |         percentThresh = gene_lengths[gene]["mean_length"] * minPercent
 68 |         #print(gene,percentThresh)
 69 |         for sampleName in gene_lengths[gene]["sample_lengths"]:
 70 |             sampleLength = gene_lengths[gene]["sample_lengths"][sampleName]
 71 |             if sampleLength < minLength:
 72 |                 deny_dict[gene].append(sampleName)
 73 |                 total_deny += 1
 74 |                 continue
 75 |             if sampleLength < percentThresh:
 76 |                 deny_dict[gene].append(sampleName)
 77 |                 total_deny += 1
 78 |     sys.stderr.write(f"Filtered {total_deny} total sequences at {len(deny_dict)} genes based on parameters.")
 79 |     return deny_dict
 80 | 
 81 | def parse_seqlens(seqlens_fn):
 82 |     '''Takes the file name for the seqlengths output of hybpiper stats and returns:
 83 |     - a list of sample names
 84 |     - a dictionary for each gene containing:
 85 |         * the name of the gene as the dict key
 86 |         * "mean length":integer
 87 |         * "sample_lengths":{a dictionary of key:sample_lengths}'''
 88 |     
 89 |     sample_names = []
 90 |     gene_lengths = {}
 91 |     
 92 |     seqlens = open(seqlens_fn)
 93 |     genenames = seqlens.readline().rstrip().split("\t")[1:]
 94 |     meanlens = seqlens.readline().rstrip().split("\t")[1:]
 95 |     for geneNum in range(len(genenames)):
 96 |         gene_lengths[genenames[geneNum]] = {"mean_length":float(meanlens[geneNum]),"sample_lengths":{}}
 97 |     for line in seqlens:
 98 |         line = line.rstrip().split("\t")
 99 |         sampleName = line.pop(0)
100 |         sample_names.append(sampleName)
101 |         for geneNum in range(len(genenames)):
102 |             gene_lengths[genenames[geneNum]]["sample_lengths"][sampleName] = float(line[geneNum])
103 |     
104 |     return sample_names,gene_lengths
105 | 
106 | 
def parse_denylist(denylist_fn):
    '''parses the text file at denylist_fn and returns a dict with the geneName:[samplelist] pairs

    Each line holds a gene name, optionally followed by a tab and a comma-delimited
    list of samples. A gene without samples maps to an empty list. Blank lines are ignored.'''
    deny_dict = {}
    total_deny = 0
    with open(denylist_fn) as denylist:
        for line in denylist:
            fields = line.rstrip().split("\t")
            gene = fields[0]
            if not gene:
                # Blank line: would otherwise create an entry for a gene named "".
                continue
            samples = fields[1].split(",") if len(fields) > 1 else []
            total_deny += len(samples)
            deny_dict[gene] = samples
    sys.stderr.write(f"Found {total_deny} total samples at {len(deny_dict)} genes in the denylist {denylist_fn}")
    return deny_dict
121 | 
122 | 
def main():
    '''Command-line entry point.

    Builds the denylist either from an existing file (--denylist) or from the
    length table (--lengthfile) using --length_filter and --percent_filter; in the
    latter case the denylist is also written to stdout. The FASTA files of the
    requested --seq_type in the current directory are then filtered.

    Missing --seq_type, or missing both --denylist and --lengthfile, is reported
    as a usage error.'''
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--denylist",help="Text file containing gene-sample combinations to omit. \n The format of the file should be one gene per line, a tab, \n and then a comma-delimited list of samples to disallow: \n    gene[tab]sample,sample,sample ",default=None)
    parser.add_argument("--lengthfile",help="Output of hybpiper stats, with list of genes in first row, \n mean target lengths in second row, and sample recovery in other rows.")
    parser.add_argument("--seq_type",help="File seq_type for all FASTA files to filter in current directory. \n For example, the amino acid output of HybPiper would be: FAA",choices=["FNA","FAA","supercontig","intron"])
    parser.add_argument("--length_filter",help="Minimum length to allow a sequence \n in nucleotides for DNA or amino acids for protein sequences",default=0,type=int)
    parser.add_argument("--percent_filter",help="Minimum fraction (between 0 and 1) of the mean target length to allow a sequence for a gene. \n Lengths taken from HybPiper stats file.",default=0,type=float)

    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    # Report missing inputs as usage errors instead of failing later with a TypeError.
    if not args.seq_type:
        parser.error("--seq_type is required to select the FASTA files to filter")
    if not (args.denylist or args.lengthfile):
        parser.error("specify --lengthfile to build a denylist or --denylist to reuse one")

    if args.denylist:
        deny_dict = parse_denylist(args.denylist)
    else:
        sample_names,gene_lengths = parse_seqlens(args.lengthfile)
        deny_dict = filter_seqs(gene_lengths,args.length_filter,args.percent_filter)
        write_denylist(deny_dict)

    filter_fastas(deny_dict,args.seq_type)


if __name__ == "__main__":main()


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Matt Johnson
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # phyloscripts by mossmatters
 2 | ----
 3 | Helper scripts for processing and visualization of phylogenetics datasets, mostly written in Python. For more information about each of the scripts, visit the appropriate subdirectory.
 4 | 
 5 | 
 6 | # [Alleles Workflow](https://github.com/mossmatters/phyloscripts/tree/master/alleles_workflow)
 7 | 
 8 | Python and shell scripts for phasing alleles from HybSeq data, as done in [Kates et al. AJB 2018](https://www.ncbi.nlm.nih.gov/pubmed/29729187).
 9 | 
10 | # [PhypartsPieCharts](https://github.com/mossmatters/phyloscripts/tree/master/phypartspiecharts)
11 | 
12 | Script for plotting pie charts from bipartition analysis on a species phylogeny.
13 | 
14 | 
15 | # [HybPiperUtils](https://github.com/mossmatters/phyloscripts/tree/master/HybPiperUtils)
16 | 
17 | Scripts for working with target capture data recovered or assembled using [HybPiper](https://github.com/mossmatters/hybpiper), including `fasta_merge.py` from version 1.3.1.
18 | 
19 | # [Minority Report](https://github.com/mossmatters/phyloscripts/tree/master/minorityreport)
20 | 
21 | Display minority bipartitions on a species tree.
22 | 
23 | # [Haplonerate](https://github.com/mossmatters/phyloscripts/tree/master/haplonerate)
24 | 
25 | Adjust phased allelic sequences to have phased alleles only in the largest phase block, with ambiguity codes elsewhere. Used in [Kates et al. AJB 2018](https://www.ncbi.nlm.nih.gov/pubmed/29729187)
26 | 
27 | # Branch Length Outlier
28 | 
29 | Identify extreme branch lengths in a set of gene trees. Generates images of offending trees so that manual curation of every gene tree is not necessary.
30 | 


--------------------------------------------------------------------------------
/alleles_workflow/README.md:
--------------------------------------------------------------------------------
  1 | # Alleles from HybSeq Data
  2 | 
  3 | Assorted shell and Python scripts to generate allele data from HybPiper output.
  4 | 
  5 | ### Software
  6 | 
  7 | HybPiper (to generate supercontigs): www.github.com/mossmatters/HybPiper
  8 | 
  9 | Picard: http://broadinstitute.github.io/picard/
 10 | 
 11 | GATK: https://software.broadinstitute.org/gatk/download/
 12 | 
 13 | WhatsHap: http://whatshap.readthedocs.io
 14 | 
 15 | 
 16 | ### Prerequisites
 17 | 
 18 | Run the main HybPiper script `reads_first.py` followed by `intronerate.py` to generate supercontigs for each recovered gene. This will function as "reference sequence" for identifying variants.
 19 | 
 20 | 
 21 | Concatenate all the supercontigs into a single file. Given a directory generated by HybPiper called `prefix`:
 22 | 
 23 | ```cat prefix/*/prefix/sequences/intron/*_supercontig.fasta > prefix.supercontigs.fasta```
 24 | 
 25 | Concatenate all of the GFF intron/exon boundary annotations:
 26 | 
 27 | ```cat prefix/*/prefix/sequences/intronerate.gff > prefix.intronerate.fasta```
 28 | 
 29 | ### `map_to_supercontigs.sh`
 30 | 
 31 | This script will:
 32 | 
 33 | 1. Map paired-end reads to the supercontigs using `bwa mem`.
 34 | 2. Remove duplicate reads using `picard`.
 35 | 3. Identify variant sites using `gatk HaplotypeCaller`
 36 | 4. Filter variant sites to identify only SNPs.
 37 | 5. Generate a new set of reference sequences with SNPs replaced by IUPAC ambiguity codes, using `gatk FastaAlternateReferenceMaker`
 38 | 
 39 | **IMPORTANT: Modify the lines indicating paths to the picard and gatk jarfiles!**
 40 | 
 41 | 
 42 | Command line:
 43 | `bash map_to_supercontigs.sh HybPiperPrefix read1.fq read2.fq`
 44 | 
 45 | The output will be a directory containing several BAM files, a VCF file containing all variants and another with only SNPs, and an alternate reference file with SNPs replaced with IUPAC ambiguity bases.
 46 | 
 47 | ### Phase alleles with WhatsHap
 48 | 
 49 | **Note**: WhatsHap requires Python 3. I used a conda environment specifically for whatshap to avoid conflicts with other Python environments.
 50 | 
 51 | Running the above script on each sample will create a set of directories containing the BAM and VCF files. To use the read data to phase this, first create a text file containing the names of these directories, one per line, called `namelist.txt`. To loop over these and generate the Phased output:
 52 | 
 53 | ```
 54 | while read i; do cd $i; whatshap phase -o \
 55 | $i.supercontigs.fasta.snps.whatshap.vcf \
 56 | $i.supercontigs.fasta.snps.vcf \
 57 | $i.supercontigs.fasta.marked.bam; 
 58 | whatshap stats \
 59 | --gtf $i.whatshap.gtf \
 60 | --tsv $i.whatshap.stats.tsv \
 61 | $i.supercontigs.fasta.snps.whatshap.vcf; 
 62 | cd ..; 
 63 | done < ../namelist.txt
 64 | ```
 65 | This will create a new VCF with phase information, and the stats command will also generate a table summarizing the phasing results.
 66 | 
 67 | ### `extract_phase_bcftools.sh`
 68 | 
 69 | Run this script to generate separate files for each of the alleles generated by WhatsHap using `bcftools`.
 70 | 
 71 | Command: `extract_phase_bcftools.sh prefix`
 72 | 
 73 | 
 74 | `haplonerate.py` can be found in this same GitHub Repository.
 75 | 
 76 | The files are now ready to run `haplonerate.py` which can be found here: https://github.com/mossmatters/phyloscripts/tree/master/haplonerate
 77 | 
 78 | Name the output file from `haplonerate.py` as `prefix.supercontigs.alleles.fasta`
 79 | 
 80 | ### `intron_exon_extractor.py`
 81 | 
 82 | Generates separate files for the intron, exon, and supercontig sequences for one sample. Will generate a separate file for each sequence. Can handle default HybPiper, IUPAC-coded, and phased-allele datasets where `_h1` and `_h2` are appended to the sample name.
 83 | 
 84 | Command: `python intron_exon_extractor.py prefix`
 85 | 
 86 | ### `create_alleles_alignments.sh`
 87 | 
 88 | Generate combined intron and exon alignments from phased sequences:
 89 | 
 90 | 1. Align exon sequences using MACSE
 91 | 2. Align intron sequences using mafft
 92 | 3. Trim both alignments using TrimAl
 93 | 4. Combine intron and exon alignments and generate a RAXML partition file using `combine_alignments.py`
 94 | 
 95 | **NOTE: This script is provided only to show parameter settings! The PATH to files and executables is specific to our computer!**
 96 | 
 97 | ### `combine_alignments.py`
 98 | 
 99 | Given an exon alignment and an intron alignment for the same gene, combine them and also output a partition file for RAXML based on codon position and intron location.
100 | 
101 | Command `python combine_alignments.py exon.fasta intron.fasta geneName`
102 | 
103 | *Requires BioPython*


--------------------------------------------------------------------------------
/alleles_workflow/combine_alignments.py:
--------------------------------------------------------------------------------
 1 | 
 2 | #Script to combine exon and intron alignments for a gene and generate a RAxML partition file.
 3 | 
 4 | import sys,os
 5 | from Bio import SeqIO
 6 | from Bio.Seq import Seq
 7 | from Bio.SeqRecord import SeqRecord
 8 | 
 9 | if len(sys.argv) < 4:
10 |     print("Usage: python combine_alignments.py exon.fasta intron.fasta geneName")
11 |     sys.exit(1)
12 |     
13 | exon_fn = sys.argv[1]
14 | intron_fn = sys.argv[2]
15 | geneName = sys.argv[3]
16 | 
17 | exon_dict = SeqIO.to_dict(SeqIO.parse(exon_fn,'fasta'))
18 | exonLength = len(next(exon_dict.itervalues()))
19 | with open("{}.combined.fasta".format(geneName),'w') as outfile:
20 |     
21 |     if os.path.isfile(intron_fn):
22 |         for seq in SeqIO.parse(intron_fn,'fasta'):
23 |             intronLength = len(seq)
24 |             sampleID = seq.id.split("-")[0]
25 |             newseq = exon_dict[sampleID].seq + seq.seq
26 |             outfile.write(">{}\n{}\n".format(sampleID,newseq))
27 |         partition = """DNA, codons1-2 = 1-{}\\3, 2-{}\\3
28 | DNA, codon3 = 3-{}\\3
29 | DNA, intron = {}-{}
30 |  
31 | """.format(exonLength, exonLength, exonLength, exonLength+1,exonLength+intronLength)
32 |            
33 |             
34 |             
35 |             
36 |             
37 |             
38 |             
39 | #        if seq.id.startswith("McBryde"):
40 | #            sampleID = "MV2"
41 | #        else:
42 |         
43 |             
44 |             
45 |             
46 |     else:
47 |         for sampleID in exon_dict:
48 |             newseq = exon_dict[sampleID].seq
49 |             outfile.write(">{}\n{}\n".format(sampleID,newseq))
50 |         partition = """DNA, codons1-2 = 1-{}\\3, 2-{}\\3
51 | DNA, codon3 = 3-{}\\3 
52 | """.format(exonLength, exonLength, exonLength, exonLength+1)
53 | 
54 |     
55 |         
56 | 
57 | 
58 | 
59 | 
60 | with open("{}.combined.partition".format(geneName),'w') as partitionfile:
61 |     partitionfile.write(partition)


--------------------------------------------------------------------------------
/alleles_workflow/create_alleles_alignments.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #PBS -q default
 4 | #PBS -l nodes=1:ppn=12
 5 | #PBS -j oe
 6 | #PBS -o iupac_alignments.out
 7 | 
 8 | set -eo pipefail
 9 | 
10 | # Shell script to recreate the IUPAC ambiguity coded alignments for Artocarpus.
11 | 
12 | 
13 | cd ~/Projects/artocarpus/alleles_paper/haplotype_sequences
14 | 
15 | genelist=/home/mjohnson/Projects/artocarpus/alleles_paper/singlecopy_genelist.txt
16 | namelist=/home/mjohnson/Projects/artocarpus/alleles_paper/namelist_ajb.txt
17 | 
18 | ##########EXONS############
19 | 
20 | ###### Exon sequences generated from HybPiper output:
21 | 
22 | mkdir -p exon
23 | rm exon/* 
24 | parallel "cat /home/mjohnson/Projects/artocarpus/alleles_paper/haplotype_sequences/{1}/exon/{2}.alleles.FNA >> exon/{2}.alleles.FNA" :::: $namelist :::: $genelist
25 | 
26 | ##### Alignments with MACSE
27 | 
28 | parallel --eta macse -prog alignSequences -seq exon/{}.alleles.FNA :::: $genelist
29 | 
30 | ##### Replace frame shifts ! with gaps -
31 | 
32 | mkdir -p macse
33 | rm macse/*
34 | mv exon/*_macse* macse
35 | 
36 | parallel sed -i "s/\!/-/g" macse/{}.alleles_macse_NT.fasta :::: $genelist
37 | 
38 | ##### Trim alignments to retain only sites present in 75% of taxa
39 | 
40 | mkdir -p exon_trimmed
41 | rm exon_trimmed/*
42 | 
43 | parallel "trimal -gt 0.75 -in macse/{}.alleles_macse_NT.fasta -out exon_trimmed/{}.alleles.macse.trimmed.FNA" :::: $genelist
44 | 
45 | #Fix McBryde-MV2
46 | 
47 | parallel sed -i -E 's/McBryde-MV2/McBryde/g' exon_trimmed/{}.macse.trimmed.FNA :::: $genelist
48 | 
49 | ###########INTRONS#########
50 | 
51 | ##### Intron sequences generated from HybPiper (intronerate.py):
52 | 
53 | mkdir -p intron
54 | rm intron/*
55 | parallel "cat /home/mjohnson/Projects/artocarpus/alleles_paper/haplotype_sequences/{1}/intron/{2}.intron.alleles.fasta >> intron/{2}.intron.alleles.fasta" :::: $namelist :::: $genelist
56 | 
57 | # Remove gene name from intron sequence files
58 | 
59 | #parallel sed -i -E 's/-.+$//g' intron/{}.intron.alleles.fasta :::: $genelist
60 | 
61 | # Align intron sequences with MAFFT. Timeout because of known huge sequence.
62 | mkdir -p mafft
63 | rm mafft/*
64 | 
65 | parallel --timeout 4000% --eta "mafft --maxiterate 1000 --globalpair --preservecase intron/{}.intron.alleles.fasta > mafft/{}.intron.alleles.mafft.fasta" :::: $genelist
66 | 
67 | # Trim alignments to retain only sites present in 75% of taxa
68 | 
69 | mkdir -p intron_trimmed
70 | rm intron_trimmed/*
71 | 
72 | parallel "trimal -gt 0.75 -in mafft/{}.intron.alleles.mafft.fasta -out intron_trimmed/{}.intron.alleles.mafft.trimmed.fasta" :::: $genelist
73 | 
74 | 
75 | # Combine alignments
76 | 
77 | #parallel python ../../combine_alignments.py exon_trimmed/{}.iupac.macse.trimmed.FNA intron_trimmed/{}.intron.iupac.mafft.trimmed.fasta {} :::: $genelist
78 | 
79 | #mkdir -p ../artocarpus_alignments/default/
80 | #mv *.fasta ../artocarpus_alignments/default/
81 | #mv *.partition ../artocarpus_alignments/default
82 | 


--------------------------------------------------------------------------------
/alleles_workflow/extract_phase_bcftools.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eo pipefail
 3 | #Script to prepare phased haplotype sequences for each for one sample. 
 4 | 
 5 | prefix=$1
 6 | genelist=genelist.txt
 7 | mkdir -p $prefix
 8 | cd $prefix
 9 | rm -r *
10 | 
11 | #Run bcftools to extract sequences
12 | 
13 | bgzip -c $prefix.supercontigs.fasta.snps.whatshap.vcf > $prefix.supercontigs.fasta.snps.whatshap.vcf.gz
14 | tabix $prefix.supercontigs.fasta.snps.whatshap.vcf.gz
15 | mkdir -p phased_bcftools
16 | rm phased_bcftools/*
17 | 
18 | parallel "samtools faidx $iupac_dir/$prefix.supercontigs.fasta $prefix-{1} | bcftools consensus -H 1 $prefix.supercontigs.fasta.snps.whatshap.vcf.gz > phased_bcftools/$prefix-{1}.phased.fasta" :::: $genelist 
19 | parallel "samtools faidx $iupac_dir/$prefix.supercontigs.fasta $prefix-{1} | bcftools consensus -H 2 $prefix.supercontigs.fasta.snps.whatshap.vcf.gz >> phased_bcftools/$prefix-{1}.phased.fasta" :::: $genelist 
20 | 
21 | cd ..
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/alleles_workflow/intron_exon_extractor.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # Script to use the GFF files from intronerate and the ambiguity-encoded FASTA files to generate separate intron and exon files for each gene.
  3 | 
  4 | import re,sys,os,errno,shutil
  5 | from Bio import SeqIO
  6 | from Bio.Seq import Seq
  7 | from Bio.SeqRecord import SeqRecord
  8 | 
  9 | def mkdir_p(path):
 10 |     try:
 11 |         os.makedirs(path)
 12 |     except OSError as exc: # Python >2.5
 13 |         if exc.errno == errno.EEXIST and os.path.isdir(path):
 14 |             pass
 15 |         else: raise
 16 | 
 17 | my_re = re.compile(r"([0-9]+ )(.+):1")
 18 | 
 19 | #Fix the names in IUPAC file
 20 | 
 21 | prefix=sys.argv[1]
 22 | 
 23 | if os.path.isdir(prefix):
 24 |     os.chdir(prefix)
 25 | 
 26 | if os.path.isfile("{}.supercontigs.fasta.iupac".format(prefix)):
 27 |     with open("{}.supercontigs.iupac.fasta".format(prefix),'w') as outfile:
 28 |         for seq in SeqIO.parse("{}.supercontigs.fasta.iupac".format(prefix),'fasta'):
 29 |             seq.id = my_re.sub("\g<2>",seq.description)
 30 |             seq.description = ''
 31 |             SeqIO.write(seq,outfile,'fasta')
 32 |         
 33 | 
 34 | #Parse GFF into dictionaries for each gene (one for introns, one for exons)
 35 | # ASSUMES THE GFF IS SORTED WITHIN EACH GENE!!!!
 36 | 
 37 | intron_dict = {}
 38 | exon_dict = {}
 39 | 
 40 | gff_fn = prefix+".intronerate.gff"#sys.argv[2]
 41 | for line in open(gff_fn):
 42 |     line=line.split()
 43 |     if line[2] == "exon":
 44 |         try:
 45 |             exon_dict[line[0]].append((int(line[3])-1,int(line[4])))
 46 |         except KeyError:
 47 |             exon_dict[line[0]] = [(int(line[3])-1,int(line[4]))]
 48 | #    elif line[2] == "intron":
 49 | #        try:
 50 | #            intron_dict[line[0]].append((int(line[3])-1,int(line[4])))
 51 | #        except KeyError:
 52 | #            intron_dict[line[0]] = [(int(line[3])-1,int(line[4]))]
 53 | 
 54 | try:
 55 |     supercontig_dict = SeqIO.to_dict(SeqIO.parse("{}.supercontigs.iupac.fasta".format(prefix),'fasta'))
 56 |     dataType = "iupac"
 57 | except IOError:
 58 |     try:
 59 |         supercontig_dict = SeqIO.to_dict(SeqIO.parse("{}.supercontigs.alleles.fasta".format(prefix),'fasta'))
 60 |         dataType = "alleles"
 61 |     except IOError:
 62 |         try:
 63 |             supercontig_dict = SeqIO.to_dict(SeqIO.parse("{}.supercontigs.svdq.fasta".format(prefix),'fasta'))
 64 |             dataType = "svdq"
 65 |         except IOError:
 66 |             supercontig_dict = SeqIO.to_dict(SeqIO.parse("{}.supercontigs.default.fasta".format(prefix),'fasta'))
 67 |             dataType = 'default'
 68 |         
 69 |     
 70 | for gene in exon_dict:
 71 |     try:
 72 |         geneLength = len(supercontig_dict[gene])
 73 |     except KeyError:
 74 |         haploGeneName = "{}_h1-{}".format(prefix,gene.split("-")[-1])
 75 |         geneLength = len(supercontig_dict[haploGeneName])
 76 |     exon_ranges = exon_dict[gene]
 77 | #    intron_dict[gene] = [(0,exon_dict[0][0]),exon_dict[0][1]]
 78 |     for exon_interval in range(len(exon_ranges)+1):
 79 |         if exon_interval == 0:
 80 |             intron_dict[gene] = [(0,exon_ranges[exon_interval][0]-1)]
 81 |         elif exon_interval == len(exon_ranges)  :
 82 |             intron_dict[gene].append((exon_ranges[-1][1],geneLength))
 83 | 
 84 |         else:
 85 |             start = exon_ranges[exon_interval - 1][1] 
 86 |             stop = exon_ranges[exon_interval][0] - 1
 87 |             intron_dict[gene].append((start,stop))
 88 | #print(intron_dict["NZ866-gene001.single"])
 89 |     
 90 |     
 91 | 
 92 |             
 93 | 
 94 | newseq = ''
 95 | 
 96 | for seqType in ["exon","intron","supercontig"]:
 97 |     if os.path.exists(seqType):
 98 |         shutil.rmtree(seqType)
 99 |     os.makedirs(seqType)
100 | 
101 | for gene in supercontig_dict:
102 |     if gene.startswith("McBryde"):
103 |         geneName = gene.split("-")[2]
104 |         sampleName = gene.split("-")[1]
105 |     else:
106 |         geneName = gene.split("-")[1]
107 |         sampleName = gene.split("-")[0]
108 |             
109 |     with open("exon/{}.{}.FNA".format(geneName,dataType),'a') as exonout:
110 |         newseq = ''
111 |         exonLookupName = supercontig_dict[gene].id.replace("_h1",'')
112 |         exonLookupName = exonLookupName.replace("_h2",'')
113 |         if exonLookupName not in exon_dict:
114 |             continue
115 |         for gff_interval in exon_dict[exonLookupName]:
116 |             newseq += supercontig_dict[gene].seq[gff_interval[0]:gff_interval[1]]
117 |         exonout.write(">{}\n{}\n".format(sampleName,newseq))
118 |     with open("intron/{}.intron.{}.fasta".format(geneName,dataType),'a') as intronout:
119 |         newseq=''
120 |         for gff_interval in intron_dict[exonLookupName]:
121 |             newseq += supercontig_dict[gene].seq[gff_interval[0]:gff_interval[1]]
122 |         intronout.write(">{}\n{}\n".format(sampleName,newseq))
123 |     
124 |     with open("supercontig/{}.supercontig.{}.fasta".format(geneName,dataType),'a') as supercontigout:
125 |         supercontigout.write(">{}\n{}\n".format(sampleName,supercontig_dict[gene].seq))
126 |         


--------------------------------------------------------------------------------
/alleles_workflow/map_to_supercontigs.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #PBS -q default
  4 | #PBS -j oe
  5 | #PBS -o ambiguity.out
  6 | #PBS -l nodes=1:ppn=1
  7 | #PBS -t 1-24
  8 | 
  9 | #cd $TMPDIR #Projects/artocarpus/alleles_paper/iupac_sequences
 10 | #prefix=$(tail -n $PBS_ARRAYID /home/mjohnson/Projects/artocarpus/alleles_paper/namelist_ajb.txt | head -1)
 11 | 
 12 | # This workflow will take the supercontig output of HybPiper and return a supercontig that 
 13 | # contains heterozygous positions as ambiguity bases. Uses paired reads.
 14 | 
 15 | #The script should be run on a FASTA file containing all the supercontigs of interest.
 16 | 
 17 | 
 18 | if [[ $# -eq 0 ]] ; then
 19 |     echo 'usage: hybpiper_ambiguity.sh supercontig.fasta readfile1.fq readfile2.fq'
 20 |     exit 1
 21 | fi
 22 | 
 23 | #########CHANGE THESE PATHS AS NEEDED###########
 24 | 
 25 | gatkpath=/opt/Software/GenomeAnalysisTK.jar
 26 | picardpath=/opt/Software/picard/build/libs/picard.jar
 27 | 
 28 | #############COMMAND LINE ARGUMENTS############
 29 | 
 30 | prefix=$1
 31 | read1fq=$2
 32 | read2fq=$3
 33 | 
 34 | mkdir $prefix
 35 | cd $prefix
 36 | 
 37 | #while read i
 38 | #do
 39 | #cat ~/Projects/artocarpus/alleles_paper/hybpiper/$prefix/$i/$prefix/sequences/intron/"$i"_supercontig.fasta
 40 | #done < ~/Projects/artocarpus/alleles_paper/newtargets_genelist.txt >> $prefix.supercontigs.fasta
 41 | 
 42 | supercontig=$prefix.supercontigs.fasta
 43 | 
 44 | #read1fq=~/Projects/artocarpus/alleles_paper/reads/"$prefix".R1.paired.fastq
 45 | #read2fq=~/Projects/artocarpus/alleles_paper/reads/"$prefix".R2.paired.fastq
 46 | 
 47 | #####STEP ZERO: Make Reference Databases
 48 | 
 49 | java -jar $picardpath CreateSequenceDictionary \
 50 | R=$supercontig 
 51 | bwa index $supercontig
 52 | samtools faidx $supercontig
 53 | 
 54 | #####STEP ONE: Map reads
 55 | 
 56 | echo "Mapping Reads"
 57 | 
 58 | bwa mem $supercontig $read1fq $read2fq | samtools view -bS - | samtools sort - -o $supercontig.sorted.bam
 59 | 
 60 | java -jar $picardpath FastqToSam  \
 61 | F1=$read1fq \
 62 | F2=$read2fq \
 63 | O=$supercontig.unmapped.bam \
 64 | SM=$supercontig
 65 | 
 66 | java -jar $picardpath MergeBamAlignment \
 67 | ALIGNED=$supercontig.sorted.bam \
 68 | UNMAPPED=$supercontig.unmapped.bam \
 69 | O=$supercontig.merged.bam \
 70 | R=$supercontig
 71 | 
 72 | #####STEP TWO: Mark duplicates
 73 | 
 74 | echo "Marking Duplicates"
 75 | java -jar $picardpath MarkDuplicates \
 76 | I=$supercontig.merged.bam \
 77 | O=$supercontig.marked.bam \
 78 | M=$supercontig.metrics.txt
 79 | 
 80 | #######STEP THREE: Identify variants, select only SNPs
 81 | 
 82 | echo "Identifying variants"
 83 | 
 84 | samtools index $supercontig.marked.bam
 85 | #samtools mpileup -B -f $supercontig $supercontig.marked.bam -v -u > $supercontig.vcf
 86 | 
 87 | java -jar $gatkpath \
 88 | -R $supercontig \
 89 | -T HaplotypeCaller \
 90 | -I $supercontig.marked.bam \
 91 | -o $supercontig.vcf
 92 | 
 93 | 
 94 | 
 95 | time java -jar $gatkpath \
 96 | -T SelectVariants \
 97 | -R $supercontig \
 98 | -V $supercontig.vcf \
 99 | -selectType SNP \
100 | -o $supercontig.snps.vcf 
101 | 
102 | 
103 | ######STEP FOUR: Output new supercontig FASTA with ambiguity codes
104 | 
105 | echo "Generating IUPAC FASTA file"
106 | 
107 | java -jar $gatkpath \
108 | -T FastaAlternateReferenceMaker \
109 | -R $supercontig \
110 | -o $supercontig.iupac \
111 | -V $supercontig.snps.vcf \
112 | -IUPAC $supercontig
113 | 
114 | cd ..
115 | cp -r $prefix /home/mjohnson/Projects/artocarpus/alleles_paper/iupac_sequences/$prefix
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 


--------------------------------------------------------------------------------
/brlenoutliers/brlen_outliers.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | helptext='''This script identfies branch length outliers on a phylogeny. An outlier is a 
  4 | branch with a length that exceeds a percentage of the maximum depth of the tree.
  5 | 
  6 | The input is a file containing one tree in newick format, and an optional file containing
  7 | a list of outgroup taxa (one per line). 
  8 | 
  9 | The output will be an ASCII depiction of each branch with a length exceeding the threshold
 10 | (default 25% for ingroups, 75% for outgroups). A PNG file will also be generated for the 
 11 | tree, with outgroup taxa in blue and branch length outliers in red.
 12 | 
 13 | Dependencies: 
 14 | Python > 2.7
 15 | ETE3 installed with all graphical dependencies
 16 | '''
 17 | 
 18 | #Given a tree, determine if there are long branch lengths
 19 | 
 20 | import sys, argparse, os
 21 | from ete3 import Tree,TreeStyle,TextFace,NodeStyle
 22 | 
 23 | 
 24 | # outgroups=set([x.rstrip() for x in open(sys.argv[2])])
 25 | # outgroups_in_tree = list(set(t.get_leaf_names()).intersection(set(outgroups)))
 26 | # ingroups_in_tree = list(set(t.get_leaf_names()).difference(set(outgroups)))
 27 | # 
 28 | # if len(outgroups_in_tree) > 1:
 29 | #     ancestor = t.get_common_ancestor(outgroups_in_tree)
 30 | #     try:
 31 | #         t.set_outgroup(ancestor)
 32 | #         #ingroup_monophyly = t.check_monophyly(ingroups_in_tree,"name")
 33 | #         #if not ingroup_monophyly[0]:
 34 | #         #    sys.stdout.write("Ingroup polyphyletic! for {}\n".format(sys.argv[1]))
 35 | #         #    print(ingroup_monophyly)
 36 | #     except:
 37 | #         sys.stdout.write("Ingroup not monophyletic for {}!\n".format(sys.argv[1]))
 38 | #         sys.exit(1)
 39 | #     #print t.write()
 40 | # elif len(outgroups_in_tree) == 1:
 41 | #     t.set_outgroup(outgroups_in_tree[0])
 42 | #     ancestor = t.get_leaves_by_name(outgroups_in_tree[0])[0]
 43 | #     #print t.write()
 44 | # else:
 45 | #     sys.stdout.write("no outgroups found for {}!\n".format(sys.argv[1]))
 46 | #     sys.exit(1)
 47 | 
 48 | #ingroup = ancestor.get_sisters()[0].detach()
 49 | #ingroup_depth = ingroup.get_farthest_node()[1]
 50 | 
 51 | 
 52 | 
 53 | #print(outgroups)
 54 | def get_bad_nodes(t,inlen,outlen,leaflen,outgroups=None):
 55 |     bad_nodes = []
 56 |     tree_depth = t.get_farthest_node()[1]
 57 |     for node in t.traverse():
 58 |         isOutgroup = True
 59 |         for leaf in node.get_leaves():
 60 |             if outgroups:
 61 |                 if leaf.name not in outgroups:
 62 |                     isOutgroup = False
 63 |             else:
 64 |                 isOutgroup=False
 65 |         if isOutgroup:
 66 |             if node.dist > tree_depth * outlen:
 67 |                 print(node)
 68 |                 bad_nodes.append(node)
 69 |         elif node.is_leaf():
 70 |             if node.dist > tree_depth * leaflen:
 71 |                 print(node)
 72 |                 bad_nodes.append(node)
 73 |         elif node.dist > tree_depth * inlen:
 74 |             print(node)
 75 |             bad_nodes.append(node)
 76 |     return bad_nodes        
 77 | 
 78 | def make_png(t,bad_nodes,png_name,outgroups=None):
 79 |     for n in t.traverse():
 80 |         n.img_style["size"]=0
 81 |         if n in bad_nodes:
 82 |             nstyle = NodeStyle()
 83 |             nstyle["hz_line_color"] = "red"
 84 |             nstyle["hz_line_width"] = 3
 85 |             n.set_style(nstyle)
 86 |         if n.is_leaf():
 87 |             if outgroups:
 88 |                 if n.name in outgroups:
 89 |                     name_face = TextFace(n.name, fgcolor="blue")
 90 |                 else:
 91 |                     name_face = TextFace(n.name, fgcolor="black")
 92 |             else:
 93 |                 name_face = TextFace(n.name, fgcolor="black")    
 94 |             n.add_face(name_face,0,"branch-right")
 95 | 
 96 |     if len(bad_nodes) > 0:
 97 |         ts = TreeStyle()
 98 |         ts.show_leaf_name = False
 99 |         gene_name = png_name
100 |         ts.title.add_face(TextFace(gene_name,fsize=15,bold=True),0)
101 |         my_png = t.render(png_name,tree_style=ts)     
102 | 
def main():
    """Command-line entry point.

    Reads the newick tree, collects branch length outliers with
    get_bad_nodes() and, when any are found, renders the tree with
    make_png(). Exits with status 1 when called without arguments or when
    the tree file does not exist.
    """
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("treefile",help="File containing one tree in newick format")
    parser.add_argument("--outgroups",help="File containing list of outgroup taxa, one per line")
    parser.add_argument("--png",help="Name of png file, default is same as tree file name",default=None)
    parser.add_argument("--inlen",help="Percentage of max tree depth for ingroup outliers default = %(default)s",default=0.25,type=float)
    parser.add_argument("--outlen",help="Percentage of max tree depth for outgroup outliers default = %(default)s",default=0.75,type=float)
    parser.add_argument("--leaflen",help="Percentage of max tree depth for leaf outliers default = %(default)s",default=0.25,type=float)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    if not os.path.isfile(args.treefile):
        print("Treefile {} not found!\n".format(args.treefile))
        sys.exit(1)
    t = Tree(args.treefile)

    if args.png:
        # Make sure the output name carries the .png extension.
        png_name = args.png if args.png.endswith(".png") else args.png + ".png"
    else:
        # Default: tree file name up to its first dot, plus .png.
        png_name = os.path.basename(args.treefile).split(".")[0] + ".png"

    outgroups = None
    if args.outgroups:
        # Close the file promptly and ignore blank lines, which would
        # otherwise add an empty taxon name to the set.
        with open(args.outgroups) as outgroup_file:
            outgroups = set(name for name in (line.rstrip() for line in outgroup_file) if name)

    bad_nodes = get_bad_nodes(t,args.inlen,args.outlen,args.leaflen,outgroups=outgroups)
    if len(bad_nodes) > 0:
        make_png(t,bad_nodes,png_name,outgroups=outgroups)
138 |     #else:
139 |         #print("No outliers found for {}\n".format(os.path.basename(args.treefile)))
140 |         
141 |     
142 |     
143 |     
144 |     
145 |     
146 |     
147 |     
148 |     
149 | if __name__ == "__main__":main()    
150 |     
151 |     
152 | 
153 | 
154 | 


--------------------------------------------------------------------------------
/haplonerate/README.md:
--------------------------------------------------------------------------------
 1 | # Haplonerate
 2 | 
 3 | After using a read-backed phasing algorithm, such as GATK or WhatsHap, allelic sequences generally contain phased alleles throughout the entire reference sequence. However, there is frequently more than one phase block, especially if the data was generated with targeted sequencing (HybSeq) where multiple exons were recovered from a single locus.
 4 | 
 5 | In the example below, variant sites in the blue (left) block cannot be phased with alleles in the yellow (right) block.
 6 | 
 7 | ![](img/AJB_Figure_1.pdf)
 8 | 
 9 | One solution is to reduce the sequence to the longest phase block, deleting other sites. This is not ideal for phylogenetic analysis, as the deleted sites may retain informative sites among species. It will also result in sequences that are of variable lengths across individuals, which may affect alignment.
10 | 
11 | Another solution would be to (hard) mask the sequence outside the longest phase block with Ns. This retains the sequence length but intra-individual informative sites are still lost.
12 | 
Finally, phased alleles can be retained in the longest phase block, while variant sites in other regions are replaced with ambiguity codes. 
14 | 
15 | This script takes two files, containing phased haplotype sequences for one
16 | or more genes and edits the sequences to retain only variable sites in the largest phase
17 | block. The two haplotype sequences can be generated by bcftools, for example.
18 | 
19 | 
20 | 
21 | 
22 | ## Setup
23 | 
1. Following variant calling, generate a new reference sequence that contains ambiguity codes. In GATK, use `FastaAlternateReferenceMaker`. 
2. Run a read-backed phasing algorithm, such as WhatsHap (http://whatshap.readthedocs.io/), which generates a phased VCF file and a GTF file containing the locations of phase blocks.
3. Generate separate phased sequences for each gene in FASTA format, for example using `bcftools consensus` in the samtools package.
4. Run `haplonerate.py` to adjust the sequences.
28 | 
29 | 
30 | ## Input
31 | 
32 | **Required**: 
33 | 
34 | * GTF file annotating the locations of phase blocks.
* Two FASTA files containing sequences for one or more genes. The script assumes that the sequences are paired: the first sequence in the first file corresponds to the first sequence in the second file.
36 | 
37 | **Options**
38 | 
39 | `--output` Specifies an output file for the edited sequences (default is `stdout`). Both alleles are written to the same file with `_h1` or `_h2` appended to the name.
40 | 
41 | `--block` Specifies a file for printing the phase block information for each gene (number of blocks, length of gene, and length of longest block).
42 | 
43 | `--edit` Which editing method is preferred. There are three options for editing the output sequences:
44 | 
45 | * **delete**: retain only the longest phase block, delete the rest of the sequence
46 | * **ref**: use reference sequences to fill the rest of the sequence outside the longest block (default)
47 | * **mask**: fill sequence not in the longest phase block with N
48 | 
49 | If `--edit ref` is used, the reference sequence must be supplied with `--reference`.
50 | 
51 | ## Example Usages
52 | 
53 | **Default usage (`--edit ref`)**
54 | 
55 | `haplonerate.py whatshap.gtf haplotype_h1.fasta haplotype_h2.fasta --reference ambiguity_ref.fasta`
56 | 
57 | ### Triploid Data
58 | 
59 | Use the `haplonerate3N.py` script instead:
60 | 
`haplonerate3N.py whatshap.gtf haplotype_h1.fasta haplotype_h2.fasta haplotype_h3.fasta --reference ambiguity_ref.fasta --edit delete`
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/haplonerate/haplonerate.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | import sys,os,argparse
  5 | from Bio import SeqIO
  6 | from Bio.Seq import Seq
  7 | from Bio.SeqRecord import SeqRecord
  8 | 
  9 | helptext = '''This script takes two files, containing phased haplotype sequences for one
 10 | or more genes and edits the sequences to retain only variable sites in the largest phase
 11 | block. The two haplotype sequences can be generated by bcftools, for example.
 12 | 
 13 | There are three options for editing the sequences:
 14 | 
 15 | delete: retain only the longest phase block, delete the rest of the sequence
 16 | ref: use reference sequences to fill the rest of the sequence outside the longest block
 17 | mask: fill sequence not in the longest phase block with N
 18 | 
 19 | haplonerate reads the phase blocks from a GTF file, such as the one produced by the stats 
 20 | function in whatshap
 21 | 
 22 | '''
 23 | 
 24 | def get_gtf_dict(gtf_fn):
 25 |     gtf_dict = {}
 26 |     for line in open(gtf_fn):
 27 |         line = line.split()
 28 |         geneName = line[0]#.split("-")[-1]
 29 |         phase_range = (int(line[3]),int(line[4]))
 30 |         try:
 31 |             gtf_dict[geneName].append(phase_range)
 32 |         except KeyError:
 33 |             gtf_dict[geneName] = [phase_range]
 34 |     return gtf_dict
 35 | 
 36 | 
 37 | 
 38 | #prefix = sys.argv[1]
 39 | 
 40 | #Use the bcftools method to extract haplotypes from each gene
 41 | 
 42 | #seqdirectory = "/home/mjohnson/Projects/artocarpus/alleles_paper/iupac_sequences/{}".format(prefix)
 43 | #geneList = set([x.rstrip() for x in open("/home/mjohnson/Projects/artocarpus/alleles_paper/newtargets_genelist.txt")])
 44 | #os.chdir(prefix)
 45 | 
 46 | #Read in the ambiguity coded sequences into a dictionary
 47 | 
 48 | #iupac_dict = SeqIO.to_dict(SeqIO.parse("{}/{}.supercontigs.iupac.fasta".format(seqdirectory,prefix),'fasta'))
 49 | 
 50 | 
 51 | 
 52 | #Use the GTF from Whatshap to determine the longest phase block for the sequence.
 53 | 
 54 | 
 55 | def getLargestPhaseBlock(ranges,seqLength):
 56 |     '''Given the phase blocks for a sequence, return the most inclusive range'''
 57 |     longestblock = 0
 58 |     for r in range(len(ranges)):
 59 |         if r == 0:
 60 |             start = 1
 61 |             if len(ranges) > 1:
 62 |                 end = ranges[r+1][0] - 1
 63 |             else:    
 64 |                 end = seqLength
 65 |             
 66 |         elif r == len(ranges) - 1 :
 67 |             start = ranges[r-1][1] + 1
 68 |             end = seqLength
 69 |             
 70 |         else:
 71 |             start = ranges[r-1][1] + 1
 72 |             end = ranges[r+1][0] - 1
 73 |             
 74 |             
 75 |         if end - start > longestblock:
 76 |             most_inclusive_range = (start,end)
 77 |             longestblock = end - start
 78 |     #print seqLength,ranges,most_inclusive_range    
 79 |     return most_inclusive_range
 80 | 
 81 | def insertPhase(iupacSeq,haploSeq,phaseBlock,newSeqID):
 82 |     '''Given an IUPAC sequence, the phased haplotype sequence, and the longest phaseBlock,
 83 |     Return one sequence with phased characters in the block, IUPAC sequences outside it'''
 84 |     
 85 |     newSeq = ''
 86 |     for c in range(len(iupacSeq.seq)):
 87 |         if  phaseBlock[0] -1 <= c <= phaseBlock[1] - 1:
 88 |             newSeq += haploSeq.seq[c]
 89 |         else:
 90 |             newSeq += iupacSeq.seq[c]
 91 |     return SeqRecord(Seq(newSeq),id=newSeqID,description='')
 92 |     
 93 | 
 94 | def replace_with_ref(seq1,seq2,ref,phaseBlock):
 95 |     if seq1.seq == seq2.seq:
 96 |         return [SeqRecord(ref.seq,id=ref.id,description='')]
 97 |     else:
 98 |         haplo1 = insertPhase(ref,seq1,phaseBlock,"{}_h1".format(seq1.id))
 99 |         haplo2 = insertPhase(ref,seq2,phaseBlock,"{}_h2".format(seq2.id))
100 |         return [haplo1,haplo2]
101 | 
def delete_extra(seq1,seq2,ref,phaseBlock):
    '''Return haplotype records trimmed to phaseBlock.

    When both haplotypes are identical a single record holding the full
    reference sequence (id taken from ref) is returned. Otherwise two
    records, "<id>_h1" and "<id>_h2", are returned that contain only the
    part of each haplotype inside phaseBlock.

    phaseBlock is (start, end), counted from 1 with both ends included --
    the same convention insertPhase() applies.
    '''
    if seq1.seq == seq2.seq:
        return [SeqRecord(ref.seq,id=ref.id,description='')]

    # 1-based inclusive -> 0-based half-open slice. Slicing from
    # phaseBlock[0] directly dropped the first base of the block.
    start = max(phaseBlock[0] - 1, 0)
    end = phaseBlock[1]
    haplo1 = SeqRecord(seq1.seq[start:end],id="{}_h1".format(seq1.id),description='')
    haplo2 = SeqRecord(seq2.seq[start:end],id="{}_h2".format(seq2.id),description='')
    return [haplo1,haplo2]
109 | 
def main():
    '''Command-line entry point.

    For every gene in the first haplotype file that also has a reference
    sequence:

    * without phase blocks in the GTF, the reference sequence is written;
    * with phase blocks, the largest block is determined and the two
      haplotypes are edited according to --edit.

    All records go to --output (stdout by default). With --block, one
    tab-separated line per phased gene is written: gene, number of phase
    blocks, reference length, block start, block end.
    '''
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("gtf",help="gtf file annotating the positions of phase blocks for each gene")
    parser.add_argument("haplotype_files",help="Two FASTA files containing sequences for one or more genes",nargs="+")
    parser.add_argument("--reference","-r",help="FASTA file of reference sequences, required with --edit ref")
    parser.add_argument("--edit",help="How to deal with sites outside longest phase block. Default: ref",default="ref",choices=["ref","delete","mask"])
    parser.add_argument("--output",'-o',help="Output FASTA containing haplotype sequences for each gene. default = stdout",default=sys.stdout)
    parser.add_argument("--block",help="file to write phase block information") 
    args = parser.parse_args()

    if len(args.haplotype_files) != 2:
        print("Please supply exactly two haplotype FASTA files!\n")
        sys.exit(1)

    # The reference supplies the sequence lengths and the output for unphased
    # genes, so every edit mode needs it.
    if args.reference is None:
        parser.error("a reference FASTA file is required (--reference)")

    # "mask" is accepted on the command line but has no implementation;
    # refuse it instead of silently leaving phased genes out of the output.
    if args.edit == "mask":
        parser.error("--edit mask is not implemented; use 'ref' or 'delete'")

    gtf_dict = get_gtf_dict(args.gtf)
    ref_dict = SeqIO.to_dict(SeqIO.parse(args.reference,'fasta'))
    haplotype1_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[0],'fasta'))
    haplotype2_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[1],'fasta'))

    seqs_to_write = []
    phase_report = []
    # Iterate the dictionary itself (not a set of its keys) so the output
    # order is reproducible between runs.
    for gene in haplotype1_dict:
        if gene not in ref_dict:
            continue
        if gene not in gtf_dict:
            # No phase information: fall back to the reference sequence.
            seqs_to_write.append(SeqRecord(ref_dict[gene].seq,id=gene,description=''))
            continue

        phaseBlock = getLargestPhaseBlock(gtf_dict[gene], len(ref_dict[gene]))
        phase_report.append("{}\t{}\t{}\t{}\t{}".format(gene,len(gtf_dict[gene]),len(ref_dict[gene]),phaseBlock[0],phaseBlock[1]))
        if args.edit == 'ref':
            seqs_to_write += replace_with_ref(haplotype1_dict[gene],haplotype2_dict[gene],ref_dict[gene],phaseBlock)
        elif args.edit == "delete":
            seqs_to_write += delete_extra(haplotype1_dict[gene],haplotype2_dict[gene],ref_dict[gene],phaseBlock)

    SeqIO.write(seqs_to_write,args.output,'fasta')
    if args.block:
        with open(args.block,'w') as outfile:
            outfile.write("\n".join(phase_report))
150 |             
151 | 
152 | if __name__ == "__main__":main()
153 |             
154 | 
155 | 
156 | 
157 | 
158 | #For sites in the longest block, replace sequences in iupac sequence with phased sequence.
159 | 
160 | #Use the intron/exon extractor as before
161 | 
162 | 


--------------------------------------------------------------------------------
/haplonerate/haplonerate3N.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | import sys,os,argparse
  5 | from Bio import SeqIO
  6 | from Bio.Seq import Seq
  7 | from Bio.SeqRecord import SeqRecord
  8 | 
  9 | helptext = '''This script takes two files, containing phased haplotype sequences for one
 10 | or more genes and edits the sequences to retain only variable sites in the largest phase
 11 | block. The two haplotype sequences can be generated by bcftools, for example.
 12 | 
 13 | There are three options for editing the sequences:
 14 | 
 15 | delete: retain only the longest phase block, delete the rest of the sequence
 16 | ref: use reference sequences to fill the rest of the sequence outside the longest block
 17 | mask: fill sequence not in the longest phase block with N
 18 | 
 19 | haplonerate reads the phase blocks from a GTF file, such as the one produced by the stats 
 20 | function in whatshap
 21 | 
 22 | '''
 23 | 
 24 | def get_gtf_dict(gtf_fn):
 25 |     gtf_dict = {}
 26 |     for line in open(gtf_fn):
 27 |         line = line.split()
 28 |         geneName = line[0]#.split("-")[-1]
 29 |         phase_range = (int(line[3]),int(line[4]))
 30 |         try:
 31 |             gtf_dict[geneName].append(phase_range)
 32 |         except KeyError:
 33 |             gtf_dict[geneName] = [phase_range]
 34 |     return gtf_dict
 35 | 
 36 | 
 37 | 
 38 | #prefix = sys.argv[1]
 39 | 
 40 | #Use the bcftools method to extract haplotypes from each gene
 41 | 
 42 | #seqdirectory = "/home/mjohnson/Projects/artocarpus/alleles_paper/iupac_sequences/{}".format(prefix)
 43 | #geneList = set([x.rstrip() for x in open("/home/mjohnson/Projects/artocarpus/alleles_paper/newtargets_genelist.txt")])
 44 | #os.chdir(prefix)
 45 | 
 46 | #Read in the ambiguity coded sequences into a dictionary
 47 | 
 48 | #iupac_dict = SeqIO.to_dict(SeqIO.parse("{}/{}.supercontigs.iupac.fasta".format(seqdirectory,prefix),'fasta'))
 49 | 
 50 | 
 51 | 
 52 | #Use the GTF from Whatshap to determine the longest phase block for the sequence.
 53 | 
 54 | 
 55 | def getLargestPhaseBlock(ranges,seqLength):
 56 |     '''Given the phase blocks for a sequence, return the most inclusive range'''
 57 |     longestblock = 0
 58 |     for r in range(len(ranges)):
 59 |         if r == 0:
 60 |             start = 1
 61 |             if len(ranges) > 1:
 62 |                 end = ranges[r+1][0] - 1
 63 |             else:    
 64 |                 end = seqLength
 65 |             
 66 |         elif r == len(ranges) - 1 :
 67 |             start = ranges[r-1][1] + 1
 68 |             end = seqLength
 69 |             
 70 |         else:
 71 |             start = ranges[r-1][1] + 1
 72 |             end = ranges[r+1][0] - 1
 73 |             
 74 |             
 75 |         if end - start > longestblock:
 76 |             most_inclusive_range = (start,end)
 77 |             longestblock = end - start
 78 |     #print seqLength,ranges,most_inclusive_range    
 79 |     return most_inclusive_range
 80 | 
 81 | def insertPhase(iupacSeq,haploSeq,phaseBlock,newSeqID):
 82 |     '''Given an IUPAC sequence, the phased haplotype sequence, and the longest phaseBlock,
 83 |     Return one sequence with phased characters in the block, IUPAC sequences outside it'''
 84 |     
 85 |     newSeq = ''
 86 |     for c in range(len(iupacSeq.seq)):
 87 |         if  phaseBlock[0] -1 <= c <= phaseBlock[1] - 1:
 88 |             newSeq += haploSeq.seq[c]
 89 |         else:
 90 |             newSeq += iupacSeq.seq[c]
 91 |     return SeqRecord(Seq(newSeq),id=newSeqID,description='')
 92 |     
 93 | 
 94 | def replace_with_ref(seq1,seq2,ref,phaseBlock):
 95 |     if seq1.seq == seq2.seq:
 96 |         return [SeqRecord(ref.seq,id=ref.id,description='')]
 97 |     else:
 98 |         haplo1 = insertPhase(ref,seq1,phaseBlock,"{}_h1".format(seq1.id))
 99 |         haplo2 = insertPhase(ref,seq2,phaseBlock,"{}_h2".format(seq2.id))
100 |         return [haplo1,haplo2]
101 | 
def delete_extra(seq1,seq2,seq3,ref,phaseBlock):
    '''Trim three haplotype records down to the phase block.

    phaseBlock is the (start, end) pair produced by getLargestPhaseBlock,
    which insertPhase treats as 1-based with both ends inclusive. The slice
    therefore starts at start - 1 so the first base of the block is kept
    (slicing from start dropped it).

    seq1, seq2, seq3 -- haplotype SeqRecords for one gene.
    ref              -- accepted for signature compatibility; not used.

    Returns a list of three SeqRecords with ids "<id>_h1", "<id>_h2" and
    "<id>_h3", each id derived from its own input record.
    '''
    first = phaseBlock[0] - 1
    last = phaseBlock[1]
    trimmed = []
    for hap, suffix in ((seq1, "h1"), (seq2, "h2"), (seq3, "h3")):
        trimmed.append(
            SeqRecord(hap.seq[first:last],id="{}_{}".format(hap.id, suffix),description='')
        )
    return trimmed
111 | 
def main():
    '''Command-line entry point.

    Reads the phase-block GTF, the reference FASTA and three haplotype FASTA
    files, edits each gene according to --edit, writes the resulting records
    as FASTA to --output and, if --block is given, writes a tab-separated
    report (gene, number of blocks, reference length, block start, block end).
    '''
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("gtf",help="gtf file annotating the positions of phase blocks for each gene")
    parser.add_argument("haplotype_files",help="Three FASTA files containing sequences for one or more genes",nargs="+")
    parser.add_argument("--reference","-r",help="FASTA file of reference sequences, required with --edit ref")
    parser.add_argument("--edit",help="How to deal with sites outside longest phase block. Default: ref",default="ref",choices=["ref","delete","mask"])
    parser.add_argument("--output",'-o',help="Output FASTA containing haplotype sequences for each gene. default = stdout",default=sys.stdout)
    parser.add_argument("--block",help="file to write phase block information") 
    args = parser.parse_args()
    
    if len(args.haplotype_files) != 3:
        print("Please supply exactly three haplotype FASTA files!\n")
        sys.exit(1)

    # The reference is read for every edit mode (its length bounds the phase
    # blocks), so fail with a clear message instead of crashing in SeqIO.
    if args.reference is None:
        parser.error("--reference is required (reference lengths are needed to compute phase blocks)")

    if args.edit == "mask":
        # No branch below handles "mask": genes that have phase blocks are
        # left out of the output. Say so rather than dropping them silently.
        sys.stderr.write("WARNING: --edit mask is not implemented in this script; "
                         "genes with phase blocks will be omitted from the output.\n")
    
    gtf_dict = get_gtf_dict(args.gtf)
    ref_dict = SeqIO.to_dict(SeqIO.parse(args.reference,'fasta'))
    
    haplotype1_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[0],'fasta'))
    haplotype2_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[1],'fasta'))
    haplotype3_dict = SeqIO.to_dict(SeqIO.parse(args.haplotype_files[2],'fasta'))
    seqs_to_write = []
    phase_report = []
    # Iterate the dict directly (first-file order) so the output order is
    # reproducible; iterating a set of the keys gave an arbitrary order.
    for gene in haplotype1_dict:
        # Genes absent from the reference are skipped in every case.
        if gene not in ref_dict:
            continue
        if gene in gtf_dict:
            phaseBlock = getLargestPhaseBlock(gtf_dict[gene], len(ref_dict[gene]))
            phase_report.append("{}\t{}\t{}\t{}\t{}".format(gene,len(gtf_dict[gene]),len(ref_dict[gene]),phaseBlock[0],phaseBlock[1]))
            if args.edit == 'ref':
                # NOTE(review): only the first two haplotypes are used in
                # "ref" mode; the third haplotype file is not written.
                seqs_to_write += replace_with_ref(haplotype1_dict[gene],haplotype2_dict[gene],ref_dict[gene],phaseBlock)
            elif args.edit == "delete":
                seqs_to_write += delete_extra(haplotype1_dict[gene],haplotype2_dict[gene],haplotype3_dict[gene],ref_dict[gene],phaseBlock)
        else:
            # No phase blocks for this gene: emit the reference unchanged.
            seqs_to_write += [SeqRecord(ref_dict[gene].seq,id=gene,description='')]
    SeqIO.write(seqs_to_write,args.output,'fasta')
    if args.block:
        with open(args.block,'w') as outfile:
            outfile.write("\n".join(phase_report))
153 |             
154 | 
155 | if __name__ == "__main__":main()
156 |             
157 | 
158 | 
159 | 
160 | 
161 | #For sites in the longest block, replace sequences in iupac sequence with phased sequence.
162 | 
163 | #Use the intron/exon extractor as before
164 | 
165 | 


--------------------------------------------------------------------------------
/haplonerate/img/AJB_Figure_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/haplonerate/img/AJB_Figure_1.pdf


--------------------------------------------------------------------------------
/homologizer/convert_to_nexus.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Script to convert Physcomitrium/Entosthodon phased FASTA gene files into NEXUS format
 3 | # Only samples in the list will be retained
 4 | # The gene name will also be stripped when writing out
 5 | import sys
 6 | from Bio import AlignIO,SeqIO
 7 | from Bio.Alphabet import generic_dna
 8 | from Bio.Nexus import Nexus
 9 | from io import StringIO
10 | 
11 | 
12 | 
# Phased samples (ids containing "_h") that are retained in the output.
samples_to_keep = ["Physcomitrium-immersum-3176_h1",
"Physcomitrium-immersum-3176_h2",
"Entosthodon-hungaricus-3838_h1",
"Entosthodon-hungaricus-3838_h2",
"Physcomitrium-pyriforme-3410_h1",
"Physcomitrium-pyriforme-3410_h2"]

alignment_fn = sys.argv[1]
# The gene ID is everything before the first "." of the argument.
# NOTE(review): a path containing directories or extra dots ends up in geneID
# as-is; confirm the script is only run on bare file names.
geneID = alignment_fn.split(".")[0]

reduced_alignment_fn = "{}.revbayes.nexus".format(geneID)

# NOTE(review): Bio.Alphabet / the alphabet= argument were removed in
# Biopython 1.78; this script presumably targets an older release.
seqs_to_write = []
for seq in SeqIO.parse(alignment_fn,'fasta', alphabet=generic_dna):
    # Strip the "-<geneID>" suffix from the sequence id.
    seq.id = seq.id.replace("-{}".format(geneID),"")

    # Ids without "_h" are always kept; ids with "_h" only when listed above.
    if "_h" not in seq.id or seq.id in samples_to_keep:
        seqs_to_write.append(seq)

# The output file is written once, by write_nexus_data below. The earlier
# open(reduced_alignment_fn, 'w') only truncated the file and was never used.
output = StringIO()
SeqIO.write(seqs_to_write, output, 'nexus')
p = Nexus.Nexus()
p.read(output.getvalue())
p.write_nexus_data(reduced_alignment_fn, interleave=False)
42 | 
43 | #AlignIO.convert(reduced_alignment_fn,'fasta',reduced_alignment_fn.replace("fasta","nexus") ,'nexus',generic_dna,interleave=False)
44 | 
45 | 


--------------------------------------------------------------------------------
/homologizer/label_swap.txt:
--------------------------------------------------------------------------------
1 | Entosthodon-hungaricus-3838_h1	Entosthodon-hungaricus-3838_h2
2 | Physcomitrium-immersum-3176_h1	Physcomitrium-immersum-3176_h2
3 | Physcomitrium-pyriforme-3410_h1	Physcomitrium-pyriforme-3410_h2


--------------------------------------------------------------------------------
/homologizer/readme.md:
--------------------------------------------------------------------------------
 1 | # Homologizer
 2 | 
 3 | > "It's too late to homologize..." -Not One Republic
 4 | 
It's not too late! Using the `setHomeologPhase` function in [RevBayes](https://revbayes.github.io), gene alignments with pairs of sequences can be adjusted so that labels across genes are homologized. Once adjusted, the data from many genes can be analyzed together, for example using a concatenated supermatrix or summary species tree analysis. 
 6 | 
 7 | ### Step 0: Input files
 8 | 
 9 | Gene alignments that include samples that have two homeolog sequences. 
10 | The sequences within each gene should be phased. 
11 | For target capture data, see the `alleles_workflow` and `haplonerate` methods in this same [Phyloscripts](https://github.com/mossmatters/phyloscripts) repository for tips on generating phased haplotypes within a gene sequence. The gene alignments can be in FASTA or NEXUS format, but must end with a regular suffix. For example, all fasta files must end with `.fasta` or `.fa`. Place all of the alignment files in a directory. **Do not place any other files in that directory**.
12 | 
13 | Users must also prepare a text file containing the naming scheme for sequences, separated by commas. For example:
14 | 
	Entosthodon-hungaricus-3838_h1,Entosthodon-hungaricus-3838_h2
16 | 	Physcomitrium-immersum-3176_h1,Physcomitrium-immersum-3176_h2
17 | 	Physcomitrium-pyriforme-3410_h1,Physcomitrium-pyriforme-3410_h2
18 | 
19 | These labels will be used during the RevBayes script to swap labels for the polyploid individuals.
20 | 
21 | **In this version, only two sequences per individual are supported**
22 | 
23 | This repository also contains `revbayes_template.txt` which is adapted from Will Freyman's [repository](https://github.com/wf8/homeolog_phasing). This file contains all the options for running RevBayes MCMC using the combined alignment. Adjust the parameters as needed (for example, number of generations or substitution model).
24 | 
25 | ### Step 1: Making RevBayes scripts
26 | 
Given a set of gene alignments, we will need to prepare RevBayes files that have the appropriate alleles to switch. The paired labels are taken from the label swap file and are added to a basic template for RevBayes (taken from Will Freyman's version).
28 | 
29 | **For now, only genes with no missing labels are accepted**
30 | 
31 | For ease of analysis, all the genes are split up into chunks of genes (by default, max 25 genes). This means that for 250 loci, there will be 10 RevBayes scripts generated.
32 | 
Sequences are converted to NEXUS for RevBayes.
34 | 
35 | ### Step 2: Run RevBayes on sets of genes
36 |    
37 | Issues:
38 | 
39 | - MPIrun version of RevBayes crashes, probably a memory error
40 | - Single-thread version of RevBayes grabs all RAM on the machine (256 GB on one machine!)
41 | 
42 | ### Step 3: Summarize RevBayes output
43 | 
44 | 
The script `swap_labels.py` reads the RevBayes log files and calculates the posterior probability of label swapping. A threshold can be picked (set to 95% by default); if the swapping PP is below this, both sequences are deleted from that sample. The output alignments are written to a new directory named after the input alignment directory with `_swapped` appended.
46 | 
47 | # TODO:
48 | 
49 | - Get RevBayes running more efficiently
- Accommodate missing labels
51 | - How to summarize across different chunks of genes. Each chunk will be homologized, but no guarantee of this across chunks.


--------------------------------------------------------------------------------
/homologizer/revbayes_template.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | num_loci = alignments.size()
 3 | num_swap = label_swap.size()
 4 | for (i in 1:num_loci) {
 5 |     data[i] = readDiscreteCharacterData(alignments[i][1])
 6 | }
 7 | 
 8 | 
 9 | 
10 | # add missing taxa
11 | for (i in 1:num_loci) {
12 |     for (j in 1:num_loci) {
13 |         data[i].addMissingTaxa(data[j].taxa())
14 |     }
15 | }
16 | num_tips = data[1].ntaxa()
17 | 
# set initial phase
# For every locus and every label pair read from label_swap, call
# setHomeologPhase on that locus' data matrix.
# NOTE(review): both arguments are label_swap[j][1], so only the first label of
# each pair is passed; label_swap[j][2] is never given to setHomeologPhase here.
# Confirm against the RevBayes setHomeologPhase documentation that this is intended.
for (i in 1:num_loci) {
    for (j in 1:num_swap){
        data[i].setHomeologPhase(label_swap[j][1],label_swap[j][1])
    }    
}
24 | 
25 | mvi = 0
26 | 
27 | n_branches = 2 * num_tips - 3
28 | for (i in 1:n_branches) {
29 |     branch_lengths[i] ~ dnExponential(10)
30 |     moves[++mvi] = mvScale(branch_lengths[i], lambda=1.0, weight=2)
31 | }
32 | topology ~ dnUniformTopology(data[1].taxa())
33 | moves[++mvi] = mvNNI(topology, weight=20.0)
34 | moves[++mvi] = mvSPR(topology, weight=20.0)
35 | tree := treeAssembly(topology, branch_lengths)
36 | 
37 | for (i in 1:num_loci) {
38 |     
39 |     # gtr for each locus
40 |     er_prior <- v(1,1,1,1,1,1)
41 |     er[i] ~ dnDirichlet(er_prior)
42 |     moves[++mvi] = mvSimplexElementScale(er[i], weight=3)
43 | 
44 |     pi_prior <- v(1,1,1,1)
45 |     pi[i] ~ dnDirichlet(pi_prior)
46 |     moves[++mvi] = mvSimplexElementScale(pi[i], weight=3)
47 | 
48 |     Q[i] := fnGTR(er[i], pi[i])
49 | 
50 |     ctmc[i] ~ dnPhyloCTMC(tree=tree, Q=Q[i], type="DNA")
51 |     ctmc[i].clamp(data[i])
52 | }
53 | 
54 | w = 1/8
55 | for (i in 1:num_loci) {
56 |     # switch phasing proposals
57 |     for (j in 1:num_swap){
58 |     moves[++mvi] = mvHomeologPhase(ctmc[i],label_swap[j][1],label_swap[j][2],weight=w)
59 |     }
60 | }
61 | 
62 | mymodel = model(Q)
63 | 
64 | monitors[1] = mnModel(filename=output_file + ".log", printgen=10)
65 | monitors[2] = mnFile(filename=output_file + ".trees", printgen=10, tree)
66 | monitors[3] = mnScreen(printgen=10)
67 | for (i in 1:num_loci){
68 |     monitors[i+3] = mnHomeologPhase(filename=output_file + "_" + alignments[i][1] + "_phase.log", printgen=10, ctmc[i])
69 | }
70 | mymcmc = mcmc(mymodel, monitors, moves)
71 | mymcmc.run(generations=30000)
72 |     
73 | treetrace = readTreeTrace(output_file + ".trees", treetype="clock", burnin=0.25)
74 | map_tree = mapTree(treetrace, output_file + ".tree")
75 | 
76 | mymcmc.operatorSummary()
77 | 


--------------------------------------------------------------------------------
/homologizer/revscript_maker.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # This script will generate the RevBayes scripts necessary to run Homologizer on sets of genes.
  4 | # 
  5 | # Matt Johnson, Texas Tech University
  6 | 
  7 | 
  8 | ######### TO DO / CONSIDERATIONS
  9 | #
 10 | # RevBayes can read the label-swap from a txt file
 11 | 
 12 | # What will RevBayes do when a taxon that has homeologs is missing?
 13 | # There's code to fill in missing taxa but I suppose it will fill in both alleles
 14 | # This SHOULD result in a 50/50 probability for that gene
 15 | # Will be accommodated by the script that does the label switching, need to remember that
 16 | #    maybe the taxon isn't present
 17 | # I'm also not sure how many genes will only have one homeolog (and therefore no _h1 or _h2)
 18 | # This way, should be able to use one taxon-swap file for all genes
 19 | # First version: just use genes with complete sampling.
 20 | 
 21 | 
 22 | # Also need to get it to read the correct alignments
 23 | # Is it possible to pass command line arguments to revbayes scripts?
 24 | #    Guessing not, so Python can write a header for the script call the appropriate
 25 | #    
 26 | # When doing subsets, how do we know that the phase is the same from subset to subset?
 27 | #   Could do one final analysis with one gene picked from each of the subsets
 28 | 
 29 | import argparse,pathlib,os
 30 | from Bio import AlignIO,SeqIO
 31 | from Bio.Alphabet import generic_dna,generic_protein
 32 | from Bio.Nexus import Nexus
 33 | from io import StringIO
 34 | 
 35 | 
 36 | helptext = '''
 37 | ###### REQUIREMENTS
 38 | #
 39 | #   Python > 3.5
 40 | #   Biopython
 41 | 
 42 | ###### INPUTS
 43 | #
 44 | #   Aligned sequence files in a format readable by BioPython
 45 | #   Max number of genes to include per RevBayes run
 46 | 
 47 | ##### OUTPUTS
 48 | #   For each run of RevBayes:
 49 | #       a) a text file containing the locations of the gene alignments in the subset
 50 | #       b) a RevBayes script to run a subset of genes
 51 | #   A text file containing the names of the RevBayes scripts (could be used for SGE array jobs)
 52 | '''
 53 | 
 54 | def convert_to_nexus(alignment_fn,genedir,input_type="fasta",seqtype = 'dna'):
 55 |     if seqtype == 'dna':
 56 |         seqs_to_write = [seq for seq in SeqIO.parse(os.path.join(genedir,alignment_fn),input_type, alphabet=generic_dna)]
 57 |     else:
 58 |         seqs_to_write = [seq for seq in SeqIO.parse(os.path.join(genedir,alignment_fn),input_type, alphabet=generic_protein)]
 59 |     new_fn = ".".join(alignment_fn.split(".")[:-1]) + ".nexus"    
 60 |     output = StringIO()
 61 |     SeqIO.write(seqs_to_write, output, 'nexus')
 62 |     p = Nexus.Nexus()
 63 |     p.read(output.getvalue())
 64 |     p.write_nexus_data(os.path.join("nexusfiles",new_fn), interleave=False)
 65 | 
 66 | 
 67 | 
 68 | parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
 69 | parser.add_argument("genedir",help="directory containing all of the gene alignments. No other files should be in this directory.")
 70 | parser.add_argument("swaplist",help="file containing labels to swap, two per sample, separated by a comma")
 71 | parser.add_argument("--numgenes","-n",help="number of genes to include in each RevBayes job",type=int,default=25)
 72 | parser.add_argument("--alignfiletype","-a",help="Alignment file type. Must be one used by BioPython",default="fasta")
 73 | parser.add_argument("revbayestemplate",help="Template for RevBayes. This script will prepend with alignment and swap info")
 74 | args = parser.parse_args()
 75 | 
 76 | 
 77 | 
 78 | alignments = os.listdir(args.genedir)
 79 | 
 80 | ######## CONVERT TO NEXUS
 81 | 
 82 | pathlib.Path("nexusfiles").mkdir(parents=True, exist_ok=True)
 83 | 
 84 | if args.alignfiletype != "nexus":
 85 |     for fn in alignments:
 86 |         convert_to_nexus(fn,args.genedir)
 87 |     
 88 | split_gene_list = [alignments[x:x+args.numgenes] for x in range(0, len(alignments), args.numgenes)]
 89 | print("Will generate {} RevBayes scripts with a max of {} genes".format(len(split_gene_list),args.numgenes))
 90 | 
 91 | ######## WRITE REVBAYES SCRIPTS
 92 | revbayes_header = '''
 93 | ####ADD THESE FROM PYTHON SCRIPT
 94 | output_file = "alleles.{}"
 95 | label_swap = readTable("{}",delimiter=",")
 96 | alignments = readTable("{}")
 97 | #######
 98 | '''
 99 | 
100 | pathlib.Path("revbayes_scripts").mkdir(parents=True, exist_ok=True)
101 | revbayes_text = open(args.revbayestemplate).read()
102 | 
103 | for gs in range(len(split_gene_list)):
104 |     genelist_file = "revbayes_scripts/genelist.{}.txt".format(gs)
105 |     with open(genelist_file,'w') as outfile:
106 |         for gene in split_gene_list[gs]:
107 |             outfile.write(args.genedir + "/" + gene+"\n")
108 |             
109 |     with open("revbayes_scripts/homeolog_phase.{}.Rev".format(gs),'w') as revbayes_out:
110 |         revbayes_out.write(revbayes_header.format(gs,args.swaplist,genelist_file))
111 |         revbayes_out.write(revbayes_text)
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 


--------------------------------------------------------------------------------
/homologizer/swap_labels.py:
--------------------------------------------------------------------------------
 1 | #Script to swap labels in alignments based on the results of RevBayes
 2 | 
 3 | 
 4 | helptext = '''
 5 | ###### Input
 6 | #
 7 | # List of genes (from first script)
 8 | # RevBayes output log files
 9 | 
10 | ###### Options
11 | #
12 | # Swap threshold- based on posterior distribution, below threshold sample is deleted (default 95%)
13 | # Burnin percentage (default 10%)
14 | # Output alignment file type (default FASTA)
15 | 
16 | ##### Output
17 | #
18 | # Directory containing alignments of specified type with the labels swapped.
19 | 
20 | '''
21 | 
22 | import argparse,pathlib,os
23 | import pandas as pd
24 | from Bio import SeqIO, AlignIO
25 | 
parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('genelist',help = "File containing list of gene alignments.")
parser.add_argument('initalAlignmentDir',help="Directory containing RevBayes logs for each alignment")
# type=float: without it, values given on the command line arrive as strings
# and the arithmetic/comparisons below raise TypeError.
parser.add_argument('--swappct','-s',help="Posterior probability to do swapping. Below this, both alleles from the sample are deleted.",type=float,default=95)
parser.add_argument('--burnin','-b',help="Percentage of RevBayes to discard as burnin",type=float,default=10)
parser.add_argument('--outfiletype','-f',help="Alignment file type to output",default='fasta')
parser.add_argument('--infiletype','-i',help="Alignment file type of the input alignments",default='fasta')

args = parser.parse_args()


###### READ IN GENELIST

with open(args.genelist) as genelist_file:
    # Ignore blank lines so they are not treated as gene paths.
    genelist = [x.rstrip() for x in genelist_file if x.strip()]
# The subset number is the second dot-separated field of the gene list name
# (e.g. "revbayes_scripts/genelist.3.txt" -> "3").
subset_num = args.genelist.split(".")[1]
outputdir = args.initalAlignmentDir+"_swapped"
pathlib.Path(outputdir).mkdir(parents=True, exist_ok=True)

for gene in genelist:
###### READ IN REVBAYES LOGS
    logfilepath = "alleles.{}_{}_phase.log".format(subset_num,gene)
    gene_logfile = pd.read_csv(logfilepath,header=0,index_col=0,sep="\t") 
    num_samples, num_columns = gene_logfile.shape
    # --burnin is a percentage: discard that share of the leading rows.
    # (rows / burnin only matched this for the default of 10.)
    burnin = int(num_samples * args.burnin / 100)
    gene_swap_dict = {}
    for h in range(num_columns):
        # Take the label from the column header; the name of a value_counts()
        # result differs between pandas versions.
        label = gene_logfile.columns[h]
        # Frequencies are relative to the post-burnin samples only. Dividing
        # by the total row count capped the PP at (100 - burnin)%.
        pp = gene_logfile.iloc[burnin:,h].value_counts(normalize=True) * 100
        # value_counts sorts by frequency, so position 0 is the most common.
        best_pp = pp.iloc[0]
        if best_pp <  args.swappct:
            #None will indicate the sequence should be skipped when re-writing
            gene_swap_dict[label] = None
            print("PP for {} in gene {} was {}".format(label,gene,best_pp))
        else:
            gene_swap_dict[label] = pp.index[0]

##### READ IN ALIGNMENT AND SWAP LABELS
    with open(os.path.join(outputdir,os.path.split(gene)[1]),'w') as outfile:
        for seq in SeqIO.parse(gene,args.infiletype):
            if seq.id in gene_swap_dict:
                # Labels mapped to None fell below the threshold: drop them.
                if gene_swap_dict[seq.id]:
                    seq.id = gene_swap_dict[seq.id]
                    SeqIO.write(seq,outfile,args.outfiletype)
            else:
                SeqIO.write(seq,outfile,args.outfiletype)
67 | 
68 | ##### READ IN ALIGNMENT AND SWAP LABELS
69 | 
70 | 


--------------------------------------------------------------------------------
/minorityreport/README.md:
--------------------------------------------------------------------------------
 1 | # Minority Report
 2 | 
 3 | This script will summarize the concordant and conflicting bipartitions found in a Phyparts analysis by plotting them on the species tree. 
 4 | 
 5 | One specific node must be specified. You can get the numerical identifier for the target node by running `phypartspiecharts.py` with the `--show_nodes` flag.
 6 | 
 7 | ## Dependencies
 8 | 
[ETE3](http://etetoolkit.org) with Python > 2.7
10 | 
11 | [Linux convert](https://www.imagemagick.org/script/convert.php) (part of ImageMagick)
12 | 
13 | ## Usage
14 | 
Run this from the same directory as the Phyparts output.
16 | 
17 | ```
18 | python minority_report.py species.tre phyparts_root 31 3
19 | ```
20 | 
21 | This command will display the concordant and discordant bipartitions on `species.tre` using the `phyparts_root.alts`  and `phyparts_root.hist` files for node number 31. Only bipartitions occurring in at least 3 gene trees will be displayed.
22 | 
23 | ## Output
24 | 
25 | A PDF will be generated in the current directory using the Linux tool `convert`. The first page of the PDF is the species phylogeny with the selected bipartition highlighted in blue text. In this example, a clade is highlighted on the species tree. The number of concordant gene trees (25, in this example) is indicated at the top:
26 | 
27 | ![](img/concordant.png)
28 | 
29 | All subsequent pages show the *same species tree topology* but alternative, conflicting bipartitions are highlighted. For example, this image shows an alternative bipartition for the same node as above:
30 | 
31 | ![](img/conflict1.png)
32 | 
33 | 
34 | Species GW1701 and NZ609 are found in a bipartition with the top clade in 20 gene trees (almost as many as the concordant bipartition!).
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/minorityreport/img/concordant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/minorityreport/img/concordant.png


--------------------------------------------------------------------------------
/minorityreport/img/conflict1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/minorityreport/img/conflict1.png


--------------------------------------------------------------------------------
/minorityreport/minority_report.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os,sys,argparse,subprocess
  3 | from ete3 import Tree,TreeStyle,TextFace,NodeStyle
  4 | 
  5 | helptext = '''This script will print tree figures representing the minority bipartitions
  6 | found by PhyParts. Given the species tree used for PhyParts, the PhyParts output file 'root'
  7 | and the node of interest, one PNG file will be generated per minority bipartition, and 
  8 | the decendants from the bipartition will be color-coded on the species tree.
  9 | '''
 10 | 
 11 | #color_it = ["Entosthodon-hungaricus-3177","Entosthodon-attenuatus-3479","Physcomitrium-spathulatum-3549","Physcomitrium-pyriforme-3727","Physcomitrium-pyriforme-3728","Physcomitrium-immersum-3176","Physcomitrium-pyriforme-3118","Physcomitrella-magdalenae-3844","Physcomitrium-hookeri-3412","Physcomitrium-sp-3842","Entosthodon-attenuatus-3835","Physcomitrium-hookeri-3409","Physcomitrium-pyriforme-3798","Physcomitrium-sp-3115","Physcomitrium-pyriforme-3387","Entosthodon-obtusus-3347","Physcomitrium-pyriforme-3404","Aphanorrhegma-serratum-3305","Physcomitridium-readeri-3892","Entosthodon-americanus-3894","Entosthodon-lindigii-3546","Physcomitrium-sp-3508","Physcomitrium-sp-3672","Entosthodon-attenuatus-3543","Physcomitrium-pyriforme-3555","Physcomitrella-patens-3403","Physcomitrium-collenchymatum-3480","Entosthodon-obtusus-3395","Physcomitrium-eurystomum-3841","Physcomitrium-sp-3551","Physcomitrium-subsphaericum-3556","Physcomitrium-collenchymatum-3178","Physcomitrium-sp-3496","Physcomitrella-patens-3139","Physcomitrium-japonicum-3413","Physcomitrium-japonicum-3411","Physcomitrium-pyriforme-3787","Physcomitrium-pyriforme-3886","Entosthodon-sp-3837","Entosthodon-subintegrus-3840","Physcomitrium-eurystomum-3392","Physcomitrium-sp-3539","Physcomitrium-pyriforme-3883","Physcomitrium-sp-3816","Entosthodon-bergianus-3509","Physcomitrium-sp-3817","Physcomitrium-sp-3814"]
 12 | 
 13 | def get_alternative_bipartitions(node,phyparts_root,min_alt):
 14 |     alt_biparts = []
 15 |     alt_counts = []
 16 |     for line in open(phyparts_root + ".hist.alts"):
 17 |         line = line.split()
 18 |         nodenum = int(line[2])
 19 |         if nodenum == node:
 20 |             bipart2 =  line[4].rstrip().split(",")
 21 |             bipart1 = line[3].split(":")[1].split(",")
 22 |             numalt = int(line[3].split(":")[0].replace(")","").replace("(",""))
 23 |             if numalt >= min_alt:
 24 |                 alt_biparts.append((bipart1,bipart2))
 25 |                 alt_counts.append(numalt)
 26 |     return alt_biparts,alt_counts
 27 |     
 28 | def render_tree(species_tree,bipart1,num_alt,png_fn,replace_taxon=None):
 29 |     color1 = "blue"
 30 |     color2 = "black"
 31 |     ts=TreeStyle()
 32 |     ts.show_leaf_name=False
 33 |     ts.show_scale=False
 34 |     nstyle = NodeStyle()
 35 |     nstyle["size"] = 0
 36 | 
 37 |     ts.title.add_face(TextFace("{} bipartition in {} gene trees".format(png_fn,num_alt),fsize=15,bold=True),0)
 38 |     plot_tree = species_tree
 39 |     for node in plot_tree.traverse():
 40 |         node.set_style(nstyle)
 41 |         if node.name in bipart1:
 42 |             name_face = TextFace(node.name,fgcolor=color1)
 43 |         else:
 44 |             name_face = TextFace(node.name,fgcolor=color2)
 45 |         node.add_face(name_face,0,'branch-right')
 46 |     if replace_taxon:
 47 |         for leaf in plot_tree.get_leaves:
 48 |             try:
 49 |                 leaf.name=taxon_subst[leaf.name]
 50 |             except KeyError:
 51 |                 continue
 52 |     plot_tree.convert_to_ultrametric()
 53 |     plot_tree.render(png_fn,tree_style=ts,w=600)        
 54 | 
 55 | def majority_tree(species_tree,node_num,phyparts_root):
 56 |     
 57 |     num_concord = sum([1 for line in open("{}.concord.node.{}".format(phyparts_root,node_num))])
 58 |     png_fn = "node_{}_speciestree.png".format(node_num,num_concord)
 59 |     for line in open(phyparts_root+".node.key"):
 60 |         node = int(line.split()[0])
 61 |         if node == node_num:
 62 |             subtree = Tree(line.rstrip().split()[1]+";")
 63 |             subtree_bipart = subtree.get_leaf_names()
 64 |             render_tree(species_tree,subtree_bipart,num_concord,png_fn)
 65 |                     
 66 | 
 67 | def main():
 68 |     parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
 69 |     parser.add_argument('species_tree',help="Newick formatted species tree topology.")
 70 |     parser.add_argument('phyparts_root',help="File root name used for Phyparts.")
 71 |     parser.add_argument('node_num',type=int,default=0,help="Node number from Phyparts. To see a tree with numbered nodes, run phypartspiecharts.py with --show_nodes.")
 72 |     parser.add_argument('min_alt',type=int,default=0,help="Only print alternative bipartitions if they occur in this many gene trees")
 73 |     parser.add_argument('--taxon_subst',help="Comma-delimted file to translate tip names.")
 74 | 
 75 |     args = parser.parse_args()
 76 |     
 77 |     try: 
 78 |         subprocess.check_output('which convert',shell=True)
 79 |         convert = True
 80 |     except: 
 81 |         convert = False
 82 |     if args.taxon_subst:
 83 |         taxon_subst = {line.split(",")[0]:line.split(",")[1] for line in open(args.taxon_subst,'U')}
 84 |     else:
 85 |         taxon_subst = None
 86 | 
 87 | 
 88 |     alt_bipart,alt_counts = get_alternative_bipartitions(args.node_num,args.phyparts_root,args.min_alt)
 89 |     print("{} alternative bipartitions occurred in more than {} gene trees\n".format(len(alt_counts),args.min_alt))
 90 |     
 91 |     species_tree = Tree(args.species_tree)
 92 |     species_tree.ladderize(direction=1)
 93 |     majority_tree(species_tree,args.node_num,args.phyparts_root)
 94 |     
 95 |     for alt in range(len(alt_counts)):
 96 |         png_fn = "node_{}_alt_{}.png".format(args.node_num,alt)
 97 |         species_tree = Tree(args.species_tree)
 98 |         species_tree.ladderize(direction=1)
 99 |         render_tree(species_tree,alt_bipart[alt][0],alt_counts[alt],png_fn,replace_taxon=taxon_subst)
100 |     if convert:
101 |         os.system("convert node_{}_speciestree.png node_{}_alt_*.png node_{}.pdf".format(args.node_num,args.node_num,args.node_num))
102 |         os.system("rm node_{}*.png".format(args.node_num))
103 | 
104 | #t = Tree("((((Discelium-nudum-3746:1,Encalypta-intermedia-3219:1)1:1.02738,((Timmia-austriaca-3619:1,Entosthodon-pulchellus-3120:1)1:0.678047,(Chamaebryum-pottioides-3630:1,Chamaebryum-pottioides-3573:1)1:5.61798)1:0.843377)1:3.51197,(((Funaria-flavicans-4092:1,Funaria-hygrometrica-3891:1)1:3.87102,((Funaria-sp-3541:1,(Funaria-hygcalvescens-3633:1,Funaria-sp-3514:1)1:0.397782)1:2.28115,(Funaria-microstoma-3834:1,(Funaria-arctica-3544:1,(Funaria-polaris-3542:1,(Funaria-arctica-3833:1,((Funaria-hygrometrica-3476:1,(Funaria-hygrometrica-3388:1,Funaria-hygrometrica-3179:1)0.38:0.00555864)1:0.739336,((Funaria-sp-3882:1,Funaria-sp-3393:1)1:0.648175,(Funaria-hygrometrica-3515:1,Funaria-hygrometrica-3632:1)0.64:0.0737125)1:0.445965)1:0.943796)1:0.575318)1:0.240042)1:0.139316)1:0.88789)1:2.00293)1:5.75785,(((Physcomitrellopsis-africana-3142:1,Entosthodon-smithhurstii-3465:1)1:2.40187,(Entosthodon-sp-3726:1,(Entosthodon-sp-3545:1,(Entosthodon-clavatus-3896:1,Entosthodon-clavatus-3895:1)1:0.669355)1:3.49935)1:0.193954)1:2.82861,(((Entosthodon-hungaricus-3177:1,Entosthodon-americanus-3894:1)1:1.55658,(Entosthodon-lindigii-3546:1,((Entosthodon-muhlenbergii-3893:1,(Entosthodon-planoconvexus-3114:1,Entosthodon-duriaei-3843:1)1:2.39883)1:3.99232,((Entosthodon-attenuatus-3835:1,(Entosthodon-attenuatus-3479:1,Entosthodon-attenuatus-3543:1)1:0.133086)1:3.23622,((Entosthodon-sp-3837:1,(Physcomitrium-sp-3842:1,Entosthodon-subintegrus-3840:1)1:2.61412)1:2.65073,(Entosthodon-bergianus-3509:1,(Entosthodon-obtusus-3395:1,Entosthodon-obtusus-3347:1)1:3.84659)0.91:0.0632156)1:0.36051)1:3.21301)0.92:0.0664078)0.81:0.0496285)1:0.110983,((Physcomitrium-hookeri-3412:1,(Physcomitrium-hookeri-3409:1,Physcomitrium-pyriforme-3404:1)1:0.375291)1:3.74685,((Physcomitridium-readeri-3892:1,(((Physcomitrium-eurystomum-3392:1,Physcomitrium-eurystomum-3841:1)0.46:0.0189278,(Physcomitrium-pyriforme-3555:1,Physcomitrium-pyriforme-3387:1)0.55:0.0260365)1:0.212244,(((Physcomitrium-sp-3816:1,Physcomitrium-
sp-3508:1)1:2.44605,((Physcomitrium-sp-3551:1,Physcomitrium-sp-3539:1)1:1.2462,(Physcomitrium-japonicum-3413:1,Physcomitrium-japonicum-3411:1)0.78:0.0484692)1:1.21849)1:3.30359,((Physcomitrium-pyriforme-3118:1,Physcomitrium-pyriforme-3883:1)1:3.0447,(Physcomitrium-pyriforme-3787:1,((Physcomitrella-magdalenae-3844:1,(Physcomitrium-spathulatum-3549:1,(Physcomitrium-sp-3814:1,Physcomitrium-subsphaericum-3556:1)1:0.625116)1:0.522302)1:1.95296,(Physcomitrium-sp-3496:1,(Physcomitrium-pyriforme-3798:1,(Physcomitrium-pyriforme-3727:1,(Physcomitrium-pyriforme-3886:1,Physcomitrium-pyriforme-3728:1)1:0.129654)1:0.441593)1:1.5649)1:1.80883)0.59:0.0382)1:1.16855)1:2.09587)1:0.55998)1:0.590922)1:0.23246,(Physcomitrium-sp-3817:1,((Physcomitrella-patens-3403:1,Physcomitrella-patens-3139:1)1:4.05812,((Physcomitrium-sp-3672:1,(Aphanorrhegma-serratum-3305:1,(Physcomitrium-collenchymatum-3480:1,(Physcomitrium-sp-3115:1,Physcomitrium-collenchymatum-3178:1)0.49:0.035722)1:2.5141)1:0.389341)0.88:0.152428,Physcomitrium-immersum-3176:1)1:0.479012)1:1.1982)1:2.30257)1:0.817867)1:1.25824)1:4.29516)1:0.799009)1:5.4761)1:1,Goniomitrium-africanum-4081:1);")
105 | #t.ladderize(direction=1)
106 | #ts = TreeStyle()
107 | #ts.show_leaf_name = False
108 | 
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
110 | 


--------------------------------------------------------------------------------
/phypartspiecharts/README.md:
--------------------------------------------------------------------------------
 1 | # PhypartsPieCharts
 2 | 
 3 | Using the output of PhyParts (https://bitbucket.org/blackrim/phyparts), plot pie charts on the species phylogeny showing the percentage of concordant gene trees, percentage in the top alternative bipartition, other conflicting topologies, and uninformative genes.
 4 | 
 5 | For more information about PhyParts, consult the [original paper by Smith et al.](https://bmcevolbiol.biomedcentral.com/articles/10.1186/s12862-015-0423-0) and the Python notebook [here](PhyParts_PieCharts.ipynb).
 6 | 
 7 | **Dependencies**
 8 | 
 9 | Requires [ETE3](http://etetoolkit.org/) and Python > 2.7
10 | 
11 | 
12 | **Sample Usage**
13 | 
14 | ```
15 | python phypartspiecharts.py species.tre phyparts_root 158
16 | ```
17 | Run the script from the directory containing the Phyparts output files. 
18 | 
19 | `species.tre` is the rooted species phylogeny used in Phyparts.
20 | 
 21 | `phyparts_root` is the basename of the phyparts output. The default is `out`. The important output files are `out.concon.tre`, `out.hist`, `out.hist.alts`, and `out.node.key`.
22 | 
23 | Finally, indicate the number of gene trees used in Phyparts. This is used to properly calculate the pie chart percentages.
24 | 
25 | The default output will be `pies.svg` containing a ladderized version of your species tree and pie charts on each node. The default color scheme is the percentage of gene trees that are:
26 | 
27 | * Blue: concordant
 28 | * Green: the top alternative bipartition
29 | * Red: all other alternative bipartitions
30 | * Black: uninformative for that node
31 | 
32 | Numbers above and below the branch also indicate the number of concordant and conflicting gene trees, respectively.
33 | 
34 | **Example Output**
35 | 
36 | ![](img/default_pies.jpg)
37 | 
38 | **Other options**
39 | 
40 | * `--svg_name` Change the name of the svg image file. Default: `pies.svg`
41 | * `--show_nodes` Display a tree with nodes labeled by the Phyparts numbering scheme. Useful for further inspection of alternative bipartitions (i.e. `minority_report.py`). The tree is opened in a new window, so this may not work for remote logins (use `ssh -Y`). 
42 | * `--taxon_subst` Provide a comma delimited file that replaces tip labels in `species.tre` with a new label. Useful for converting accession numbers to species names, for example.
43 | * `--colors` Provide custom colors for the pie chart wedges. RGB triplets (comma-separated), hexadecimal, and named colors can be used and should be separated by a space. The colors will correspond to the same order as above.
44 | * `--to_csv` Generates CSV files for the categories and nodes, for input into ggTree in R (code provided by [Ben Cooper](https://github.com/benjamin-j-cooper)).
45 | 
46 | **Run with Example Data**
47 | 
48 | Data from [Medina et al. JSE 2019](https://onlinelibrary.wiley.com/doi/full/10.1111/jse.12516)
49 | 
50 | ```
51 | cd phyparts_example
52 | python ../phypartspiecharts.py species.tre out 648
53 | ```
54 | 
55 | ### Reroot Script
56 | 
57 | Also includes a script for rerooting gene and species trees. Requires ETE3.
58 | 
59 | Usage:
60 | 
61 | `python reroot_trees.py my.tree outgroup.list > rerooted.tre`
62 | 
 63 | Where `outgroup.list` is a text file containing a list of outgroup names found on the tree (one per line). The script will attempt to find the LCA of the outgroup names, set that LCA as the root branch, and print the rooted tree.
64 | 
65 | To run on a set of unrooted gene trees with GNU `parallel`: Assuming each gene is in the format `geneName.tre` and there is a list of `geneName` in a file called `genelist.txt`:
66 | 
67 | `parallel "python reroot_trees.py {}.tre outgroup.list > {}.rerooted.tre" :::: genelist.txt`
68 | 
69 | **Warning**: If no branch can be found, the tree file will be empty! You can remove empty tree files with:
70 | 
71 | `find . -size 0 -delete`
72 | 
73 | Remember to re-root the species tree as well before running PhyParts.
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/phypartspiecharts/img/default_pies.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/phypartspiecharts/img/default_pies.jpg


--------------------------------------------------------------------------------
/phypartspiecharts/img/pleuro_nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/phypartspiecharts/img/pleuro_nodes.png


--------------------------------------------------------------------------------
/phypartspiecharts/img/sphag_taka.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mossmatters/phyloscripts/c8c00307882a887f043572f0cbd79d30f9ad59b5/phypartspiecharts/img/sphag_taka.png


--------------------------------------------------------------------------------
/phypartspiecharts/phyparts_example/out.concon.tre:
--------------------------------------------------------------------------------
1 | (((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)478,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)231)626,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)638)132,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)602)482)250)634)76)50)59,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)342)584,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)646,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)261)600)635)640)641,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)495,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)384,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)143)556)638,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)636,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)390)308)549,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)246)615)618)194)560)639)638)439)518)315)645,((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)463)644)263,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)575)629);
2 | (((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)195)2,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)0)252,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)7)11)185)6)216)226)197,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)146)1,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)0,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)196)0)1)4)3,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)5,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)93,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)273)52)3,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)115)153)7,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)242)12)10)185)39)5)7)90)33)79)2,((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)35)2)193,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)2)1);
3 | (((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)0.9784027768416858,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)0.79076569047233)0.9689967641335626,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1.0)0.34872974537672113,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)0.9479351387882053)0.9049856997476616)0.5858268861374235)0.9371291522677113)0.25882761838229973)0.16382415706238607)0.18651191047865784,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)0.7500526160703238)0.9824436644806955,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1.0,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)0.6447504894707947)1.0)0.9831630546463646)0.9624808797655535)0.970304429867128,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)0.910410615398184,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)0.795980949498425,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.4428717628115614)0.8033376969061944)0.9318395066925902,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)0.9747441895741165,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)0.7243414002705952)0.6132456034656285)0.9506041771913188,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)0.7523018242875253)0.9274210646341674)0.9093078758550853)0.6420089852288237)0.8305051571490991)0.9520143595012353)0.9200661096764626)0.8018909077533978)0.8479602575782158)0.6423148907671082)0.9750481474391287,((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)0.8859424320701869)0.9711731195998496)
0.7098496912720081,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)0.9767909279348379)0.983124949874508);
4 | (((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)0.9784027768416858,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)0.79076569047233)0.9689967641335626,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1.0)0.35059059605189336,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)0.9518845860911305)0.9121982699563427)0.5870778339468724)0.9371708799789142)0.25882761838229973)0.1665969369584339)0.1895506573461794,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)0.7628637350241585)0.983021142874864,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1.0,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)0.6671950717062091)1.0)0.9833173867152819)0.9625055731351766)0.970304429867128,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)0.910410615398184,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)0.796639738720315,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.4428717628115614)0.8044187220999806)0.9321209146933542,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)0.9747441895741165,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)0.7246592227996926)0.6315598024116463)0.9523422941390248,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)0.7557833731708657)0.9279206152360917)0.9143761929210199)0.6752825223370501)0.8305051571490991)0.9520143595012353)0.9200661096764626)0.8018909077533978)0.8483042656155186)0.6433538869820215)0.9750481474391287,((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)0.8859424320701869)0.9713464955049456)0.7
173285514839135,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)0.9767909279348379)0.9833173867152819);
5 | 


--------------------------------------------------------------------------------
/phypartspiecharts/phyparts_example/out.hist:
--------------------------------------------------------------------------------
 1 | Node0,645.0,1.0,1.0,1.0,647
 2 | Node1,61.0,4.0,19.0,16.0,1.0,1.0,6.0,1.0,3.0,1.0,4.0,2.0,4.0,1.0,1.0,1.0,10.0,1.0,1.0,1.0,11.0,1.0,2.0,1.0,1.0,1.0,1.0,14.0,1.0,1.0,1.0,16.0,5.0,1.0,2.0,4.0,9.0,2.0,10.0,1.0,1.0,4.0,10.0,4.0,2.0,5.0,1.0,3.0,1.0,9.0,4.0,1.0,11.0,1.0,2.0,1.0,256
 3 | Node2,48.0,4.0,19.0,1.0,1.0,6.0,1.0,3.0,1.0,4.0,2.0,4.0,1.0,1.0,9.0,1.0,10.0,19.0,1.0,1.0,1.0,11.0,5.0,1.0,2.0,1.0,1.0,1.0,12.0,1.0,14.0,1.0,1.0,1.0,1.0,16.0,5.0,1.0,2.0,4.0,2.0,10.0,1.0,7.0,1.0,4.0,2.0,5.0,1.0,9.0,8.0,1.0,11.0,1.0,2.0,1.0,276
 4 | Node3,76.0,4.0,19.0,1.0,6.0,3.0,1.0,4.0,2.0,1.0,1.0,9.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,2.0,1.0,8.0,1.0,15.0,1.0,14.0,1.0,50.0,1.0,1.0,1.0,5.0,1.0,2.0,4.0,2.0,10.0,1.0,2.0,1.0,12.0,9.0,8.0,1.0,1.0,292
 5 | Node4,478.0,1.0,479
 6 | Node5,634.5,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,640
 7 | Node6,626.0,2.0,628
 8 | Node7,231.0,7.0,6.0,2.0,426
 9 | Node8,250.5,1.0,1.0,33.0,3.0,1.0,6.0,1.0,8.0,1.0,1.0,4.0,1.0,1.0,1.0,13.0,1.0,2.0,3.0,1.0,23.0,435
10 | Node9,131.5,33.0,21.0,15.0,1.0,14.0,42.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,23.0,384
11 | Node10,637.5,638
12 | Node11,536.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,3.0,1.0,1.0,1.0,493
13 | Node12,548.0,2.0,3.0,1.0,1.0,609
14 | Node13,316.5,1.0,1.0,5.0,1.0,1.0,1.0,2.0,1.0,4.0,2.0,4.0,9.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,2.0,1.0,1.0,5.0,1.0,1.0,1.0,2.0,7.0,3.0,1.0,1.0,4.0,1.0,1.0,1.0,6.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,394
15 | Node14,608.0,1.0,585
16 | Node15,318.0,9.0,12.0,488
17 | Node16,516.5,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,551
18 | Node17,641.0,1.0,2.0,644
19 | Node18,640.5,3.0,1.0,644
20 | Node19,646.0,646
21 | Node20,638.1666666666667,1.0,636
22 | Node21,620.6666666666667,600
23 | Node22,236.6666666666667,15.0,11.0,457
24 | Node23,439.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,9.0,1.0,1.0,1.0,1.0,1.0,7.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,529
25 | Node24,638.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,645
26 | Node25,640.8333333333333,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,641
27 | Node26,495.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,500
28 | Node27,554.8333333333333,7.0,1.0,4.0,1.0,6.0,2.0,12.0,1.0,1.0,1.0,3.0,5.0,10.0,4.0,608
29 | Node28,382.3333333333333,3.0,6.0,7.0,6.0,3.0,6.0,477
30 | Node29,143.0,9.0,7.0,7.0,6.0,10.0,5.0,3.0,12.0,2.0,3.0,6.0,416
31 | Node30,639.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,644
32 | Node31,636.0,1.0,1.0,1.0,637
33 | Node32,560.0,1.0,1.0,1.0,7.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,3.0,8.0,1.0,1.0,1.0,8.0,599
34 | Node33,573.3333333333333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,556
35 | Node34,284.33333333333337,9.0,12.0,1.0,18.0,12.0,19.0,1.0,1.0,1.0,1.0,6.0,461
36 | Node35,389.3333333333333,14.0,12.0,1.0,12.0,6.0,1.0,1.0,7.0,1.0,6.0,505
37 | Node36,235.0,3.0,1.0,1.0,1.0,1.0,1.0,14.0,1.0,8.0,1.0,1.0,1.0,10.0,1.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,8.0,379
38 | Node37,577.3333333333334,1.0,1.0,1.0,3.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,628
39 | Node38,619.8333333333334,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,3.0,2.0,1.0,1.0,627
40 | Node39,240.83333333333334,20.0,3.0,1.0,4.0,1.0,3.0,1.0,1.0,1.0,488
41 | Node40,636.5,1.0,630
42 | Node41,260.0,1.0,1.0,1.0,1.0,5.0,25.0,11.0,456
43 | Node42,639.5,1.0,1.0,1.0,1.0,646
44 | Node43,463.0,1.0,9.0,1.0,1.0,6.0,498
45 | Node44,575.0,1.0,1.0,577
46 | 


--------------------------------------------------------------------------------
/phypartspiecharts/phyparts_example/out.node.key:
--------------------------------------------------------------------------------
 1 | 0 ((Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1)1)0.99)1,((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)1)1,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1)1,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1)1)1)1)1
 2 | 1 (Entosthodon-lindigii-3546,(Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1)1)0.99)1
 3 | 2 (Entosthodon-americanus-3894,((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1)1)0.99
 4 | 3 ((Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1,((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1)1
 5 | 4 (Entosthodon-planoconvexus-3114,Entosthodon-duriaei-3843)1
 6 | 5 ((Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1,((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1)1
 7 | 6 (Entosthodon-attenuatus-3835,(Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1)1
 8 | 7 (Entosthodon-attenuatus-3479,Entosthodon-attenuatus-3543)1
 9 | 8 ((Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91,(Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1)1
10 | 9 (Entosthodon-bergianus-3509,(Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1)0.91
11 | 10 (Entosthodon-obtusus-3347,Entosthodon-obtusus-3395)1
12 | 11 (Entosthodon-sp-3837,(Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1)1
13 | 12 (Physcomitrium-sp-3842,Entosthodon-subintegrus-3840)1
14 | 13 ((Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)1)1,((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1)1,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1)1)1)1
15 | 14 (Physcomitrium-hookeri-3412,(Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)1)1
16 | 15 (Physcomitrium-hookeri-3409,Physcomitrium-pyriforme-3404)1
17 | 16 ((Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1)1,(Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1)1)1
18 | 17 (Physcomitrium-sp-3817,((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1)1
19 | 18 ((Physcomitrella-patens-3403,Physcomitrella-patens-3139)1,(Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1)1
20 | 19 (Physcomitrella-patens-3403,Physcomitrella-patens-3139)1
21 | 20 (Aphanorrhegma-serratum-3305,(Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1)1
22 | 21 (Physcomitrium-collenchymatum-3480,(Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1)1
23 | 22 (Physcomitrium-sp-3115,Physcomitrium-collenchymatum-3178)1
24 | 23 (Physcomitridium-readeri-3892,(((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1)1
25 | 24 (((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1,((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1)1
26 | 25 ((Physcomitrium-sp-3508,Physcomitrium-sp-3816)1,((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1)1
27 | 26 (Physcomitrium-sp-3508,Physcomitrium-sp-3816)1
28 | 27 ((Physcomitrium-sp-3539,Physcomitrium-sp-3551)1,(Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94)1
29 | 28 (Physcomitrium-sp-3539,Physcomitrium-sp-3551)1
30 | 29 (Physcomitrium-japonicum-3413,Physcomitrium-japonicum-3411)0.94
31 | 30 ((Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1,((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1)1
32 | 31 (Physcomitrium-pyriforme-3118,Physcomitrium-pyriforme-3883)1
33 | 32 ((Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1,(Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1)1
34 | 33 (Physcomitrella-magdalenae-3844,(Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1)1
35 | 34 (Physcomitrium-spathulatum-3549,(Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1)1
36 | 35 (Physcomitrium-subsphaericum-3556,Physcomitrium-sp-3814)1
37 | 36 (Physcomitrium-pyriforme-3787,(Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1)1
38 | 37 (Physcomitrium-sp-3496,(Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1)1
39 | 38 (Physcomitrium-pyriforme-3727,(Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1)1
40 | 39 (Physcomitrium-pyriforme-3728,Physcomitrium-pyriforme-3886)1
41 | 40 ((Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)1)1)1,(Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)1)1
42 | 41 (Entosthodon-sp-3726,(Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)1)1)1
43 | 42 (Entosthodon-sp-3545,(Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)1)1
44 | 43 (Entosthodon-clavatus-3895,Entosthodon-clavatus-3896)1
45 | 44 (Physcomitrellopsis-africana-3142,Entosthodon-smithhurstii-3465)1
46 | 


--------------------------------------------------------------------------------
/phypartspiecharts/phyparts_example/phyparts_dist.csv:
--------------------------------------------------------------------------------
 1 | node,concord,genes-concord
 2 | 0,645,2
 3 | 1,59,197
 4 | 2,50,226
 5 | 3,76,216
 6 | 4,478,1
 7 | 5,634,6
 8 | 6,626,2
 9 | 7,231,195
10 | 8,250,185
11 | 9,132,252
12 | 10,638,0
13 | 11,482,11
14 | 12,602,7
15 | 13,315,79
16 | 14,584,1
17 | 15,342,146
18 | 16,518,33
19 | 17,641,3
20 | 18,640,4
21 | 19,646,0
22 | 20,635,1
23 | 21,600,0
24 | 22,261,196
25 | 23,439,90
26 | 24,638,7
27 | 25,638,3
28 | 26,495,5
29 | 27,556,52
30 | 28,384,93
31 | 29,143,273
32 | 30,639,5
33 | 31,636,1
34 | 32,560,39
35 | 33,549,7
36 | 34,308,153
37 | 35,390,115
38 | 36,194,185
39 | 37,618,10
40 | 38,615,12
41 | 39,246,242
42 | 40,629,1
43 | 41,263,193
44 | 42,644,2
45 | 43,463,35
46 | 44,575,2


--------------------------------------------------------------------------------
/phypartspiecharts/phyparts_example/phyparts_pies.csv:
--------------------------------------------------------------------------------
 1 | node,adj_concord,adj_most_conflict,other_conflict,the_rest
 2 | 0,99.53703703703704,0.15432098765432098,0.15432098765432098,0.15432098765432098
 3 | 1,9.104938271604938,2.9320987654320985,27.469135802469136,60.49382716049383
 4 | 2,7.716049382716049,2.9320987654320985,31.944444444444443,57.407407407407405
 5 | 3,11.728395061728394,7.716049382716049,25.617283950617285,54.93827160493827
 6 | 4,73.76543209876543,0.15432098765432098,0.0,26.080246913580247
 7 | 5,97.8395061728395,0.30864197530864196,0.6172839506172839,1.2345679012345678
 8 | 6,96.60493827160494,0.30864197530864196,0.0,3.0864197530864197
 9 | 7,35.648148148148145,1.0802469135802468,29.01234567901235,34.25925925925926
10 | 8,38.58024691358025,5.092592592592593,23.456790123456788,32.870370370370374
11 | 9,20.37037037037037,6.481481481481481,32.407407407407405,40.74074074074074
12 | 10,98.4567901234568,0.0,0.0,1.5432098765432098
13 | 11,74.38271604938271,0.4629629629629629,1.2345679012345678,23.919753086419753
14 | 12,92.90123456790124,0.4629629629629629,0.6172839506172839,6.018518518518518
15 | 13,48.61111111111111,1.3888888888888888,10.802469135802468,39.19753086419753
16 | 14,90.12345679012346,0.15432098765432098,0.0,9.722222222222223
17 | 15,52.77777777777778,1.8518518518518516,20.679012345679013,24.691358024691358
18 | 16,79.93827160493827,0.7716049382716049,4.320987654320987,14.969135802469136
19 | 17,98.91975308641975,0.30864197530864196,0.15432098765432098,0.6172839506172839
20 | 18,98.76543209876543,0.4629629629629629,0.15432098765432098,0.6172839506172839
21 | 19,99.69135802469135,0.0,0.0,0.30864197530864196
22 | 20,97.99382716049382,0.15432098765432098,0.0,1.8518518518518516
23 | 21,92.5925925925926,0.0,0.0,7.4074074074074066
24 | 22,40.27777777777778,2.314814814814815,27.9320987654321,29.475308641975307
25 | 23,67.74691358024691,1.3888888888888888,12.5,18.3641975308642
26 | 24,98.4567901234568,0.30864197530864196,0.7716049382716049,0.4629629629629629
27 | 25,98.4567901234568,0.30864197530864196,0.15432098765432098,1.0802469135802468
28 | 26,76.38888888888889,0.30864197530864196,0.4629629629629629,22.839506172839506
29 | 27,85.80246913580247,1.8518518518518516,6.172839506172839,6.172839506172839
30 | 28,59.25925925925925,1.0802469135802468,13.271604938271606,26.38888888888889
31 | 29,22.067901234567902,1.8518518518518516,40.27777777777778,35.80246913580247
32 | 30,98.61111111111111,0.15432098765432098,0.6172839506172839,0.6172839506172839
33 | 31,98.14814814814815,0.15432098765432098,0.0,1.6975308641975309
34 | 32,86.41975308641975,1.2345679012345678,4.78395061728395,7.561728395061729
35 | 33,84.72222222222221,0.15432098765432098,0.9259259259259258,14.19753086419753
36 | 34,47.53086419753087,2.9320987654320985,20.679012345679013,28.858024691358025
37 | 35,60.18518518518518,2.1604938271604937,15.58641975308642,22.067901234567902
38 | 36,29.938271604938272,2.1604938271604937,26.38888888888889,41.51234567901235
39 | 37,95.37037037037037,0.4629629629629629,1.0802469135802468,3.0864197530864197
40 | 38,94.9074074074074,0.4629629629629629,1.3888888888888888,3.2407407407407405
41 | 39,37.96296296296296,3.0864197530864197,34.25925925925926,24.691358024691358
42 | 40,97.0679012345679,0.15432098765432098,0.0,2.7777777777777777
43 | 41,40.586419753086425,3.8580246913580245,25.925925925925924,29.629629629629626
44 | 42,99.38271604938271,0.15432098765432098,0.15432098765432098,0.30864197530864196
45 | 43,71.4506172839506,1.3888888888888888,4.012345679012346,23.14814814814815
46 | 44,88.73456790123457,0.15432098765432098,0.15432098765432098,10.95679012345679


--------------------------------------------------------------------------------
/phypartspiecharts/phyparts_example/species.tre:
--------------------------------------------------------------------------------
1 | (((Entosthodon-lindigii-3546:1,(Entosthodon-americanus-3894:1,((Entosthodon-planoconvexus-3114:1,Entosthodon-duriaei-3843:1)1:5.64578,((Entosthodon-attenuatus-3835:1,(Entosthodon-attenuatus-3479:1,Entosthodon-attenuatus-3543:1)1:0.300375)1:5.34711,((Entosthodon-bergianus-3509:1,(Entosthodon-obtusus-3347:1,Entosthodon-obtusus-3395:1)1:6.05287)0.91:0.0659619,(Entosthodon-sp-3837:1,(Physcomitrium-sp-3842:1,Entosthodon-subintegrus-3840:1)1:3.88757)1:3.21069)1:0.515246)1:4.56576)1:0.100426)0.99:0.0869556)1:0.129257,((Physcomitrium-hookeri-3412:1,(Physcomitrium-hookeri-3409:1,Physcomitrium-pyriforme-3404:1)1:0.546544)1:5.88447,((Physcomitrium-sp-3817:1,((Physcomitrella-patens-3403:1,Physcomitrella-patens-3139:1)1:6.06688,(Aphanorrhegma-serratum-3305:1,(Physcomitrium-collenchymatum-3480:1,(Physcomitrium-sp-3115:1,Physcomitrium-collenchymatum-3178:1)1:0.187039)1:5.9898)1:5.35659)1:4.45078)1:4.49357,(Physcomitridium-readeri-3892:1,(((Physcomitrium-sp-3508:1,Physcomitrium-sp-3816:1)1:4.0411,((Physcomitrium-sp-3539:1,Physcomitrium-sp-3551:1)1:1.24296,(Physcomitrium-japonicum-3413:1,Physcomitrium-japonicum-3411:1)0.94:0.0601077)1:2.19271)1:5.14283,((Physcomitrium-pyriforme-3118:1,Physcomitrium-pyriforme-3883:1)1:5.11929,((Physcomitrella-magdalenae-3844:1,(Physcomitrium-spathulatum-3549:1,(Physcomitrium-subsphaericum-3556:1,Physcomitrium-sp-3814:1)1:0.907898)1:0.492126)1:3.68456,(Physcomitrium-pyriforme-3787:1,(Physcomitrium-sp-3496:1,(Physcomitrium-pyriforme-3727:1,(Physcomitrium-pyriforme-3728:1,Physcomitrium-pyriforme-3886:1)1:0.193664)1:3.56346)1:3.46239)1:0.209688)1:2.35801)1:5.02628)1:5.25997)1:1.11675)1:2.2877)1:1.21442)1:2.98856,((Entosthodon-sp-3726:1,(Entosthodon-sp-3545:1,(Entosthodon-clavatus-3895:1,Entosthodon-clavatus-3896:1)1:1.56827)1:5.15235)1:0.329887,(Physcomitrellopsis-africana-3142:1,Entosthodon-smithhurstii-3465:2)1:4.30449)1:2.98856);
2 | 


--------------------------------------------------------------------------------
/phypartspiecharts/phypartspiecharts.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | helptext= '''
  4 | Generate the "Pie Chart" representation of gene tree conflict from Smith et al. 2015 from
  5 | the output of phyparts, the bipartition summary software described in the same paper.
  6 | 
  7 | The input files include three files produced by PhyParts, and a file containing a species
  8 | tree in Newick format (likely, the tree used for PhyParts). The output is an SVG containing
  9 | the phylogeny along with pie charts at each node.
 10 | 
 11 | Requirements:
 12 | 
 13 | Python 3
 14 | ete3
 15 | matplotlib
 16 | 
 17 | '''
 18 | 
 19 | import matplotlib,sys,argparse,re,json
 20 | from ete3 import Tree, TreeStyle, TextFace,NodeStyle,faces, COLOR_SCHEMES
 21 | 
 22 | 
 23 | #Read in species tree and convert to ultrametric
 24 | 
 25 | #Match phyparts nodes to ete3 nodes
 26 | def get_phyparts_nodes(sptree_fn,phyparts_root):
 27 | 	sptree = Tree(sptree_fn)
 28 | 	sptree.convert_to_ultrametric()
 29 | 
 30 | 	phyparts_node_key = [line for line in open(phyparts_root+".node.key")]
 31 | 	subtrees_dict = {n.split()[0]:Tree(n.split()[1]+";") for n in phyparts_node_key}
 32 | 	subtrees_topids = {}
 33 | 	for x in subtrees_dict:
 34 | 		subtrees_topids[x] = subtrees_dict[x].get_topology_id()
 35 | 	#print(subtrees_topids['1'])
 36 | 	#print()
 37 | 	for node in sptree.traverse():
 38 | 		node_topid = node.get_topology_id()
 39 | 		if "Takakia_4343a" in node.get_leaf_names():
 40 | 			print(node_topid)
 41 | 			print(node)
 42 | 		for subtree in subtrees_dict:
 43 | 			if node_topid == subtrees_topids[subtree]:
 44 | 				node.name = subtree
 45 | 	return sptree,subtrees_dict,subtrees_topids
 46 | 
 47 | #Summarize concordance and conflict from Phyparts
 48 | def get_concord_and_conflict(phyparts_root,subtrees_dict,subtrees_topids):
 49 | 
 50 | 	with open(phyparts_root + ".concon.tre") as phyparts_trees:
 51 | 		concon_tree = Tree(phyparts_trees.readline())
 52 | 		conflict_tree = Tree(phyparts_trees.readline())
 53 | 
 54 | 	concord_dict = {}
 55 | 	conflict_dict = {}
 56 | 
 57 | 
 58 | 	for node in concon_tree.traverse():
 59 | 		node_topid = node.get_topology_id()
 60 | 		for subtree in subtrees_dict:
 61 | 			if node_topid == subtrees_topids[subtree]:
 62 | 				concord_dict[subtree] = node.support
 63 | 	
 64 | 	for node in conflict_tree.traverse():
 65 | 		node_topid = node.get_topology_id()
 66 | 		for subtree in subtrees_dict:
 67 | 			if node_topid == subtrees_topids[subtree]:
 68 | 				conflict_dict[subtree] = node.support
 69 | 	return concord_dict, conflict_dict    
 70 |     
 71 | #Generate Pie Chart data
 72 | def get_pie_chart_data(phyparts_root,total_genes,concord_dict,conflict_dict):
 73 | 
 74 | 	phyparts_hist = [line for line in open(phyparts_root + ".hist")]
 75 | 	phyparts_pies = {}
 76 | 	phyparts_dict = {}
 77 | 
 78 | 	for n in phyparts_hist:
 79 | 		n = n.split(",")
 80 | 		tot_genes = float(n.pop(-1))
 81 | 		node_name = n.pop(0)[4:]
 82 | 		concord = float(n.pop(0))
 83 | 		concord = concord_dict[node_name]
 84 | 		all_conflict = conflict_dict[node_name]
 85 | 	
 86 | 		if len(n) > 0:   
 87 | 			most_conflict = max([float(x) for x in n])
 88 | 		else:
 89 | 			most_conflict = 0.0
 90 | 	
 91 | 		adj_concord = (concord/total_genes) * 100 
 92 | 		adj_most_conflict = (most_conflict/total_genes) * 100
 93 | 		other_conflict = (all_conflict - most_conflict) / total_genes * 100
 94 | 		the_rest = (total_genes - concord - all_conflict) / total_genes * 100
 95 | 	
 96 | 		pie_list = [adj_concord,adj_most_conflict,other_conflict,the_rest]
 97 | 		
 98 | 		phyparts_pies[node_name] = pie_list
 99 | 	
100 | 		phyparts_dict[node_name] = [int(round(concord,0)),int(round(tot_genes-concord,0))]
101 | 		
102 | 	return phyparts_dict, phyparts_pies    
103 | 
104 | 
def node_text_layout(mynode):
    """Layout function: draw the node's name (font size 20) to the right of its branch."""
    name_label = faces.TextFace(mynode.name, fsize=20)
    faces.add_face_to_node(name_label, mynode, 0, position="branch-right")
108 | 
109 | #convert internal phypartspiechart.py data files to csv and export to current directory (for use as ggtree tree data in R)
def pie_data_to_csv(phyparts_dict, phyparts_pies):
    """Write the per-node tables to CSV files in the current directory.

    ``phyparts_dist.csv`` gets one ``node,concord,genes-concord`` row per entry
    of ``phyparts_dict``; ``phyparts_pies.csv`` gets one
    ``node,adj_concord,adj_most_conflict,other_conflict,the_rest`` row per
    entry of ``phyparts_pies``. Rows are written in dict order.

    Parameters:
        phyparts_dict: node -> sequence of values for the dist table.
        phyparts_pies: node -> sequence of values for the pies table.
    """

    def write_table(path, header, table):
        # Header first, then one comma-joined row per node.
        rows = [header]
        rows.extend(
            ",".join(str(value) for value in (node, *values))
            for node, values in table.items()
        )
        with open(path, 'w') as handle:
            # Lines are joined without a trailing newline, which keeps the
            # output identical to what this function produced previously.
            handle.write("\n".join(rows))

    # Use the arguments that were passed in rather than a module-level global,
    # and format values directly so floats in exponent notation or negative
    # numbers are written correctly.
    write_table('phyparts_dist.csv', 'node,concord,genes-concord', phyparts_dict)
    write_table(
        'phyparts_pies.csv',
        'node,adj_concord,adj_most_conflict,other_conflict,the_rest',
        phyparts_pies,
    )
134 | 
135 | 
# Command-line interface.
parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('species_tree',help="Newick formatted species tree topology.")
parser.add_argument('phyparts_root',help="File root name used for Phyparts.")
parser.add_argument('num_genes',type=int,default=0,help="Number of total gene trees. Used to properly scale pie charts.")
parser.add_argument('--taxon_subst',help="Comma-delimted file to translate tip names.")
parser.add_argument("--svg_name",help="File name for SVG generated by script",default="pies.svg")
parser.add_argument("--show_nodes",help="Also show tree with nodes labeled same as PhyParts",action="store_true",default=False)
parser.add_argument("--colors",help="Four colors of the pie chart: concordance (blue) top conflict (green), other conflict (red), no signal (gray)",nargs="+",default=["blue","green","red","dark gray"])
parser.add_argument("--no_ladderize",help="Do not ladderize the input species tree.",action="store_true",default=False)
parser.add_argument("--to_csv",help="Output data files to csv for import into ggtree in R",action="store_true",default=False)

args = parser.parse_args()
# Every pie slice is computed by dividing by num_genes, so reject values that
# would otherwise fail later with ZeroDivisionError or give negative slices.
if args.num_genes <= 0:
    parser.error("num_genes must be a positive integer.")
ladderize = not args.no_ladderize

plot_tree,subtrees_dict,subtrees_topids = get_phyparts_nodes(args.species_tree, args.phyparts_root)
concord_dict, conflict_dict = get_concord_and_conflict(args.phyparts_root,subtrees_dict,subtrees_topids)
phyparts_dist, phyparts_pies = get_pie_chart_data(args.phyparts_root,args.num_genes,concord_dict,conflict_dict)

if args.taxon_subst:
    # Translate tip names using the first two comma-separated fields of each
    # line. The file is opened in default text mode: the 'U' mode flag is
    # rejected by open() on Python 3.11 and later.
    taxon_subst = {}
    with open(args.taxon_subst) as subst_file:
        for line in subst_file:
            fields = line.rstrip().split(",")
            # Skip blank or comma-less lines instead of raising IndexError.
            if len(fields) < 2:
                continue
            taxon_subst[fields[0]] = fields[1]
    for leaf in plot_tree.get_leaves():
        try:
            leaf.name = taxon_subst[leaf.name]
        except KeyError:
            # Tips without a translation keep their name; it is printed.
            print(leaf.name)
            continue
def phyparts_pie_layout(mynode):
    """Layout function that decorates one node of the plotted tree.

    Nodes whose name is a key of ``phyparts_pies`` get a 50x50 pie chart to the
    right of the branch, coloured with ``args.colors``, plus the integer
    values from ``concord_dict`` above the branch and ``conflict_dict`` below
    it. Every other node gets its name as aligned text.
    """
    node_key = mynode.name

    if node_key not in phyparts_pies:
        name_face = faces.TextFace(node_key, fsize=20)
        faces.add_face_to_node(name_face, mynode, 0, position="aligned")
        return

    chart = faces.PieChartFace(
        phyparts_pies[node_key],
        colors=args.colors,
        width=50,
        height=50,
    )
    chart.border.width = None
    chart.opacity = 1
    faces.add_face_to_node(chart, mynode, 0, position="branch-right")

    # Build both count labels first, then attach them top and bottom.
    count_labels = [
        (faces.TextFace(str(int(tally[node_key])) + '   ', fsize=20), spot)
        for tally, spot in (
            (concord_dict, "branch-top"),
            (conflict_dict, "branch-bottom"),
        )
    ]
    for label, spot in count_labels:
        faces.add_face_to_node(label, mynode, 0, position=spot)
185 | 
186 | #Plot Pie Chart	
# Tree-wide drawing options for the pie-chart figure.
ts = TreeStyle()
ts.layout_fn = phyparts_pie_layout
ts.show_leaf_name = False
ts.scale = 30
ts.branch_vertical_margin = 10
ts.guiding_lines_type = 0
ts.guiding_lines_color = "black"
ts.draw_guiding_lines = True

# One shared style with node size 0, applied to every node.
nstyle = NodeStyle()
nstyle["size"] = 0
for tree_node in plot_tree.traverse():
    tree_node.set_style(nstyle)
    tree_node.img_style["vt_line_width"] = 0

plot_tree.convert_to_ultrametric()

# Optional CSV export of the per-node tables.
if args.to_csv:
    pie_data_to_csv(phyparts_dist, phyparts_pies)

if ladderize:
    plot_tree.ladderize(direction=1)
my_svg = plot_tree.render(args.svg_name, tree_style=ts, w=595, dpi=300)

# Optional second rendering that labels nodes via node_text_layout.
if args.show_nodes:
    node_style = TreeStyle()
    node_style.layout_fn = node_text_layout
    node_style.show_leaf_name = False
    plot_tree.render("tree_nodes.pdf", tree_style=node_style)
215 | 
216 |      
217 |     


--------------------------------------------------------------------------------
/phypartspiecharts/reroot_trees.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from ete3 import Tree
 3 | 
 4 | if len(sys.argv) < 3:
 5 |     print("Usage: python reroot_trees.py treefile.tre outgroup.list > rerooted.tre")
 6 |     
 7 | # outgroup.list should be one sample name per line
 8 | outgroup_names = [x.rstrip() for x in open(sys.argv[2])]
 9 | 
10 | for line in open(sys.argv[1]):
11 |     t = Tree(line.rstrip())
12 |     outgroups_in_tree = list(set(t.get_leaf_names()).intersection(set(outgroup_names)))
13 |     if len(outgroups_in_tree) > 1:
14 |         ancestor = t.get_common_ancestor(outgroups_in_tree)
15 |         if ancestor == t:
16 |             ingroups_in_tree = list(set(t.get_leaf_names()).difference(set(outgroups_in_tree)))
17 |             ancestor = t.get_common_ancestor(ingroups_in_tree)
18 |             t.set_outgroup(ancestor)
19 |             print(t.write())
20 |         else:
21 |             t.set_outgroup(ancestor)
22 |             print(t.write())
23 |     elif len(outgroups_in_tree) == 1:
24 |         t.set_outgroup(outgroups_in_tree[0])
25 |         print(t.write())
26 |     else:
27 |         continue
28 | 


--------------------------------------------------------------------------------