├── treesort ├── __init__.py ├── version.py ├── lm_outlier_detector.py ├── tree_indexer.py ├── jc_reassortment_test.py ├── helpers.py ├── reassortment_utils.py ├── parsimony.py ├── options.py ├── cli.py └── reassortment_inference.py ├── conda-requirements.txt ├── tutorial ├── figures │ ├── TreeSort-logo-150.png │ ├── TreeSort-logo-300.png │ ├── TreeSort-illustration.png │ ├── swH1-reassortment-ex1.png │ ├── swH1-reassortment-ex2.png │ ├── swH1-reassortment-ex3.png │ └── swH1-reassortment-ex4.png └── swH1-parsed │ ├── descriptor.csv │ ├── HA-swine_H1_HANA.fasta.aln.treetime │ ├── root_to_tip_regression.pdf │ └── outliers.tsv │ └── NA-swine_H1_HANA.fasta.aln.treetime │ ├── root_to_tip_regression.pdf │ └── outliers.tsv ├── treesort.py ├── examples └── descriptor-huH1N1-wgs.csv ├── .gitignore ├── LICENSE ├── setup.py ├── treetime-root.py ├── prepare_dataset.sh └── README.md /treesort/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /treesort/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.3.1' 2 | -------------------------------------------------------------------------------- /conda-requirements.txt: -------------------------------------------------------------------------------- 1 | fasttree 2 | iqtree 3 | mafft 4 | pip 5 | smof 6 | -------------------------------------------------------------------------------- /tutorial/figures/TreeSort-logo-150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/figures/TreeSort-logo-150.png -------------------------------------------------------------------------------- /tutorial/figures/TreeSort-logo-300.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/figures/TreeSort-logo-300.png -------------------------------------------------------------------------------- /tutorial/figures/TreeSort-illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/figures/TreeSort-illustration.png -------------------------------------------------------------------------------- /tutorial/figures/swH1-reassortment-ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/figures/swH1-reassortment-ex1.png -------------------------------------------------------------------------------- /tutorial/figures/swH1-reassortment-ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/figures/swH1-reassortment-ex2.png -------------------------------------------------------------------------------- /tutorial/figures/swH1-reassortment-ex3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/figures/swH1-reassortment-ex3.png -------------------------------------------------------------------------------- /tutorial/figures/swH1-reassortment-ex4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/figures/swH1-reassortment-ex4.png -------------------------------------------------------------------------------- /treesort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from treesort.cli import run_treesort_cli 4 | 5 | 6 | if __name__ == '__main__': 7 | run_treesort_cli() 8 | 
-------------------------------------------------------------------------------- /tutorial/swH1-parsed/descriptor.csv: -------------------------------------------------------------------------------- 1 | *HA,HA-swine_H1_HANA.fasta.aln,HA-swine_H1_HANA.fasta.aln.rooted.tre 2 | NA,NA-swine_H1_HANA.fasta.aln,NA-swine_H1_HANA.fasta.aln.rooted.tre 3 | -------------------------------------------------------------------------------- /tutorial/swH1-parsed/HA-swine_H1_HANA.fasta.aln.treetime/root_to_tip_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/swH1-parsed/HA-swine_H1_HANA.fasta.aln.treetime/root_to_tip_regression.pdf -------------------------------------------------------------------------------- /tutorial/swH1-parsed/NA-swine_H1_HANA.fasta.aln.treetime/root_to_tip_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flu-crew/TreeSort/HEAD/tutorial/swH1-parsed/NA-swine_H1_HANA.fasta.aln.treetime/root_to_tip_regression.pdf -------------------------------------------------------------------------------- /examples/descriptor-huH1N1-wgs.csv: -------------------------------------------------------------------------------- 1 | PB2, ../huH1N1/USA/PB2.final.aln, ../huH1N1/USA/PB2.fasttree.tre 2 | PB1, ../huH1N1/USA/PB1.final.aln, ../huH1N1/USA/PB1.fasttree.tre 3 | PA, ../huH1N1/USA/PA.final.aln, ../huH1N1/USA/PA.fasttree.tre 4 | *HA, ../huH1N1/USA/HA.final.aln, ../huH1N1/USA/HA.final.aln.rooted.tre 5 | NP, ../huH1N1/USA/NP.final.aln, ../huH1N1/USA/NP.fasttree.tre 6 | NA, ../huH1N1/USA/NA.final.aln, ../huH1N1/USA/NA.fasttree.tre 7 | MP, ../huH1N1/USA/MP.final.aln, ../huH1N1/USA/MP.fasttree.tre 8 | NS, ../huH1N1/USA/NS.final.aln, ../huH1N1/USA/NS.fasttree.tre 9 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Phylogenetic/fasta test files. 2 | *.tre 3 | *.nexus 4 | *.aln 5 | *.fasta 6 | *.fna 7 | 8 | # TreeTime output 9 | treetime* 10 | 11 | # General 12 | *.pdf 13 | *.ppt 14 | *.pptx 15 | *.zip 16 | *.csv 17 | 18 | 19 | # Compiled Python bytecode and related files 20 | *.py[cod] 21 | dist/ 22 | build/ 23 | *.egg-info/ 24 | __pycache__/ 25 | 26 | # Log files 27 | *.log 28 | 29 | # JetBrains IDE 30 | .idea/ 31 | 32 | # Unit test reports 33 | TEST*.xml 34 | .pytest* 35 | 36 | # Generated by MacOS 37 | .DS_Store 38 | 39 | # Python virtual environment 40 | venv/ 41 | 42 | # Simulation files 43 | sims/ 44 | 45 | # Other 46 | testfiles/ 47 | misc/ 48 | -------------------------------------------------------------------------------- /tutorial/swH1-parsed/NA-swine_H1_HANA.fasta.aln.treetime/outliers.tsv: -------------------------------------------------------------------------------- 1 | given_date apparent_date residual 2 | A/swine/Oklahoma/A02245577/2020|1A.3.3.2|LAIV-98|TLLPPT|2020-03-24 2020.23 1994.2189896759946 -4.114862948315013 3 | A/swine/Nebraska/A01378047/2021|1B.2.2.2|LAIV-98|LLLLPT|2021-01-28 2021.08 1994.4302255610737 -4.215913494082321 4 | A/swine/Iowa/A02478443/2019|1A.3.3.2|LAIV-98|LLLLPP|2019-04-26 2019.32 1993.7862012468288 -4.039369525073382 5 | A/swine/Nebraska/A02479104/2020|1A.2-3-like|LAIV-98|LLLLPT|2020-02-25 2020.15 1994.4343152730062 -4.068143334517119 6 | A/swine/Iowa/A02525361/2021|1B.2.2.1|LAIV-98|TLLTPT|2021-04-09 2021.27 1995.965872599665 -4.003036213590898 7 | A/swine/Indiana/A02636016/2021|1A.1.1|LAIV-98|TTTPPT|2021-08-02 2021.58 1993.7861821241984 -4.396897696196047 8 | A/swine/Iowa/A02524534/2020|1B.2.2.2|LAIV-98|LLLPPT|2020-08-12 2020.61 1994.8706124236035 -4.071893053407719 9 | A/swine/Iowa/A02271349/2018|1A.2-3-like|pdm|LLPLPP|2018-12-04 2018.92 1990.468299250318 -4.500972771648383 10 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Flu-crew at the National Animal Disease Center 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /treesort/lm_outlier_detector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from sklearn.linear_model import LinearRegression 4 | 5 | 6 | # This is an earlier idea of using linear regression outliers for reassortment detection. 
class LMOutlierDetector(object):
    """
    Flags reassortment candidates as outliers of a linear regression fitted between
    sibling distances in two segment trees (legacy approach; see module comment).
    """
    trained_reg: LinearRegression  # regression fitted from segment-1 to segment-2 distances
    iqd: float  # inter-quartile distance of the training residuals
    q3: float  # third quartile of the training residuals

    def __init__(self, sibling_dists_s1: np.ndarray, sibling_dists_s2: np.ndarray):
        # Require a minimal sample to make the quartile estimates meaningful.
        assert len(sibling_dists_s1) >= 10
        self.sibling_dists_s1 = sibling_dists_s1
        self.sibling_dists_s2 = sibling_dists_s2

        xs = sibling_dists_s1.reshape(-1, 1)
        regressor: LinearRegression = LinearRegression(fit_intercept=True).fit(
            xs, sibling_dists_s2.reshape(-1, 1))
        residuals: np.ndarray = sibling_dists_s2 - regressor.predict(xs).reshape(-1)
        residuals.sort()
        print(residuals)
        # Quartiles are taken directly from the sorted residuals (no interpolation).
        q1 = residuals[round(len(residuals) / 4) - 1]
        q3 = residuals[round(len(residuals) * 3 / 4) - 1]
        self.iqd = q3 - q1
        self.trained_reg = regressor
        self.q3 = q3
        print(f'IQD {self.iqd}, Q1 {q1}, Q3 {self.q3}')

    def get_residual(self, x: float, y: float) -> float:
        """Residual of the point (x, y) relative to the trained regression line."""
        predicted = self.trained_reg.predict(np.array([[x]], dtype=float))
        return y - predicted[0, 0]

    def is_outlier(self, x: float, y: float, iqd_mult=2) -> bool:
        """True iff the point lies more than iqd_mult inter-quartile distances above Q3."""
        threshold = self.q3 + self.iqd * iqd_mult
        return self.get_residual(x, y) >= threshold
# -*- coding: utf-8 -*-
from dendropy import TaxonNamespace, Tree


class InvalidArgumentError(Exception):
    """Raised when a function receives an argument value it cannot work with."""

    def __init__(self, name: str, value: str, message=''):
        super(InvalidArgumentError, self).__init__('Invalid argument %s: %s; %s' % (name, value, message))
        self.name = name
        self.value = value


class TreeIndexer:
    """
    Adds 'index' field to all nodes and taxa on the passed trees.
    It ensures that the taxa are indexed consistently across trees.
    """

    def __init__(self, taxon_namespace: TaxonNamespace):
        # Fix one global taxon ordering up front so that every indexed tree agrees on it.
        self.label_mapping = {taxon.label: index for index, taxon in enumerate(taxon_namespace)}

    def index_tree(self, tree: Tree):
        """Assigns 'index' to every taxon and node of the tree (internal nodes first, then leaves)."""
        for leaf_node in tree.leaf_nodes():
            label = leaf_node.taxon.label
            if label not in self.label_mapping:
                print(leaf_node.taxon.label)
                print(self.label_mapping)
                raise InvalidArgumentError('tree', '', 'Input tree should be over the initially specified taxon set')
            leaf_node.taxon.index = self.label_mapping[label]

        # Internal nodes get the smaller ids (in postorder); the leaves follow.
        node_id = 0
        for node in tree.postorder_internal_node_iter():
            node.index = node_id
            node_id += 1
        for node in tree.leaf_node_iter():
            node.index = node_id
            node_id += 1
        tree.annotations.add_new('indexed', str(True))

    @staticmethod
    def is_indexed(tree: Tree) -> bool:
        """True iff index_tree() was previously run on this tree (checked via a tree annotation)."""
        return tree.annotations.get_value('indexed', str(False)) == str(True)
The method assumes strict molecular clock (but with deviation) 11 | :param subs: Number of observed substitutions in the second gene segment 12 | :param sites: Number of sites in the second gene segment 13 | :param ml_distance: Expected number of substitutions per site in the first gene segment 14 | :param rate_ratio: Ratio in global substitution rates between the second and first segments 15 | :param pvalue_threshold: p-values below this threshold will be inferred as reassortments 16 | :param allowed_deviation: Should be >=1: allowed deviation from the strict molecular clock in each segment 17 | :return: the pvalue of observing the number of subs over the ml_distance edge. 18 | """ 19 | if ml_distance < 1 / sites: 20 | ml_distance = 1 / sites 21 | max_deviation = allowed_deviation * allowed_deviation 22 | sub_probability = 0.75 - 0.75 * (math.exp(-(4 * ml_distance * rate_ratio * max_deviation) / 3)) 23 | pvalue = binomtest(subs, sites, p=sub_probability, alternative='greater').pvalue 24 | # if pvalue < 0.001: 25 | # print(subs, sites, ml_distance, sub_probability, pvalue) 26 | return pvalue 27 | 28 | 29 | class JCReassortmentTester(object): 30 | 31 | def __init__(self, sites: int, rate_ratio: float, pvalue_threshold: float, allowed_deviation: float): 32 | self.sites = sites 33 | self.rate_ratio = rate_ratio 34 | self.pvalue_threshold = pvalue_threshold 35 | self.allowed_deviation = allowed_deviation 36 | 37 | def is_reassorted(self, subs: int, ml_distance: float) -> Tuple[bool, float]: 38 | pvalue = jc_pvalue(subs, self.sites, ml_distance, self.rate_ratio, self.allowed_deviation) 39 | return (pvalue < self.pvalue_threshold), pvalue 40 | -------------------------------------------------------------------------------- /treetime-root.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | @author: Alexey Markin 4 | """ 5 | import sys 6 | import os 7 | import subprocess 8 | from Bio import SeqIO 9 | 
#!/usr/bin/env python
"""
@author: Alexey Markin
"""
import sys
import os
import subprocess
from Bio import SeqIO
from datetime import datetime
import re


def extract_dates(path: str, format='%Y-%m-%d') -> str:
    """
    Scans '|'-delimited FASTA record names for a date-looking token, converts it to a
    decimal year, and writes a TreeTime-compatible dates CSV next to the input file.
    :param path: path to the FASTA file.
    :param format: unused; kept for backward compatibility of the signature.
    :return: path to the generated dates CSV file.
    """
    records = SeqIO.parse(path, 'fasta')
    file_name = path + '.dates.csv'
    # 'with' guarantees the file is closed even if parsing raises midway.
    with open(file_name, 'w+') as dates_file:
        dates_file.write('name, date\n')
        for record in records:
            name = record.name
            for token in name.split('|'):
                # A date token: >=4 chars of digits/dashes/slashes, but not a plain long
                # number (which is likely an accession id, not a date).
                if re.fullmatch(r'[\d\-/]{4,}', token) and not re.fullmatch(r'\d{5,}', token):
                    try:
                        if token.count('/') == 2:
                            try:
                                date = datetime.strptime(token, '%m/%d/%Y')
                            except ValueError:
                                date = datetime.strptime(token, '%Y/%m/%d')
                        elif token.count('/') == 1:
                            date = datetime.strptime(token, '%m/%Y')
                        elif token.count('-') == 2:
                            date = datetime.strptime(token, '%Y-%m-%d')
                        elif token.count('-') == 1:
                            date = datetime.strptime(token, '%Y-%m')
                        else:
                            date = datetime.strptime(token, '%Y')
                    except ValueError:
                        print(f"Can't parse date {token} -- skipping")
                        continue
                    # Approximate decimal year (30-day months), matching treesort.helpers.parse_dates.
                    dec_date = date.year + ((date.month - 1) * 30 + date.day) / 365.0
                    dates_file.write('%s, %.2f\n' % (name, dec_date))
    return file_name


def root_tree(tree_path: str, alignment_path: str) -> str:
    """
    Roots the tree with 'treetime clock' using dates parsed from the alignment's record names.
    :param tree_path: path to the input (unrooted) newick tree.
    :param alignment_path: path to the matching FASTA alignment.
    :return: path to the rooted tree file.
    """
    dates_file = extract_dates(alignment_path)
    rooted_tree = alignment_path + '.rooted.tre'
    treetime_dir = alignment_path + '.treetime'
    print(' '.join(['treetime', 'clock', '--tree', tree_path,
                    '--aln', alignment_path, '--dates', dates_file,
                    '--outdir', treetime_dir]))
    subprocess.call(['treetime', 'clock', '--tree', tree_path,
                     '--aln', alignment_path, '--dates', dates_file,
                     '--outdir', treetime_dir], stderr=subprocess.STDOUT)
    os.replace(treetime_dir + '/rerooted.newick', rooted_tree)
    # Bug fix: the declared '-> str' return value was previously missing (function returned None).
    return rooted_tree


if __name__ == '__main__':
    args = sys.argv[1:]
    root_tree(args[0], args[1])
#!/bin/bash

# Usage: ./prepare_dataset.sh [--segments "..." --fast] fasta_path reference_segment outdir
# Using --fast will make all trees to be inferred with FastTree.
# By default (without --fast) the reference tree is inferred with IQ-Tree, which is recommended for better accuracy.
# Example usage: ./prepare_dataset.sh --segments "HA,NA" segments.fasta HA myoutdir
# Example with default segments: ./prepare_dataset.sh segments.fasta HA myoutdir

# These are the default segment names
declare -a segments=("PB2" "PB1" "PA" "HA" "NP" "NA" "MP" "NS")
FAST=0

POSITIONAL_ARGS=()

while [[ $# -gt 0 ]]; do
  case $1 in
    --segments)
      SEGMENTS_STR="$2"
      segments=(${SEGMENTS_STR//,/ })  # intentional word-splitting: "HA,NA" -> (HA NA)
      shift # past argument
      shift # past value
      ;;
    --fast)
      FAST=1
      shift
      ;;
    -*|--*)
      echo "Unrecognized option $1"
      exit 1
      ;;
    *)
      POSITIONAL_ARGS+=("$1") # save positional arg
      shift # past argument
      ;;
  esac
done

set -- "${POSITIONAL_ARGS[@]}"

# Required arguments:
main_fasta="$1" # Provide a path to a fasta file with all segments
ref_seg="$2"    # Name of the segment to use as the reference (typically - HA)
outdir="$3"     # Path to the directory to store the results

# -f: do not fail on the first run, when the directory does not exist yet.
rm -rf "$outdir" # Clear out the directory
mkdir "$outdir"  # Re-create the directory

name=${main_fasta##*/}

# Split out the segments and align them.
# Note: all expansions are quoted so that paths with spaces do not break word-splitting.
for seg in "${segments[@]}"
do
    cat "$main_fasta" | smof grep "|${seg}|" > "${outdir}/${seg}-${name}"
    echo "Aligning ${seg}..."
    mafft --thread 6 "${outdir}/${seg}-${name}" | sed "s/|${seg}|/|/g" > "${outdir}/${seg}-${name}.aln"
    rm "${outdir}/${seg}-${name}"
done

if [ "$FAST" -eq 0 ]; then
    # Build fasttree trees in parallel for non-reference segments
    echo "Building non-reference trees in parallel with FastTree..."
    for seg in "${segments[@]}"
    do
        if [ "$seg" != "$ref_seg" ]; then
            fasttree -nt -gtr -gamma "${outdir}/${seg}-${name}.aln" > "${outdir}/${seg}-${name}.tre" &
        fi
    done
    wait # Wait to finish.

    # Build an IQ-Tree tree for the reference segment. We use the GTR+F+R5 model by default which can be changed
    echo "Building the reference tree with IQ-Tree..."
    iqtree2 -s "${outdir}/${ref_seg}-${name}.aln" -T 6 --prefix "${outdir}/${ref_seg}-${name}" -m GTR+F+R5
    mv "${outdir}/${ref_seg}-${name}.treefile" "${outdir}/${ref_seg}-${name}.tre"
else
    # Build all trees with FastTree in parallel.
    echo "Building trees in parallel with FastTree..."
    for seg in "${segments[@]}"
    do
        fasttree -nt -gtr -gamma "${outdir}/${seg}-${name}.aln" > "${outdir}/${seg}-${name}.tre" &
    done
    wait # Wait to finish.
fi

# Root the trees with a custom rooting script (in parallel)
echo "Rooting trees with TreeTime..."
86 | for seg in "${segments[@]}" 87 | do 88 | python treetime-root.py ${outdir}/${seg}-${name}.tre ${outdir}/${seg}-${name}.aln & 89 | done 90 | wait 91 | 92 | # Create a descriptor file 93 | descriptor=${outdir}/descriptor.csv 94 | for seg in "${segments[@]}" 95 | do 96 | if [ $seg == $ref_seg ]; then 97 | echo -n "*" >> $descriptor 98 | fi 99 | echo "${seg},${seg}-${name}.aln,${seg}-${name}.aln.rooted.tre" >> $descriptor 100 | done 101 | echo "The descriptor file was written to ${descriptor}" 102 | -------------------------------------------------------------------------------- /tutorial/swH1-parsed/HA-swine_H1_HANA.fasta.aln.treetime/outliers.tsv: -------------------------------------------------------------------------------- 1 | given_date apparent_date residual 2 | A/swine/Chile/YA026/2014|1B.2.2|LAIV-98|PPPPPP|2014-01-01 2014.0 1998.1559081042533 -6.609097869366056 3 | A/swine/Oklahoma/A02245006/2019|1A.2-3-like|LAIV-98|TTTPPT|2019-03-07 2019.18 2000.8638006608433 -7.640296132074614 4 | A/swine/Nebraska/A02479104/2020|1A.2-3-like|LAIV-98|LLLLPT|2020-02-25 2020.15 2000.7591344688262 -8.08857515536438 5 | A/swine/Oklahoma/A01785571/2018|1A.2-3-like|LAIV-Classical|LTLLPT|2018-06-12 2018.44 2000.594878879735 -7.443793734002908 6 | A/swine/Oklahoma/A01785573/2018|1A.2-3-like|LAIV-Classical|LTLLPT|2018-06-14 2018.45 2000.594878879735 -7.447965066707222 7 | A/swine/Minnesota/A02245535/2020|1A.2-3-like|LAIV-Classical|LLLPPT|2020-03-19 2020.22 2001.0826973297421 -7.98280564993242 8 | A/swine/Illinois/A02157797/2018|1A.2-3-like|LAIV-Classical|TTLTPT|2018-04-11 2018.28 2000.7569594747958 -7.309443202045201 9 | A/swine/Minnesota/A01785575/2018|1A.2-3-like|LAIV-98|TLLTPT|2018-06-19 2018.46 2000.5948654061 -7.452142019712851 10 | A/swine/Minnesota/A01785613/2018|1A.2-3-like|LAIV-98|TLLTPT|2018-09-10 2018.68 2001.2455868419358 -7.272473778540949 11 | A/swine/Minnesota/A01785608/2018|1A.2-3-like|LAIV-98|TLLTPT|2018-08-30 2018.66 2001.2463119231168 -7.263828657648307 12 | 
A/swine/Nebraska/A02245333/2019|1A.2-3-like|2002A|TLLPPT|2019-11-12 2019.85 2002.2278834603933 -7.350771103953527 13 | A/swine/Nebraska/A01378044/2019|1A.2-3-like|2002B|TLLPPT|2019-05-23 2019.39 2001.4092014353105 -7.50038931011847 14 | A/swine/Nebraska/A02245334/2019|1A.2-3-like|2002B|TLLPPT|2019-11-12 2019.85 2002.0736866884797 -7.415091707710629 15 | A/swine/Iowa/A02432384/2019|1A.2-3-like|LAIV-Classical|TTTPPT|2019-04-17 2019.29 2000.9194220723643 -7.662979250527693 16 | A/swine/Iowa/A02478477/2019|1A.2-3-like|LAIV-Classical|TTTPPT|2019-05-07 2019.35 2001.0811593392405 -7.620541251671756 17 | A/swine/Texas/A01104132/2019|1A.2-3-like|LAIV-Classical|LTTLPT|2019-05-30 2019.41 2000.9216749140646 -7.712095507752426 18 | A/swine/California/A02478680/2019|1A.2-3-like|LAIV-Classical|LLLLPT|2019-09-17 2019.7 2001.5726720603977 -7.561511587488909 19 | A/swine/Nebraska/A02157974/2018|1A.2-3-like|LAIV-Classical|TLLLLT|2018-04-18 2018.3 2000.9213206073623 -7.2492253706956555 20 | A/swine/Iowa/A02271349/2018|1A.2-3-like|pdm|LLPLPP|2018-12-04 2018.92 2000.5947231843954 -7.644082649512495 21 | A/swine/South_Dakota/A02156993/2018|1A.2-3-like|LAIV-Classical|TLLLPT|2018-03-22 2018.22 2000.5949088211464 -7.352011924950756 22 | A/swine/Illinois/A02431144/2019|1A.2-3-like|LAIV-Classical|TLLTPT|2019-02-22 2019.14 2000.7567446451687 -7.668267427194577 23 | A/swine/Texas/A01785906/2019|1A.2-3-like|LAIV-Classical|LLLLPP|2019-01-23 2019.06 2000.5947903030597 -7.70245330994396 24 | A/swine/North_Carolina/A02245875/2021|1A.2-3-like|LAIV-Classical|TTTTPT|2021-01-21 2021.06 2000.5947785760068 -8.536724742535252 25 | A/swine/North_Carolina/A02479173/2020|1A.2-3-like|pdm|PPPLPP|2020-03-25 2020.23 2000.4329637109724 -8.258002491938507 26 | A/swine/Nebraska/A02257618/2018|1A.2-3-like|LAIV-Classical|LLLLPT|2018-09-21 2018.72 2000.9193604429593 -7.425238994061569 27 | A/swine/Iowa/A02254795/2018|1A.2-3-like|LAIV-Classical|LLLLPT|2018-07-30 2018.58 2001.0815498251486 -7.299185748781743 28 | 
# -*- coding: utf-8 -*-
from datetime import datetime
import re

from Bio import SeqIO
from dendropy import Tree, Node
from typing import Union, List


def get_median(l: List[float]) -> float:
    """Returns the median of a (possibly unsorted) list of numbers."""
    l = sorted(l)
    l_size = len(l)
    if l_size % 2 == 1:
        return l[l_size // 2]
    else:
        return (l[l_size // 2 - 1] + l[l_size // 2]) / 2


def compute_sampling_density(tree: Union[str, Tree]) -> float:
    """
    Reports the median edge length of the tree.
    :param tree: either a path to a newick tree or a dendropy Tree object.
    :raises ValueError: if 'tree' is neither of the above.
    """
    if isinstance(tree, str):
        tree: Tree = Tree.get(path=tree, schema='newick', preserve_underscores=True)
    elif not isinstance(tree, Tree):
        raise ValueError('"tree" should be either a path to a newick tree or a dendropy Tree object.')
    # Bug fix: skip edges without a length (e.g., the root edge has edge_length=None),
    # which previously made the median computation raise a TypeError on sorting.
    edge_lengths = [node.edge_length for node in tree.postorder_node_iter() if node.edge_length is not None]
    return get_median(edge_lengths)


def collapse_zero_branches(tree: Tree, threshold=1e-7):
    """Collapses internal edges shorter than 'threshold' (multifurcates the tree)."""
    tree.collapse_unweighted_edges(threshold)


def parse_dates(aln_path: str):
    """
    Extracts collection dates from the '|'-delimited sequence names in a FASTA file.
    :param aln_path: path to the FASTA alignment.
    :return: dict mapping sequence names to decimal-year dates; names without a
             parseable date token are silently skipped.
    """
    records = SeqIO.parse(aln_path, 'fasta')
    dates = {}
    for record in records:
        name = record.name
        date = None
        for token in name.split('|'):
            # A date token: >=4 chars of digits/dashes/slashes, but not a plain long
            # number (which is likely an accession id, not a date).
            if re.fullmatch(r'[\d\-/]{4,}', token) and not re.fullmatch(r'\d{5,}', token):
                # Robustness fix: a malformed token previously crashed the whole parse
                # with an uncaught ValueError; now it is skipped (consistent with treetime-root.py).
                try:
                    if token.count('/') == 2:
                        try:
                            parsed = datetime.strptime(token, '%m/%d/%Y')
                        except ValueError:
                            parsed = datetime.strptime(token, '%Y/%m/%d')
                    elif token.count('/') == 1:
                        parsed = datetime.strptime(token, '%m/%Y')
                    elif token.count('-') == 2:
                        parsed = datetime.strptime(token, '%Y-%m-%d')
                    elif token.count('-') == 1:
                        parsed = datetime.strptime(token, '%Y-%m')
                    else:
                        parsed = datetime.strptime(token, '%Y')
                except ValueError:
                    continue
                date = parsed  # The last parseable token wins (as before).
        if not date:
            # print(f'No date for {record.id}')
            # TODO: log with low priority level
            pass
        else:
            # Approximate decimal year (30-day months); mirrors treetime-root.py.
            dec_date = date.year + ((date.month - 1) * 30 + date.day) / 365.0
            dates[name] = dec_date
    return dates


# For a binary tree only
def sibling_distance(parent_node: Node) -> float:
    """Sum of the two child edge lengths below 'parent_node' (assumes a binary node)."""
    return parent_node.child_nodes()[0].edge_length + parent_node.child_nodes()[1].edge_length


# Siblings specified.
def sibling_distance_n2(sib1: Node, sib2: Node) -> float:
    """Distance between two nodes that share the same parent."""
    assert sib1.parent_node is sib2.parent_node
    return sib1.edge_length + sib2.edge_length


def aunt_distance(node: Node) -> float:
    """Distance from 'node' to its parent's sibling (assumes a binary tree)."""
    assert node.parent_node and node.parent_node.parent_node
    parent: Node = node.parent_node
    aunt: Node = parent.sibling_nodes()[0]
    return node.edge_length + parent.edge_length + aunt.edge_length


def node_distance(node1: Node, node2: Node) -> float:
    """
    Linear-time algorithm to find a distance between two nodes on the same tree.
    Note: with constant-time LCA computation, one can compute distance in constant time.
    """
    node1_depth = get_node_depth(node1)
    node2_depth = get_node_depth(node2)
    distance = 0
    p1, p2 = node1, node2
    # First walk the deeper node up until both pointers are at the same depth...
    if node1_depth > node2_depth:
        for step in range(node1_depth - node2_depth):
            distance += p1.edge_length
            p1 = p1.parent_node
    elif node2_depth > node1_depth:
        for step in range(node2_depth - node1_depth):
            distance += p2.edge_length
            p2 = p2.parent_node

    # ...then walk both up in lockstep until they meet at the LCA.
    while p1 != p2:
        distance += p1.edge_length
        distance += p2.edge_length
        p1 = p1.parent_node
        p2 = p2.parent_node
    return distance


def node_distance_w_lca(node1: Node, node2: Node, lca: Node) -> float:
    """Distance between two nodes when their lowest common ancestor is already known."""
    distance = 0
    while node1 is not None and node1 is not lca:
        distance += node1.edge_length
        node1 = node1.parent_node

    while node2 is not None and node2 is not lca:
        distance += node2.edge_length
        node2 = node2.parent_node

    # Both walks must have stopped at the lca (not at the root's parent).
    assert node1 and node2
    return distance


def get_node_depth(node: Node) -> int:
    """Number of edges between 'node' and the tree root."""
    depth = 0
    p: Node = node.parent_node
    while p:
        depth += 1
        p = p.parent_node
    return depth


def binarize_tree(tree: Tree, edge_length=0):
    """
    Adds/removes nodes from the tree to make it fully binary (added edges will have length 'edge_length')
    :param tree: Dendropy tree to be made bifurcating.
    """

    # First suppress unifurcations.
    tree.suppress_unifurcations()

    # Now binarize multifurcations.
    node: Node
    for node in tree.postorder_node_iter():
        if node.child_nodes() and len(node.child_nodes()) > 2:
            num_children = len(node.child_nodes())
            children = node.child_nodes()
            interim_node = node
            # Creates a caterpillar structure with children on the left of the trunk:
            for child_ind in range(len(children) - 2):
                new_node = Node(edge_length=edge_length)
                interim_node.set_child_nodes([children[child_ind], new_node])
                interim_node = new_node
            interim_node.set_child_nodes(children[num_children - 2:])
nodes from the tree to make it fully binary (added edges will have length 'edge_length') 135 | :param tree: Dendropy tree to be made bifurcating. 136 | """ 137 | 138 | # First suppress unifurcations. 139 | tree.suppress_unifurcations() 140 | 141 | # Now binarize multifurcations. 142 | node: Node 143 | for node in tree.postorder_node_iter(): 144 | if node.child_nodes() and len(node.child_nodes()) > 2: 145 | num_children = len(node.child_nodes()) 146 | children = node.child_nodes() 147 | interim_node = node 148 | # Creates a caterpillar structure with children on the left of the trunk: 149 | for child_ind in range(len(children) - 2): 150 | new_node = Node(edge_length=edge_length) 151 | interim_node.set_child_nodes([children[child_ind], new_node]) 152 | interim_node = new_node 153 | interim_node.set_child_nodes(children[num_children - 2:]) 154 | -------------------------------------------------------------------------------- /treesort/reassortment_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | import random as rnd 4 | from typing import List, Optional 5 | from dendropy import Tree, Node, Edge 6 | import numpy as np 7 | from scipy.optimize import minimize, LinearConstraint 8 | import warnings 9 | 10 | from treesort.tree_indexer import TreeIndexer 11 | from treesort.helpers import sibling_distance 12 | 13 | 14 | def compute_rea_rate_simple(annotated_tree: Tree, evol_rate: float, ignore_top_edges=1) -> float: 15 | """ 16 | A simpler way to compute the reassortment rate: the number of detected events divided by the total size of the tree (in years) 17 | :ignore_top_edges: the longest x percent of edges will not be counted for. 
18 | """ 19 | edge_cutoff = math.inf 20 | if ignore_top_edges > 0: 21 | edge_lengths = sorted([node.edge_length for node in annotated_tree.postorder_node_iter() if node.edge_length]) 22 | top_percentile = min(len(edge_lengths) - 1, int(round(len(edge_lengths) * (1.0 - ignore_top_edges / 100)))) 23 | edge_cutoff = edge_lengths[top_percentile] 24 | 25 | # Compute the number of reassortment events detected (a ?-only edge counts as 0.5). 26 | rea_events = 0 27 | for node in annotated_tree.postorder_node_iter(): 28 | if node is annotated_tree.seed_node: 29 | continue # Skip the root edge 30 | edge: Edge = node.edge 31 | if edge and node.edge_length >= edge_cutoff: 32 | continue # Skip the edge if its in the top percentile. 33 | if edge.annotations.get_value('is_reassorted', '0') == '1': 34 | rea_annotation = edge.annotations.get_value('rea').strip('"') 35 | is_uncertain = all([g_str.startswith('?') for g_str in rea_annotation.split(',')]) # Is this 100% uncertain reassortment? 36 | if not is_uncertain: 37 | rea_events += 1 38 | else: 39 | rea_events += 0.5 40 | 41 | # Compute the total tree length (phylogenetic diversity) 42 | tree_length = 0 43 | for node in annotated_tree.postorder_node_iter(): 44 | if node is not annotated_tree.seed_node: 45 | if node.edge_length and node.edge_length >= edge_cutoff: 46 | continue # Skip the edge if its in the top percentile. 
47 | tree_length += node.edge_length 48 | 49 | # print(f'{rea_events}, {tree_length}, {evol_rate}') 50 | rea_rate_per_lineage_per_year = (rea_events / tree_length * evol_rate) if tree_length > 0 else 0.0 51 | return rea_rate_per_lineage_per_year 52 | 53 | 54 | def likelihood_binary(x, rea_events, edge_lengths): 55 | func = 0 56 | if x < 1e-10: 57 | return np.inf 58 | for i in range(len(rea_events)): 59 | if edge_lengths[i] > 0: 60 | if rea_events[i] > 0: 61 | with warnings.catch_warnings(): 62 | warnings.filterwarnings('error') 63 | try: 64 | func -= np.log(1 - np.exp(-1 * x * edge_lengths[i])) 65 | except Warning: 66 | # print(x) 67 | func += np.inf 68 | else: 69 | func -= (-1 * x * edge_lengths[i]) 70 | elif rea_events[i] > 0: 71 | # print('+1') 72 | pass 73 | return func 74 | 75 | 76 | def compute_rea_rate_binary_mle(annotated_tree: Tree, evol_rate: float, ref_seg_len=1700) -> Optional[float]: 77 | rea_events = [] # reassortment events per branches (1 - at least one event, 0 - no events). 78 | edge_lengths = [] # Corresponding branch lengths (the two arrays are coupled). 79 | processed_uncertain = set() 80 | node: Node 81 | for node in annotated_tree.postorder_node_iter(): 82 | if node is annotated_tree.seed_node: 83 | continue # Skip the root edge 84 | is_uncertain = False 85 | edge: Edge = node.edge 86 | if edge.annotations.get_value('is_reassorted', '0') == '1': 87 | rea_annotation = edge.annotations.get_value('rea').strip('"') 88 | is_uncertain = all( 89 | [g_str.startswith('?') for g_str in rea_annotation.split(',')]) # Is this 100% uncertain reassortment? 90 | if not is_uncertain: # Uncertain branches are handled below. 91 | rea_events.append(1) 92 | else: 93 | rea_events.append(0) 94 | 95 | edge_length = node.edge_length 96 | if is_uncertain: 97 | # check if the sister edge was already processed. 
98 | siblings = node.parent_node.child_nodes() 99 | sibling = siblings[0] if siblings[0] is not Node else siblings[1] 100 | if sibling not in processed_uncertain: 101 | # log the event over the two sister branches 102 | rea_events.append(1) 103 | edge_length = sibling_distance(node.parent_node) 104 | processed_uncertain.add(node) 105 | else: 106 | continue # Skip if already processed. 107 | 108 | if edge_length > 1e-7: 109 | edge_lengths.append(edge_length / evol_rate) 110 | elif rea_events[-1] > 0: 111 | # If reassortment happened on too short of an edge, this can mess up the likelihood function. 112 | # Replace branch length with (1 / ref_seg_len), e.g., 1 / 1700 for HA (1 substitution). 113 | edge_lengths.append((1 / ref_seg_len) / evol_rate) 114 | else: 115 | edge_lengths.append(0) 116 | 117 | # print(len(rea_events), len(edge_lengths)) 118 | est = compute_rea_rate_simple(annotated_tree, evol_rate, ignore_top_edges=1) 119 | np_est = np.array([est]) 120 | linear_constraint = LinearConstraint([[1]], [0]) 121 | num_est = minimize(likelihood_binary, np_est, args=(rea_events, edge_lengths), tol=1e-9, 122 | constraints=[linear_constraint]) 123 | if num_est.success: 124 | return num_est.x[0] 125 | else: 126 | return None 127 | -------------------------------------------------------------------------------- /treesort/parsimony.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | from dendropy import Tree, Node, DnaCharacterMatrix 6 | from dendropy.model import parsimony 7 | import random as rnd 8 | 9 | # character_sets_annotation = 'character_sets' 10 | 11 | 12 | def compute_parsimony_edge_lengths(tree: Tree, aln_path: str) -> np.ndarray: 13 | """ 14 | Compute the parsimony score of the tree given an alignment and find associated edge-lengths. 15 | :param tree: Tree topology to be scored by parsimony. Must be BINARY. 
16 | :param aln_path: DNA alignment for the tips of the tree 17 | :return: A dictionary that specifies # of parsimony substitutions per node (except the root). 18 | """ 19 | tree_copy: Tree = tree.clone() 20 | taxon_characters: DnaCharacterMatrix = DnaCharacterMatrix.get_from_path(aln_path, schema='fasta', 21 | taxon_namespace=tree_copy.taxon_namespace) 22 | taxon_states = taxon_characters.taxon_state_sets_map(gaps_as_missing=True) 23 | p_score = parsimony.fitch_down_pass(tree_copy.postorder_node_iter(), taxon_state_sets_map=taxon_states) 24 | print(p_score) 25 | 26 | edge_lengths = np.zeros(len(tree.nodes()), dtype=int) 27 | p_score_2 = 0 28 | node: Node 29 | for node in tree_copy.preorder_node_iter(): 30 | edge_len = 0 31 | parent: Node = node.parent_node 32 | for site in range(taxon_characters.sequence_size): 33 | if parent: 34 | parent_state = parent.state_sets[site] 35 | if parent_state in node.state_sets[site]: 36 | node.state_sets[site] = parent_state 37 | continue 38 | else: 39 | edge_len += 1 40 | p_score_2 += 1 41 | # choose a random state and assign 42 | state_sets = list(node.state_sets[site]) 43 | rnd_state = rnd.choice(state_sets) 44 | node.state_sets[site] = rnd_state 45 | if parent: 46 | cluster = {leaf.taxon.label for leaf in node.leaf_nodes()} 47 | original_node = tree.find_node(filter_fn=lambda tree_node: {leaf.taxon.label for leaf in tree_node.leaf_nodes()} == cluster) 48 | edge_lengths[original_node.index] = edge_len 49 | print(p_score, p_score_2) 50 | return edge_lengths 51 | 52 | 53 | def compute_parsimony_sibling_dist(tree: Tree, aln_path: str, schema='fasta') -> Tuple[np.ndarray, np.ndarray, np.ndarray]: 54 | # tree must be binary! 
55 | tree_copy: Tree = tree.clone() 56 | taxon_characters: DnaCharacterMatrix = DnaCharacterMatrix.get_from_path(aln_path, schema=schema, 57 | taxon_namespace=tree_copy.taxon_namespace) 58 | taxon_states = taxon_characters.taxon_state_sets_map(gaps_as_missing=True) 59 | p_score = parsimony.fitch_down_pass(tree_copy.postorder_node_iter(), taxon_state_sets_map=taxon_states) 60 | # print(p_score) 61 | 62 | children_dists = np.zeros(len(tree.internal_nodes()), dtype=int) 63 | child1_to_sibling_dists, child2_to_sibling_dists = np.zeros(len(tree.internal_nodes()), dtype=int),\ 64 | np.zeros(len(tree.internal_nodes()), dtype=int) 65 | p_score_2 = 0 66 | node: Node 67 | for node in tree_copy.preorder_internal_node_iter(): 68 | children_dist = 0 69 | child1_to_sibling, child2_to_sibling = 0, 0 70 | sibling = node.sibling_nodes()[0] if node.parent_node else None 71 | if not sibling: 72 | child1_to_sibling, child2_to_sibling = -1, -1 73 | child1, child2 = node.child_nodes() 74 | for site in range(taxon_characters.sequence_size): 75 | if len(child1.state_sets[site].intersection(child2.state_sets[site])) == 0: 76 | children_dist += 1 77 | p_score_2 += 1 78 | if sibling and len(child1.state_sets[site].intersection(sibling.state_sets[site])) == 0: 79 | child1_to_sibling += 1 80 | if sibling and len(child2.state_sets[site].intersection(sibling.state_sets[site])) == 0: 81 | child2_to_sibling += 1 82 | # cluster = {leaf.taxon.label for leaf in node.leaf_nodes()} 83 | # original_node = tree.find_node(filter_fn=lambda tree_node: {leaf.taxon.label for leaf in tree_node.leaf_nodes()} == cluster) 84 | children_dists[node.index] = children_dist 85 | child1_to_sibling_dists[node.index] = child1_to_sibling 86 | child2_to_sibling_dists[node.index] = child2_to_sibling 87 | # print(p_score, p_score_2) 88 | # TODO: if p_score != p_score_2: log a warning (debug only) 89 | return children_dists, child1_to_sibling_dists, child2_to_sibling_dists 90 | 91 | 92 | def get_cluster_str(node: Node) -> 
str: 93 | return ';'.join(sorted([leaf.taxon.label for leaf in node.leaf_nodes()])) 94 | 95 | # 96 | # if __name__ == '__main__': 97 | # # seg1_tree_path = '../simulations/segs2/l1500/sim_250_10/sim_1.trueSeg1.tre' 98 | # # schema = 'nexus' 99 | # # seg1_path = '../simulations/segs2/l1500/sim_250_10/sim_1.seg1.alignment.fasta' 100 | # # seg2_path = '../simulations/segs2/l1500/sim_250_10/sim_1.seg2.alignment.fasta' 101 | # # simulated = True 102 | # seg1_tree_path = '../../gammas/HAs.fast.rooted.tre' 103 | # schema = 'newick' 104 | # seg1_path = '../../gammas/HAs_unique.aln' 105 | # seg2_path = '../../gammas/NAs_unique.aln' 106 | # simulated = False 107 | # na_ha_ratio = 1.057 108 | # 109 | # tree: Tree = Tree.get(path=seg1_tree_path, schema=schema, preserve_underscores=True) 110 | # binarize_tree(tree) # Randomly binarize. 111 | # tree_indexer = TreeIndexer(tree.taxon_namespace) 112 | # tree_indexer.index_tree(tree) 113 | # if simulated: 114 | # node: Node 115 | # for node in tree.nodes(): 116 | # if node.edge_length: 117 | # node.edge_length *= 0.00474 118 | # 119 | # # lengths_by_node_s1 = compute_parsimony_edge_lengths(tree, 'testdata/l1000_50_5/sim_1.seg4.alignment.fasta') 120 | # # lengths_by_node_s2 = compute_parsimony_edge_lengths(tree, 'testdata/l1000_50_5/sim_1.seg1.alignment.fasta') 121 | # child_dists_s1, child1_dists_s1, child2_dists_s1 = compute_parsimony_sibling_dist(tree, seg1_path) 122 | # child_dists_s2, child1_dists_s2, child2_dists_s2 = compute_parsimony_sibling_dist(tree, seg2_path) 123 | # seg2_aln = list(SeqIO.parse(seg2_path, format='fasta')) 124 | # seg2_len = len(seg2_aln[0]) 125 | # 126 | # node_by_index = {} 127 | # cluster_by_index = {} 128 | # for node in tree.postorder_node_iter(): 129 | # node_by_index[node.index] = node 130 | # cluster_by_index[node.index] = get_cluster_str(node) 131 | # 132 | # # s1_lengths = np.zeros(len(lengths_by_node_s1)) 133 | # # node: Node 134 | # # for node in tree.postorder_internal_node_iter(): 135 | # # 
cluster = [leaf.taxon.label for leaf in node.leaf_nodes()] 136 | # # # print(cluster, lengths_by_node_s1[node.index], lengths_by_node_s2[node.index]) 137 | # # print(node.index, sorted(cluster), 138 | # # f'c1 {node.child_nodes()[0].index}({child1_dists_s1[node.index]}, {child1_dists_s2[node.index]})', 139 | # # f'c2 {node.child_nodes()[1].index}({child2_dists_s1[node.index]}, {child2_dists_s2[node.index]})') 140 | # 141 | # outlier_detector = LMOutlierDetector(child_dists_s1, child_dists_s2) 142 | # outliers = [(ind, outlier_detector.get_residual(child_dists_s1[ind], child_dists_s2[ind])) 143 | # for ind in range(child_dists_s1.size) if 144 | # outlier_detector.is_outlier(child_dists_s1[ind], child_dists_s2[ind], iqd_mult=3)] 145 | # jc_outliers = [(node.index, -1) for node in tree.internal_nodes() if 146 | # is_jc_outlier(child_dists_s2[node.index], seg2_len, helpers.sibling_distance(node), 147 | # rate_ratio=na_ha_ratio, pvalue_threshold=0.001)] 148 | # jc_outlier_indices = [x[0] for x in jc_outliers] 149 | # outliers = sorted(outliers, key=lambda x: x[1], reverse=True) 150 | # print(len(outliers)) 151 | # print(len(jc_outliers)) 152 | # for outlier_ind, residual in jc_outliers: 153 | # outlier_node = node_by_index[outlier_ind] 154 | # is_c1_rea = outlier_detector.is_outlier(child1_dists_s1[outlier_ind], child1_dists_s2[outlier_ind], iqd_mult=1.7) 155 | # is_c2_rea = outlier_detector.is_outlier(child2_dists_s1[outlier_ind], child2_dists_s2[outlier_ind], iqd_mult=1.7) 156 | # print(outlier_ind, residual, is_c1_rea, cluster_by_index[outlier_node.child_nodes()[0].index], 157 | # is_c2_rea, cluster_by_index[outlier_node.child_nodes()[1].index]) 158 | # 159 | # jc_colors = ['red' if i in jc_outlier_indices else 'blue' for i in range(len(tree.internal_nodes()))] 160 | # plt.scatter(child_dists_s1, child_dists_s2, c=jc_colors) 161 | # for ind in range(len(child_dists_s1)): 162 | # plt.annotate(str(ind), (child_dists_s1[ind], child_dists_s2[ind] + 0.2)) 163 | # 
plt.show() 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TreeSort # 2 | 3 |  4 | 5 | The tool infers both recent and ancestral reassortment events along the branches of a phylogenetic tree of a fixed genomic segment. 6 | It uses a statistical hypothesis testing framework to identify branches where reassortment with other segments has occurred and reports these events. 7 | 8 | 11 | 12 | Below is an example of 2 reassortment events inferred by TreeSort on a swine H1 dataset. The reference phylogeny is the hemagglutinin (HA) segment tree, and the branch annotations indicate reassortment relative to the HA's evolutionary history. The annotations list the acquired gene segments and how distant these segments were (# of nucleotide differences) from the original segments. For example, `PB2(136)` indicates that a new PB2 was acquired that was approximately 136 nucleotides different from the pre-reassortment PB2. 13 | 14 | 15 |
17 |
116 |
125 |