├── src ├── gitver.txt ├── usage.cpp ├── build_linux_x86.bash ├── supercluster.cpp ├── murmur.h ├── stringsfromfile.cpp ├── ladderize.cpp ├── newick_token.h ├── rotate.cpp ├── splitter.h ├── subtree.cpp ├── quarts.h ├── consorder.h ├── newicklexer.h ├── relabeln.cpp ├── clustermaker.h ├── draw.cpp ├── intlabel.cpp ├── newick_token.cpp ├── svg.h ├── newicktree.h ├── tax.cpp ├── consmakerx.h ├── consmaker.h ├── rofos.cpp ├── relabelacc.cpp ├── relabelf.cpp ├── addrootlabel.cpp ├── gobuff.h ├── newick.sln ├── countsort.h ├── correl2.cpp ├── cmds.h ├── newick_main.cpp ├── topo.cpp ├── murmurhash.cpp ├── newickparser2.h ├── supermaker.h ├── rootbyhalves.cpp ├── getlabels.cpp ├── quarts.cpp ├── taxq2.cpp ├── rofo3.cpp ├── subset.cpp ├── deleteleaves.cpp ├── divconker.h ├── biparterx.h ├── cladogram.cpp ├── deletegroup.cpp ├── taxtable.cpp ├── lcalabel.cpp ├── biparter.h ├── rootbyoutgroupx.cpp ├── layout.h ├── syncft.cpp ├── conf.cpp ├── tax_usage.txt ├── relabel.cpp ├── getlcasx.cpp ├── getlcas.cpp ├── bootq.cpp ├── taxer.h ├── condense.cpp ├── testdeletesubtree.cpp ├── getlcasubtrees.cpp ├── svg.cpp ├── deleteoutgroup.cpp ├── featuretablefromtree.cpp ├── bestfitsubtree.cpp ├── myopts.h ├── syncftacc.cpp ├── consensusx.cpp ├── cluster2.cpp ├── diameter.cpp ├── findgroups.cpp ├── collapse.cpp ├── condensex.cpp ├── treesfromdata.cpp ├── newicklexer.cpp ├── supermaker.cpp ├── Makefile ├── clustermaker.cpp ├── cladeq.cpp ├── featuretable.h ├── usage.h ├── stats.cpp ├── colors.cpp ├── shrink.cpp ├── conf2.cpp ├── split.cpp ├── usage.txt ├── getcc.cpp ├── newicktree.cpp ├── tree2.h ├── fixft.cpp └── randtree.cpp ├── README.md └── .gitignore /src/gitver.txt: -------------------------------------------------------------------------------- 1 | "554d4cd" 2 | -------------------------------------------------------------------------------- /src/usage.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | 3 | void 
Usage(FILE *f) 4 | { 5 | PrintCopyright(stdout); 6 | fprintf(f, 7 | #include "usage.h" 8 | ); 9 | } 10 | -------------------------------------------------------------------------------- /src/build_linux_x86.bash: -------------------------------------------------------------------------------- 1 | curl -fsSL https://raw.githubusercontent.com/rcedgar/vcxproj_make/d27afda/vcxproj_make.py \ 2 | > vcxproj_make.py 3 | 4 | mkdir -p ../bin 5 | 6 | python3 ./vcxproj_make.py --openmp 7 | -------------------------------------------------------------------------------- /src/supercluster.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "supermaker.h" 3 | 4 | void cmd_supercluster() 5 | { 6 | //const string &InputFile = opt(supercluster); 7 | //SuperMaker SM; 8 | //SM.Load(InputFile); 9 | } 10 | -------------------------------------------------------------------------------- /src/murmur.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | static inline uint32_t murmur_32_scramble(uint32_t k) 4 | { 5 | k *= 0xcc9e2d51; 6 | k = (k << 15) | (k >> 17); 7 | k *= 0x1b873593; 8 | return k; 9 | } 10 | 11 | uint32_t murmur3_32(const byte* key, size_t len, uint32_t seed); 12 | -------------------------------------------------------------------------------- /src/stringsfromfile.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | 3 | void StringsFromFile(const string &FileName, vector &Strings) 4 | { 5 | Strings.clear(); 6 | FILE *f = OpenStdioFile(FileName); 7 | string Line; 8 | Progress("Reading %s...", FileName.c_str()); 9 | while (ReadLineStdioFile(f, Line)) 10 | Strings.push_back(Line); 11 | Progress("done.\n"); 12 | CloseStdioFile(f); 13 | } 14 | -------------------------------------------------------------------------------- /src/ladderize.cpp: 
--------------------------------------------------------------------------------
1 | #include "myutils.h"
2 | #include "treen.h"
3 | // Reads the trees in the -ladderize file, calls Ladderize(opt(right)) on each, and writes them all to -output.
4 | void cmd_ladderize()
5 | {
6 | 	vector Trees;
7 | 	TreesFromFile(opt(ladderize), Trees);
8 | 	const uint TreeCount = SIZE(Trees);
9 | 
10 | 	for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex)
11 | 	{
12 | 		TreeN &T = *Trees[TreeIndex];
13 | 		T.Ladderize(opt(right));
14 | 	}
15 | 
16 | 	TreesToFile(Trees, opt(output));
17 | }
18 | 


--------------------------------------------------------------------------------
/src/newick_token.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | // Kinds of token distinguished when reading Newick text.
3 | enum NEWICK_TOKEN_TYPE
4 | {
5 | 	NTT_Unknown,
6 | 	NTT_EOF,
7 | 
8 | 	// Returned from Tree::GetToken:
9 | 	NTT_Lparen,
10 | 	NTT_Rparen,
11 | 	NTT_Colon,
12 | 	NTT_Comma,
13 | 	NTT_Semicolon,
14 | 	NTT_String,
15 | 
16 | 	// Following are never returned from Tree::GetToken:
17 | 	NTT_SingleQuotedString,
18 | 	NTT_DoubleQuotedString,
19 | 	NTT_Comment
20 | };
21 | // Returns a printable name for NTT (implemented in newick_token.cpp).
22 | const char *NTTToStr(NEWICK_TOKEN_TYPE NTT);
23 | 


--------------------------------------------------------------------------------
/src/rotate.cpp:
--------------------------------------------------------------------------------
1 | #include "myutils.h"
2 | #include "tree2.h"
3 | // Calls Rotate() on node -node of the tree in the -rotate file and writes it to -output; dies without -node, on an unrooted tree, or on a leaf node.
4 | void cmd_rotate()
5 | {
6 | 	asserta(optset_output);
7 | 	Tree2 T;
8 | 	T.FromNewickFile(opt(rotate));
9 | 	if (!optset_node)
10 | 		Die("-node required");
11 | 	const uint Node = opt(node);
12 | 	if (!T.IsRooted())
13 | 		Die("Rooted tree required");
14 | 	if (T.IsLeaf(Node))
15 | 		Die("Cannot rotate leaf node");
16 | 
17 | 	T.Rotate(Node);
18 | 	T.ToNewickFile(opt(output));
19 | }
20 | 


--------------------------------------------------------------------------------
/src/splitter.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "tree2.h"
4 | // NOTE(review): presumably splits a Tree2 into SplitCount subtrees; only declarations are visible here - confirm against the implementation.
5 | class Splitter
6 | {
7 | public:
8 | 	const Tree2 *m_T = 0;
9 | 	uint m_SplitIndex = 0;
10 | 	uint m_SplitCount = 0;
11 | 	uint m_TargetSize = 0;
12 | 	vector m_SubtreeNodes;
13 | 
14 | public:
15 | 	void Run(const Tree2 &T, uint SplitCount);
16 | 	uint GetBiggestNode() const;
17 | 	void WriteLabels(const string &FileNamePrefix) const;
18 | 	void LogState() const;
19 | 	void GetSizeOrder(vector &Order) const;
20 | 	void GetSubtree(Tree2 &Subtree) const;
21 | };
22 | 


--------------------------------------------------------------------------------
/src/subtree.cpp:
--------------------------------------------------------------------------------
1 | #include "myutils.h"
2 | #include "tree2.h"
3 | // Not implemented: dies immediately. The earlier implementation is kept below as commented-out code.
4 | void cmd_subtree()
5 | {
6 | 	Die("Not implemented");
7 | 	//const string &InputFileName = opt(subtree);
8 | 	//if (!optset_node)
9 | 	// Die("-node required");
10 | 	//const uint NodeIndex = opt(node);
11 | 
12 | 	//Tree2 T;
13 | 	//T.FromFile(InputFileName);
14 | 
15 | 	//const vector &LeafNodeIndexes = T.GetLeafNodeIndexes(NodeIndex);
16 | 
17 | 	//Tree2 Subtree;
18 | 	//T.MakeSubset(LeafNodeIndexes, Subtree);
19 | 
20 | 	//Subtree.ToNewickFile(opt(output));
21 | }
22 | 


--------------------------------------------------------------------------------
/src/quarts.h:
--------------------------------------------------------------------------------
1 | #ifndef quarts_h
2 | #define quarts_h
3 | // Summary statistics filled in by GetQuarts(); field meanings (min, quartiles, median, max, total, average) are presumed from the names.
4 | struct Quarts
5 | {
6 | 	unsigned Min;
7 | 	unsigned LoQ;
8 | 	unsigned Med;
9 | 	unsigned HiQ;
10 | 	unsigned Max;
11 | 	unsigned Total;
12 | 	double Avg;
13 | };
14 | // Floating-point counterpart of Quarts with an additional StdDev field.
15 | struct QuartsDouble
16 | {
17 | 	double Min;
18 | 	double LoQ;
19 | 	double Med;
20 | 	double HiQ;
21 | 	double Max;
22 | 	double Total;
23 | 	double Avg;
24 | 	double StdDev;
25 | };
26 | // Two overloads: one fills a Quarts, the other a QuartsDouble, from the values in v.
27 | void GetQuarts(const vector &v, Quarts &Q);
28 | void GetQuarts(const vector &v, QuartsDouble &Q);
29 | 
30 | #endif // quarts_h
31 | 


--------------------------------------------------------------------------------
/src/consorder.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 3 | #include "treen.h" 4 | 5 | class ConsOrder 6 | { 7 | public: 8 | vector *> m_LabelSetVec; 9 | vector > m_Mx; 10 | vector m_Pending; 11 | uint m_TreeCount = UINT_MAX; 12 | vector m_Names; 13 | 14 | public: 15 | void AddTree(const TreeN &T); 16 | void InitMx(); 17 | void InitPending(); 18 | uint GetFirstOnly(uint i, uint j) const; 19 | void GetBestJoin(uint &i, uint &j) const; 20 | void DeleteFromPending(uint i, uint j); 21 | uint Join(uint i, uint j); 22 | const char *GetName(uint i) const; 23 | }; 24 | -------------------------------------------------------------------------------- /src/newicklexer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | class NewickLexer 4 | { 5 | public: 6 | const char *m_Data; 7 | uint m_DataPos; 8 | uint m_DataBytes; 9 | 10 | vector m_Tokens; 11 | 12 | public: 13 | void Clear() 14 | { 15 | m_Data = 0; 16 | m_DataPos = 0; 17 | m_DataBytes = 0; 18 | m_Tokens.clear(); 19 | } 20 | 21 | void FromStr(const string &Str); 22 | void FromData(const char *Data, uint DataBytes); 23 | char GetCharFailOnEof(); 24 | int GetChar(); 25 | void SkipWhite(); 26 | bool GetToken(string &Token); 27 | void LogTokens() const; 28 | void SplitTokens(vector > &TokensVec) const; 29 | }; 30 | -------------------------------------------------------------------------------- /src/relabeln.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | 4 | void cmd_relabeln() 5 | { 6 | Tree2 T; 7 | T.FromNewickFile(opt(relabeln)); 8 | 9 | string &Prefix = opt(label); 10 | if (Prefix == "") 11 | Prefix = "Lab"; 12 | const uint NodeCount = T.GetNodeCount(); 13 | for (uint NodeIndex = 0; NodeIndex < NodeCount; ++NodeIndex) 14 | { 15 | if (!T.IsLeaf(NodeIndex)) 16 | { 17 | T.m_Labels[NodeIndex] = ""; 18 | continue; 19 | } 20 | string NewLabel; 21 | Ps(NewLabel, "%s%u", Prefix.c_str(), NodeIndex); 22 | T.m_Labels[NodeIndex] = 
NewLabel; 23 | } 24 | 25 | T.ToNewickFile(opt(output)); 26 | } 27 | -------------------------------------------------------------------------------- /src/clustermaker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tree2.h" 4 | 5 | class ClusterMaker 6 | { 7 | public: 8 | const Tree2 *m_T = 0; 9 | double m_MaxDistFromFarthestLeaf = DBL_MAX; 10 | vector m_SubtreeNodes; 11 | vector > m_SubtreeLeafNodesVec; 12 | 13 | void Clear() 14 | { 15 | m_T = 0; 16 | m_MaxDistFromFarthestLeaf = DBL_MAX; 17 | m_SubtreeNodes.clear(); 18 | m_SubtreeLeafNodesVec.clear(); 19 | } 20 | 21 | public: 22 | void Run(const Tree2 &T, double MaxDistFromFarthestLeaf); 23 | void Validate() const; 24 | void ToTSV(const string &FileName) const; 25 | void ToNewick(const string &FileName) const; 26 | uint GetSubsetSize(uint MaxPerCluster) const; 27 | }; 28 | -------------------------------------------------------------------------------- /src/draw.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | #include "layout.h" 4 | #include "svg.h" 5 | 6 | void cmd_draw() 7 | { 8 | const string &FileName = opt(draw); 9 | double OffsetX = 0; 10 | double OffsetY = 0; 11 | if (optset_offsetx) 12 | OffsetX = opt(offsetx); 13 | if (optset_offsety) 14 | OffsetY = opt(offsety); 15 | 16 | Tree2 T; 17 | T.FromNewickFile(FileName); 18 | 19 | const uint NodeCount = T.GetNodeCount(); 20 | for (uint Node = 0; Node < NodeCount; ++Node) 21 | { 22 | if (!T.IsRoot(Node) && T.GetEdgeLengthToParent(Node) == MISSING_LENGTH) 23 | Die("Missing lengths"); 24 | } 25 | 26 | Layout Lay; 27 | Lay.m_OffsetX = OffsetX; 28 | Lay.m_OffsetY = OffsetY; 29 | Lay.Run(T); 30 | Lay.Render(opt(svg)); 31 | } 32 | -------------------------------------------------------------------------------- /src/intlabel.cpp: -------------------------------------------------------------------------------- 1 
| #include "myutils.h"
2 | #include "tree2.h"
3 | // Labels every internal node of the -intlabel tree _node_<index> (or blank with -delete_labels) and every unlabeled leaf _leaf_<index>, then writes to -output.
4 | void cmd_intlabel()
5 | {
6 | 	asserta(optset_output);
7 | 	Tree2 T;
8 | 	T.FromNewickFile(opt(intlabel));
9 | 
10 | 	const uint NodeCount = T.GetNodeCount();
11 | 	for (uint NodeIndex = 0; NodeIndex < NodeCount; ++NodeIndex)
12 | 	{
13 | 		if (T.IsLeaf(NodeIndex))
14 | 		{
15 | 			string Label;
16 | 			T.GetLabel(NodeIndex, Label);
17 | 			if (Label == "")
18 | 			{
19 | 				Warning("Empty leaf label");
20 | 				Ps(Label, "_leaf_%u", NodeIndex);
21 | 				T.m_Labels[NodeIndex] = Label;
22 | 			}
23 | 			continue;	// existing leaf labels are left untouched
24 | 		}
25 | 
26 | 		string NewLabel;	// stays empty when -delete_labels is set
27 | 		if (!opt(delete_labels))
28 | 			Ps(NewLabel, "_node_%u", NodeIndex);
29 | 		T.m_Labels[NodeIndex] = NewLabel;
30 | 	}
31 | 
32 | 	T.ToNewickFile(opt(output));
33 | }
34 | 


--------------------------------------------------------------------------------
/src/newick_token.cpp:
--------------------------------------------------------------------------------
1 | #include "myutils.h"
2 | #include "newick_token.h"
3 | 
4 | /***
5 | Tokens in Newick files are:
6 | 	( ) : , ;
7 | 	string
8 | 	'string'
9 | 	"string"
10 | 	[ comment ]
11 | 
12 | We can't safely distinguish between identifiers and floating point
13 | numbers at the lexical level (because identifiers may be numeric,
14 | or start with digits), so both edge lengths and identifiers are
15 | returned as strings.
16 | ***/ 17 | 18 | const char *NTTToStr(NEWICK_TOKEN_TYPE NTT) 19 | { 20 | switch (NTT) 21 | { 22 | #define c(x) case NTT_##x: return #x; 23 | c(Unknown) 24 | c(Lparen) 25 | c(Rparen) 26 | c(Colon) 27 | c(Comma) 28 | c(Semicolon) 29 | c(String) 30 | c(SingleQuotedString) 31 | c(DoubleQuotedString) 32 | c(Comment) 33 | #undef c 34 | } 35 | return "??"; 36 | } 37 | -------------------------------------------------------------------------------- /src/svg.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | class Svg 4 | { 5 | public: 6 | FILE *m_f = 0; 7 | double m_Width = 0; 8 | double m_Height = 0; 9 | 10 | public: 11 | void Open(const string &FileName, double Width, double Height); 12 | void Close(); 13 | 14 | void Line(double x1, double y1, double x2, double y2, 15 | double StrokeWidth, const string &Color); 16 | 17 | void Rect(double x, double y, double w, double h, 18 | double LineWidth, const string &LineColor, 19 | const string &FillColor); 20 | 21 | void Text(double x, double y, const string &FontFamily, 22 | double FontSize, const string &FontWeight, 23 | const string &FillColor, const string &TextAnchor, 24 | const string &Str); 25 | 26 | void Triangle(double x1, double y1, double x2, double y2, 27 | double x3, double y3, double LineWidth, 28 | const string &LineColor, const string &FillColor); 29 | }; 30 | -------------------------------------------------------------------------------- /src/newicktree.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | const double MISSING_LENGTH = DBL_MAX; 4 | 5 | class NewickTree 6 | { 7 | public: 8 | // There is always a root, even if unrooted. 
9 | uint m_Root; 10 | vector m_Parents; 11 | vector m_Labels; 12 | vector m_IsLeafs; 13 | vector m_Lengths; 14 | 15 | public: 16 | NewickTree() { Clear(); } 17 | 18 | void Clear() 19 | { 20 | m_Root = UINT_MAX; 21 | m_Parents.clear(); 22 | m_Labels.clear(); 23 | m_IsLeafs.clear(); 24 | m_Lengths.clear(); 25 | } 26 | uint GetNodeCount() const { return SIZE(m_Labels); } 27 | 28 | void LogMe() const; 29 | uint GetLeafCount() const; 30 | void Validate() const; 31 | uint GetParent(uint Node) const; 32 | const string &GetLabel(uint Node) const; 33 | double GetLength(uint Node) const; 34 | void GetNonParentEdges(vector > &Edges) const; 35 | bool IsBinary() const; 36 | bool HasBinaryRoot() const; 37 | }; 38 | -------------------------------------------------------------------------------- /src/tax.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "taxer.h" 3 | 4 | void cmd_tax() 5 | { 6 | const string &InputFileName = opt(tax); 7 | FILE *fTsv = CreateStdioFile(opt(tsvout)); 8 | FILE *fFev = CreateStdioFile(opt(fevout)); 9 | FILE *fOut = CreateStdioFile(opt(output)); 10 | 11 | uint K = 0; 12 | if (optset_kfold) 13 | K = opt(kfold); 14 | double MinTPFract = 0.5; 15 | if (optset_mintpfract) 16 | MinTPFract = opt(mintpfract); 17 | 18 | vector Trees; 19 | TreesFromFile(InputFileName, Trees); 20 | const uint TreeCount = SIZE(Trees); 21 | 22 | Taxer TheTaxer; 23 | 24 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 25 | { 26 | TreeX &T = *Trees[TreeIndex]; 27 | TheTaxer.Init(TreeIndex, T, K, MinTPFract); 28 | TheTaxer.ToFev(fFev); 29 | TheTaxer.ToTsv(fTsv); 30 | TheTaxer.ToNewick(fOut); 31 | } 32 | CloseStdioFile(fTsv); 33 | CloseStdioFile(fFev); 34 | CloseStdioFile(fOut); 35 | } 36 | -------------------------------------------------------------------------------- /src/consmakerx.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 
#include "treex.h"
4 | // NOTE(review): presumably builds a consensus tree (m_ConsTree) from two TreeX inputs; semantics are inferred from member names, the implementation is not visible here.
5 | class ConsMakerX
6 | {
7 | public:
8 | 	const TreeX *m_Tree1 = 0;
9 | 	const TreeX *m_Tree2 = 0;
10 | 	// Label sets, presumably partitioned as: in both trees / only in tree 1 / only in tree 2.
11 | 	set m_LabelsBoth;
12 | 	set m_Labels1Only;
13 | 	set m_Labels2Only;
14 | 
15 | 	set m_LeafNodes2Only;
16 | 
17 | 	TreeX m_ConsTree;
18 | 
19 | 	vector m_Pures1;
20 | 	vector m_Pures2;
21 | 
22 | 	uint m_NewSubtreeCount = 0;
23 | 	uint m_NewNodeCount = 0;
24 | 
25 | public:
26 | 	void MakeConsensus(const TreeX &Tree1, const TreeX &Tree2);
27 | 	void MakeConsensus_Disjoint();
28 | 	void AddSubtree(uint SubtreeNode2);
29 | 	void InsertSubtree(uint SubtreeNode2, uint NodeC);
30 | 	void ValidateConsensusLabels();
31 | 
32 | public:
33 | 	static void IntersectLabels(const TreeX &Tree1, const TreeX &Tree2,
34 | 		set &LabelsBoth, set &Labels1Only, set &Labels2Only,
35 | 		bool ErrorIfEmpty, bool ErrorIfDupe);
36 | };
37 | 


--------------------------------------------------------------------------------
/src/consmaker.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "treen.h"
4 | // TreeN counterpart of ConsMakerX with a similar member layout; NOTE(review): semantics are inferred from member names, the implementation is not visible here.
5 | class ConsMaker
6 | {
7 | public:
8 | 	const TreeN *m_Tree1 = 0;
9 | 	const TreeN *m_Tree2 = 0;
10 | 	// Label sets, presumably partitioned as: in both trees / only in tree 1 / only in tree 2.
11 | 	set m_LabelsBoth;
12 | 	set m_Labels1Only;
13 | 	set m_Labels2Only;
14 | 
15 | 	TreeN m_ConsTree;
16 | 
17 | 	vector m_Pures1;
18 | 	vector m_Pures2;
19 | 
20 | 	uint m_NewSubtreeCount = 0;
21 | 	uint m_NewNodeCount = 0;
22 | 
23 | public:
24 | 	void MakeConsensus(const TreeN &Tree1, const TreeN &Tree2);
25 | 	void MakeConsensus_Disjoint();
26 | 	void AddSubtree(uint SubtreeNode2);
27 | 	void InsertSubtree(uint SubtreeNode2, uint NodeC);
28 | 	void InsertSubtreeNode(const TreeN &T, uint NodeT, uint NodeC);
29 | 	void ValidateConsensusLabels();
30 | 
31 | public:
32 | 	static void IntersectLabels(const TreeN &Tree1, const TreeN &Tree2,
33 | 		set &LabelsBoth, set &Labels1Only, set &Labels2Only,
34 | 		bool ErrorIfEmpty, bool ErrorIfDupe);
35 | };
36 | 


--------------------------------------------------------------------------------
/src/rofos.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "biparter.h" 3 | #include "quarts.h" 4 | 5 | double RofoPair(const TreeN &T1, const TreeN &T2); 6 | void TreesFromFile(const string &FileName, vector &Trees); 7 | 8 | void cmd_rofos() 9 | { 10 | const string &FileName = opt(rofos); 11 | 12 | vector Trees; 13 | TreesFromFile(FileName, Trees); 14 | 15 | const uint TreeCount = SIZE(Trees); 16 | ProgressLog("%u trees\n", TreeCount); 17 | uint PairCount = (TreeCount*(TreeCount - 1))/2; 18 | uint PairIndex = 0; 19 | vector RFs; 20 | for (uint i = 0; i < TreeCount; ++i) 21 | { 22 | const TreeN &Ti = *Trees[i]; 23 | for (uint j = i + 1; j < TreeCount; ++j) 24 | { 25 | ProgressStep(PairIndex++, PairCount, "Comparing"); 26 | const TreeN &Tj = *Trees[j]; 27 | double RF = RofoPair(Ti, Tj); 28 | RFs.push_back(RF); 29 | } 30 | } 31 | 32 | QuartsDouble Q; 33 | GetQuarts(RFs, Q); 34 | ProgressLog("Median RF %.3g\n", Q.Med); 35 | } 36 | -------------------------------------------------------------------------------- /src/relabelacc.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | 4 | void GetAccFromLabel(const string &Label, string &Acc); 5 | void MakeAccMaps(const vector &Labels, 6 | map &LabelToAcc, 7 | map &AccToLabel); 8 | 9 | void cmd_relabelacc() 10 | { 11 | vector Trees; 12 | TreesFromFile(opt(relabelacc), Trees); 13 | const uint TreeCount = SIZE(Trees); 14 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 15 | { 16 | TreeN &T = *Trees[TreeIndex]; 17 | asserta(T.IsNormalized()); 18 | const uint NodeCount = T.GetNodeCount(); 19 | for (uint NodeIndex = 0; NodeIndex < NodeCount; ++NodeIndex) 20 | { 21 | if (!T.IsLeaf(NodeIndex)) 22 | continue; 23 | const string OldLabel = T.GetLabel(NodeIndex); 24 | if (OldLabel == "") 25 | continue; 26 | string Acc; 27 | GetAccFromLabel(OldLabel, Acc); 28 | 
T.UpdateLabel(NodeIndex, Acc); 29 | } 30 | } 31 | 32 | TreesToFile(Trees, opt(output)); 33 | } 34 | -------------------------------------------------------------------------------- /src/relabelf.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | #include "featuretable.h" 4 | 5 | void cmd_relabelf() 6 | { 7 | vector Trees; 8 | TreesFromFile(opt(relabelf), Trees); 9 | 10 | const uint TreeCount = SIZE(Trees); 11 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 12 | { 13 | TreeN &T = *Trees[TreeIndex]; 14 | asserta(T.IsNormalized()); 15 | 16 | FeatureTable FT; 17 | FT.FromFile(opt(features)); 18 | 19 | const uint NodeCount = T.GetNodeCount(); 20 | for (uint Node = 0; Node < NodeCount; ++Node) 21 | { 22 | if (!T.IsLeaf(Node)) 23 | continue; 24 | string Label = T.GetLabel(Node); 25 | if (opt(accs)) 26 | GetAccFromLabel(Label, Label); 27 | if (Label == "") 28 | continue; 29 | string Value; 30 | FT.GetValue_ByLabel(Label, Value, false); 31 | string NewLabel = Label; 32 | if (Value != "") 33 | NewLabel += "." 
+ Value; 34 | T.UpdateLabel(Node, NewLabel); 35 | } 36 | } 37 | TreesToFile(Trees, opt(output)); 38 | } 39 | -------------------------------------------------------------------------------- /src/addrootlabel.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | 4 | void MakeOutputFileName(const string &Pattern, string &FN, uint n); 5 | 6 | void cmd_addrootlabel() 7 | { 8 | const string &FileName = opt(addrootlabel); 9 | 10 | vector Trees; 11 | TreesFromFile(FileName, Trees); 12 | const uint TreeCount = SIZE(Trees); 13 | 14 | string Pattern = "Tree.@"; 15 | if (optset_pattern) 16 | Pattern = opt(pattern); 17 | 18 | FILE *f = CreateStdioFile(opt(output)); 19 | uint UnrootedCount = 0; 20 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 21 | { 22 | TreeN &T = *Trees[TreeIndex]; 23 | bool Rooted = false; 24 | bool Binary = T.IsBinary(Rooted); 25 | if (Rooted) 26 | ++UnrootedCount; 27 | else 28 | { 29 | string RootLabel; 30 | if (optset_label) 31 | RootLabel = opt(label); 32 | else 33 | MakeOutputFileName(opt(pattern), RootLabel, TreeIndex+1); 34 | T.UpdateLabel(T.m_Root, RootLabel); 35 | } 36 | T.ToNewickFile(f); 37 | } 38 | if (UnrootedCount > 0) 39 | Warning("%u unrooted, not changed", UnrootedCount); 40 | CloseStdioFile(f); 41 | } 42 | -------------------------------------------------------------------------------- /src/gobuff.h: -------------------------------------------------------------------------------- 1 | #ifndef gobuff_h 2 | #define gobuff_h 3 | 4 | #include "myutils.h" 5 | 6 | template class GoBuff 8 | { 9 | public: 10 | unsigned MaxSize; 11 | unsigned Size; 12 | T *Data; 13 | 14 | public: 15 | GoBuff() 16 | { 17 | MaxSize = 0; 18 | Size = 0; 19 | Data = 0; 20 | } 21 | 22 | ~GoBuff() { Free(); } 23 | 24 | void Free() 25 | { 26 | myfree(Data); 27 | Size = 0; 28 | Data = 0; 29 | } 30 | 31 | void Alloc(unsigned n) 32 | { 33 | if (n <= MaxSize) 34 | return; 35 | 36 
| unsigned NewMaxSize = n + SizeInc; 37 | T *NewBuffer = myalloc(T, NewMaxSize); 38 | if (Size > 0) 39 | { 40 | if (CopyOnGrow) 41 | memcpy(NewBuffer, Data, Size*sizeof(T)); 42 | myfree(Data); 43 | } 44 | if (ZeroOnGrow) 45 | memset(NewBuffer, 0, NewMaxSize*sizeof(T)); 46 | Data = NewBuffer; 47 | MaxSize = NewMaxSize; 48 | } 49 | 50 | unsigned GetMemUseBytes() const 51 | { 52 | return (MaxSize*sizeof(T)); 53 | } 54 | }; 55 | 56 | const unsigned GROW64K = 0x10000; 57 | 58 | #endif // gobuff_h 59 | -------------------------------------------------------------------------------- /src/newick.sln: -------------------------------------------------------------------------------- 1 | Microsoft Visual Studio Solution File, Format Version 12.00 2 | # Visual Studio 15 3 | VisualStudioVersion = 15.0.27004.2008 4 | MinimumVisualStudioVersion = 10.0.40219.1 5 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "newick", "newick.vcxproj", "{1D4F05A8-7B62-4EB6-B0DC-0CFE76E0224A}" 6 | EndProject 7 | Global 8 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 9 | Debug|x64 = Debug|x64 10 | Release|x64 = Release|x64 11 | EndGlobalSection 12 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 13 | {1D4F05A8-7B62-4EB6-B0DC-0CFE76E0224A}.Debug|x64.ActiveCfg = Debug|x64 14 | {1D4F05A8-7B62-4EB6-B0DC-0CFE76E0224A}.Debug|x64.Build.0 = Debug|x64 15 | {1D4F05A8-7B62-4EB6-B0DC-0CFE76E0224A}.Release|x64.ActiveCfg = Release|x64 16 | {1D4F05A8-7B62-4EB6-B0DC-0CFE76E0224A}.Release|x64.Build.0 = Release|x64 17 | EndGlobalSection 18 | GlobalSection(SolutionProperties) = preSolution 19 | HideSolutionNode = FALSE 20 | EndGlobalSection 21 | GlobalSection(ExtensibilityGlobals) = postSolution 22 | SolutionGuid = {774F4BCE-8E1C-44CA-BCE9-27E3300C42A2} 23 | EndGlobalSection 24 | EndGlobal 25 | -------------------------------------------------------------------------------- /src/countsort.h: -------------------------------------------------------------------------------- 1 | #ifndef 
countsort_h
2 | #define countsort_h
3 | 
4 | #include "gobuff.h"
5 | // Scratch memory passed to the CountSort* functions below: NVEC arrays of m_MaxValueCount unsigned values each, plus two GoBuff members.
6 | class CountSortMem
7 | {
8 | public:
9 | 	static const unsigned NVEC = 8;
10 | 
11 | public:
12 | 	unsigned *m_Vecs[NVEC];
13 | 	unsigned m_VecPos[NVEC];
14 | 	unsigned m_MaxValueCount;
15 | 
16 | 	GoBuff m_Sizes;
17 | 	GoBuff m_Offsets;
18 | 	// NOTE(review): the constructor leaves m_VecPos uninitialized, and there is no destructor, so m_Vecs is released only by an explicit Free().
19 | public:
20 | 	CountSortMem()
21 | 	{
22 | 		m_MaxValueCount = 0;
23 | 		zero(m_Vecs, NVEC);
24 | 	}
25 | 	// Releases all NVEC arrays and resets the capacity to zero.
26 | 	void Free()
27 | 	{
28 | 		for (unsigned i = 0; i < NVEC; ++i)
29 | 		{
30 | 			myfree(m_Vecs[i]);
31 | 			m_Vecs[i] = 0;
32 | 		}
33 | 		m_MaxValueCount = 0;
34 | 	}
35 | 	// Ensures each array can hold ValueCount values; growing frees and reallocates, so previous contents are discarded.
36 | 	void Alloc(unsigned ValueCount)
37 | 	{
38 | 		if (ValueCount <= m_MaxValueCount)
39 | 			return;
40 | 
41 | 		Free();
42 | 
43 | 		m_MaxValueCount = ValueCount;
44 | 		for (unsigned i = 0; i < NVEC; ++i)
45 | 			m_Vecs[i] = myalloc(unsigned, m_MaxValueCount);
46 | 	}
47 | };
48 | // NOTE(review): return values and exact ordering semantics are defined by the implementation, which is not visible here.
49 | unsigned CountSortOrderDesc(const unsigned *Values, unsigned ValueCount,
50 | 	CountSortMem &Mem, unsigned *Order);
51 | unsigned CountSortSubsetDesc(const unsigned *Values, unsigned ValueCount,
52 | 	CountSortMem &Mem, const unsigned *Subset, unsigned *Result);
53 | 
54 | #endif // countsort_h
55 | 


--------------------------------------------------------------------------------
/src/correl2.cpp:
--------------------------------------------------------------------------------
1 | #include "myutils.h"
2 | #include "treen.h"
3 | #include "biparter.h"
4 | 
5 | // Output tsv with confidences from equivalent bipartitions
6 | // in two trees with some/all leaves in common.
7 | void cmd_correl2() 8 | { 9 | const string &TreeFN1 = opt(correl2); 10 | const string &TreeFN2 = opt(tree2); 11 | 12 | FILE *fOut = CreateStdioFile(opt(output)); 13 | 14 | TreeN T1; 15 | TreeN T2; 16 | 17 | T1.FromNewickFile(TreeFN1); 18 | T2.FromNewickFile(TreeFN2); 19 | 20 | BiParter BP1; 21 | BiParter BP2; 22 | 23 | BP1.Init(T1); 24 | BP2.Init(T2); 25 | 26 | uint UniquePartCount2 = SIZE(BP2.m_UniquePartNodes); 27 | for (uint k = 0; k < UniquePartCount2; ++k) 28 | { 29 | uint Node2 = BP2.m_UniquePartNodes[k]; 30 | const vector &Part2 = BP2.m_PartVec[Node2]; 31 | uint Node1 = BP1.Search(Part2); 32 | if (Node1 != UINT_MAX) 33 | { 34 | const string &Label1 = T1.GetLabel(Node1); 35 | const string &Label2 = T2.GetLabel(Node2); 36 | if (Label1.empty() || Label2.empty()) 37 | continue; 38 | if (!IsValidFloatStr(Label1) || !IsValidFloatStr(Label2)) 39 | continue; 40 | Pf(fOut, "%s %s\n", Label1.c_str(), Label2.c_str()); 41 | } 42 | } 43 | 44 | CloseStdioFile(fOut); 45 | } 46 | -------------------------------------------------------------------------------- /src/cmds.h: -------------------------------------------------------------------------------- 1 | #ifndef A 2 | #error "A not defined" 3 | #endif 4 | 5 | A(version) 6 | A(subset) 7 | A(subsetacc) 8 | A(subsetnodes) 9 | A(subtree) 10 | A(stats) 11 | A(getlabels) 12 | A(getlabels_treeorder) 13 | A(rofos) 14 | A(rofo) 15 | A(relabel) 16 | A(relabeln) 17 | A(relabelf) 18 | A(relabelacc) 19 | A(syncft) 20 | A(syncftacc) 21 | A(intlabel) 22 | A(rootbyoutgroup) 23 | A(rootbyoutgroupx) 24 | A(test) 25 | A(tsv2newick) 26 | A(newick2tsv) 27 | A(ladderize) 28 | A(rotate) 29 | A(draw) 30 | A(drawf) 31 | A(drawfs) 32 | A(drawfsp) 33 | A(split) 34 | A(cluster) 35 | A(cluster2) 36 | A(rootbyhalves) 37 | A(clado) 38 | A(tax) 39 | A(taxq) 40 | A(taxqx) 41 | A(taxq2) 42 | A(cladeq) 43 | A(randtree) 44 | A(bootq) 45 | A(conf) 46 | A(biparts) 47 | A(condense) 48 | A(condensex) 49 | A(topo) 50 | A(fixft) 51 | A(deleteleaves) 52 | 
A(deletegroup) 53 | A(deleteoutgroup) 54 | A(collapse) 55 | A(lcalabel) 56 | A(conf2) 57 | A(confcmps) 58 | A(correl2) 59 | A(mono) 60 | A(bestfitsubtree) 61 | A(palette) 62 | A(tree2palette) 63 | A(rootdists) 64 | A(supercluster) 65 | A(consensus) 66 | A(consensus2) 67 | A(consensus2x) 68 | A(consorder) 69 | A(addrootlabel) 70 | A(testx) 71 | A(dq) 72 | A(olcs) 73 | A(testdeletesubtree) 74 | A(getlcasubtrees) 75 | 76 | #undef A 77 | -------------------------------------------------------------------------------- /src/newick_main.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | 3 | void Usage(FILE *f); 4 | 5 | int main(int argc, char **argv) 6 | { 7 | #ifdef _MSC_VER 8 | _putenv("TZ="); 9 | #endif 10 | setbuf(stdout, 0); 11 | setbuf(stderr, 0); 12 | 13 | if (argc <= 1) 14 | { 15 | Usage(stdout); 16 | return 0; 17 | } 18 | 19 | for (int i = 1; i < argc; ++i) 20 | { 21 | string s = string(argv[i]); 22 | if (s == "-h") 23 | { 24 | Usage(stdout); 25 | return 0; 26 | } 27 | } 28 | 29 | MyCmdLine(argc, argv); 30 | 31 | if (!opt(quiet)) 32 | { 33 | PrintProgramInfo(stdout); 34 | PrintCopyright(stdout); 35 | } 36 | 37 | SetLogFileName(opt(log)); 38 | LogProgramInfoAndCmdLine(); 39 | 40 | extern vector g_Argv; 41 | uint n = SIZE(g_Argv); 42 | asserta(n > 0); 43 | string ShortCmdLine = g_Argv[1]; 44 | if (n > 2) 45 | ShortCmdLine += " " + g_Argv[2]; 46 | 47 | ProgressPrefix(false); 48 | Progress("[%s]\n", ShortCmdLine.c_str() + 1); 49 | ProgressPrefix(true); 50 | 51 | CMD Cmd = GetCmd(); 52 | switch (Cmd) 53 | { 54 | #define A(x) case CMD_##x: { void cmd_##x(); cmd_##x(); break; } 55 | #include "cmds.h" 56 | default: 57 | asserta(false); 58 | } 59 | 60 | CheckUsedOpts(opt_log_used_opts); 61 | 62 | LogElapsedTimeAndRAM(); 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /src/topo.cpp: 
-------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | 4 | // delete lengths & internal node labels leaving 5 | // "topology-only" tree. 6 | void cmd_topo() 7 | { 8 | const string &InputFileName = opt(topo); 9 | const string &OutputFileName = opt(output); 10 | FILE *fTsv = CreateStdioFile(opt(tsvout)); 11 | 12 | vector Trees; 13 | TreesFromFile(InputFileName, Trees); 14 | uint TreeCount = SIZE(Trees); 15 | 16 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 17 | { 18 | ProgressStep(TreeIndex, TreeCount, "Processing"); 19 | TreeN &T = *Trees[TreeIndex]; 20 | 21 | string BeforeStr; 22 | T.ToNewickStr(BeforeStr, false); 23 | BeforeStr[SIZE(BeforeStr)-1] = 0; 24 | 25 | T.CollapseUnary(); 26 | T.Normalize(); 27 | const uint NodeCount = T.GetNodeCount(); 28 | for (uint Node = 0; Node < NodeCount; ++Node) 29 | { 30 | if (!T.IsLeaf(Node)) 31 | T.UpdateLabel(Node, ""); 32 | T.UpdateLength(Node, MISSING_LENGTH); 33 | } 34 | T.Ladderize(opt(right)); 35 | 36 | string AfterStr; 37 | T.ToNewickStr(AfterStr, false); 38 | AfterStr[SIZE(AfterStr)-1] = 0; 39 | if (fTsv != 0) 40 | fprintf(fTsv, "%s\t%s\n", AfterStr.c_str(), BeforeStr.c_str()); 41 | } 42 | 43 | TreesToFile(Trees, OutputFileName); 44 | CloseStdioFile(fTsv); 45 | } 46 | -------------------------------------------------------------------------------- /src/murmurhash.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "murmur.h" 3 | 4 | // https://en.wikipedia.org/wiki/MurmurHash 5 | 6 | uint32_t murmur3_32(const byte* key, size_t len, uint32_t seed) 7 | { 8 | uint32_t h = seed; 9 | uint32_t k; 10 | /* Read in groups of 4. */ 11 | for (size_t i = len >> 2; i; i--) 12 | { 13 | // Here is a source of differing results across endiannesses. 14 | // A swap here has no effects on hash properties though. 
15 | memcpy(&k, key, sizeof(uint32_t)); 16 | key += sizeof(uint32_t); 17 | h ^= murmur_32_scramble(k); 18 | h = (h << 13) | (h >> 19); 19 | h = h * 5 + 0xe6546b64; 20 | } 21 | 22 | /* Read the rest. */ 23 | k = 0; 24 | for (size_t i = len & 3; i; i--) 25 | { 26 | k <<= 8; 27 | k |= key[i - 1]; 28 | } 29 | 30 | // A swap is *not* necessary here because the preceding loop already 31 | // places the low bytes in the low places according to whatever endianness 32 | // we use. Swaps only apply when the memory is copied in a chunk. 33 | h ^= murmur_32_scramble(k); 34 | /* Finalize. */ 35 | h ^= len; 36 | h ^= h >> 16; 37 | h *= 0x85ebca6b; 38 | h ^= h >> 13; 39 | h *= 0xc2b2ae35; 40 | h ^= h >> 16; 41 | return h; 42 | } 43 | -------------------------------------------------------------------------------- /src/newickparser2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "newicklexer.h" 4 | #include "newicktree.h" 5 | 6 | class NewickParser2 : public NewickTree 7 | { 8 | public: 9 | NewickLexer m_Lexer; 10 | vector m_Tokens; 11 | vector m_Stack; 12 | uint m_TokenIndex = 0; 13 | bool m_TraceParse = false; 14 | 15 | public: 16 | NewickParser2() 17 | { 18 | m_TokenIndex = 0; 19 | m_TraceParse = opt(trace_parse); 20 | } 21 | 22 | void Clear() 23 | { 24 | m_Lexer.Clear(); 25 | m_Tokens.clear(); 26 | 27 | m_Labels.clear(); 28 | m_Parents.clear(); 29 | m_Lengths.clear(); 30 | m_IsLeafs.clear(); 31 | 32 | m_TokenIndex = 0; 33 | m_TraceParse = false; 34 | } 35 | 36 | void FromFile(FILE *f); 37 | void FromFile(const string &FileName); 38 | void FromCStr(const char *CStr); 39 | void FromData(const char *Data, uint DataBytes); 40 | void FromStr(const string &Str); 41 | void FromTokens(const vector &Tokens); 42 | void LogTokens() const; 43 | void GetNextToken(string &s) const; 44 | bool IsValidLabel(const string &s) const; 45 | void LogContext() const; 46 | void LogState() const; 47 | void GetLabelAndLength(string 
&Label, double &Length); 48 | void GetLength(double &Length, string &BS); 49 | void PopStack(); 50 | const string &GetNextToken() const; 51 | uint FixFT(); 52 | }; 53 | -------------------------------------------------------------------------------- /src/supermaker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "treen.h" 4 | 5 | class SuperMaker 6 | { 7 | public: 8 | uint m_MinLeafCount = 10; 9 | 10 | vector m_Trees; 11 | vector m_NodeIndexes; 12 | vector m_TreeIndexes; 13 | vector > m_LabelIndexSets; 14 | vector m_Scores; 15 | 16 | vector m_Labels; 17 | map m_LabelToIndex; 18 | vector > m_LabelIndexToTreeIndexes; 19 | vector > m_LabelIndexToLeafNodeIndexes; 20 | 21 | vector > m_RootDists; 22 | vector > m_LeafCounts; 23 | vector > m_Bootstraps; 24 | 25 | public: 26 | 27 | void Load(const string &FileName); 28 | const uint GetTreeCount() const { return SIZE(m_Trees); } 29 | void AddLabel(uint TreeIndex, uint Node, const string &Label); 30 | const vector &GetRootDists(uint TreeIndex) const; 31 | const vector &GetBootstraps(uint TreeIndex) const; 32 | const vector &GetLeafCounts(uint TreeIndex) const; 33 | double CalcScore(uint TreeIndex, uint NodeIndex) const; 34 | double CalcUpperLeafDist(uint TreeIndex, uint NodeIndex) const; 35 | double GetRootDist(uint TreeIndex, uint NodeIndex) const; 36 | uint GetLeafCount(uint TreeIndex, uint NodeIndex) const; 37 | double GetBootstrap(uint TreeIndex, uint NodeIndex) const; 38 | const TreeN &GetTree(uint TreeIndex) const; 39 | }; 40 | -------------------------------------------------------------------------------- /src/rootbyhalves.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | #include "featuretable.h" 4 | 5 | void RootByHalves(const Tree2 &UnrootedTree, Tree2 &RootedTree) 6 | { 7 | asserta(!UnrootedTree.IsRooted()); 8 | RootedTree.FromTree(UnrootedTree); 9 | 10 | 
const uint LeafCount = RootedTree.GetLeafCount(); 11 | const uint NodeCount = RootedTree.GetNodeCount(); 12 | asserta(LeafCount >= 2); 13 | 14 | const uint TargetLeafCount = LeafCount/2; 15 | asserta(TargetLeafCount > 0); 16 | 17 | uint BestNode = UINT_MAX; 18 | uint BestDiff = UINT_MAX; 19 | uint BestLeafCount = 0; 20 | for (uint Node = 0; Node < NodeCount; ++Node) 21 | { 22 | uint SubtreeLeafCount = RootedTree.GetSubtreeLeafCount(Node); 23 | uint Diff = (SubtreeLeafCount > TargetLeafCount ? 24 | SubtreeLeafCount - TargetLeafCount : 25 | TargetLeafCount - SubtreeLeafCount); 26 | if (BestNode == UINT_MAX || Diff < BestDiff) 27 | { 28 | BestNode = Node; 29 | BestDiff = Diff; 30 | BestLeafCount = SubtreeLeafCount; 31 | } 32 | } 33 | 34 | asserta(BestNode < NodeCount); 35 | uint NbrNode = RootedTree.m_Nbrs1[BestNode]; 36 | RootedTree.SetRoot(BestNode, NbrNode); 37 | } 38 | 39 | void cmd_rootbyhalves() 40 | { 41 | const string &InputFileName = opt(rootbyhalves); 42 | 43 | 44 | Tree2 T; 45 | T.FromNewickFile(InputFileName); 46 | 47 | Tree2 RootedTree; 48 | RootByHalves(T, RootedTree); 49 | RootedTree.Ladderize(opt(right)); 50 | RootedTree.ToNewickFile(opt(output)); 51 | } 52 | -------------------------------------------------------------------------------- /src/getlabels.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | 4 | void cmd_getlabels() 5 | { 6 | asserta(optset_output); 7 | Tree2 T; 8 | T.FromNewickFile(opt(getlabels)); 9 | FILE *f = CreateStdioFile(opt(output)); 10 | vector Labels; 11 | for (uint Node = 0; Node < T.GetNodeCount(); ++Node) 12 | { 13 | if (T.IsLeaf(Node)) 14 | { 15 | string Label; 16 | T.GetLabel(Node, Label); 17 | Labels.push_back(Label); 18 | } 19 | } 20 | sort(Labels.begin(), Labels.end()); 21 | for (uint i = 0; i < SIZE(Labels); ++i) 22 | { 23 | const char *Label = Labels[i].c_str(); 24 | fprintf(f, "%s\n", Label); 25 | } 26 | CloseStdioFile(f); 27 | } 28 
| 29 | static FILE *g_fOut; 30 | 31 | static void OnNode(const Tree2 &T, uint Node) 32 | { 33 | if (T.IsLeaf(Node)) 34 | { 35 | string Label; 36 | T.GetLabel(Node, Label); 37 | Pf(g_fOut, "%s\n", Label.c_str()); 38 | } 39 | } 40 | 41 | void cmd_getlabels_treeorder() 42 | { 43 | asserta(optset_output); 44 | Tree2 T; 45 | T.FromNewickFile(opt(getlabels_treeorder)); 46 | asserta(T.IsRooted()); 47 | uint Root = T.GetRoot(); 48 | 49 | string Order = "in"; // in pre post 50 | if (optset_order) 51 | Order = opt(order); 52 | 53 | g_fOut = CreateStdioFile(opt(output)); 54 | if (Order == "in") 55 | T.Inorder(Root, OnNode); 56 | else if (Order == "pre") 57 | T.Preorder(Root, OnNode); 58 | else if (Order == "post") 59 | T.Postorder(Root, OnNode); 60 | else 61 | asserta(false); 62 | 63 | CloseStdioFile(g_fOut); 64 | } 65 | -------------------------------------------------------------------------------- /src/quarts.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "sort.h" 3 | #include "quarts.h" 4 | 5 | void GetQuarts(const vector &v, Quarts &Q) 6 | { 7 | const unsigned N = SIZE(v); 8 | Q.Min = 0; 9 | Q.LoQ = 0; 10 | Q.Med = 0; 11 | Q.HiQ = 0; 12 | Q.Max = 0; 13 | Q.Total = 0; 14 | Q.Avg = 0.0; 15 | if (N == 0) 16 | return; 17 | 18 | vector v2 = v; 19 | unsigned *vs = v2.data(); 20 | QuickSortInPlace(vs, N); 21 | 22 | for (unsigned i = 0; i < N; ++i) 23 | Q.Total += vs[i]; 24 | 25 | Q.Min = vs[0]; 26 | Q.LoQ = vs[N/4]; 27 | Q.Med = vs[N/2]; 28 | Q.HiQ = vs[(3*N)/4]; 29 | Q.Max = vs[N-1]; 30 | Q.Avg = double(Q.Total)/N; 31 | } 32 | 33 | void GetQuarts(const vector &v, QuartsDouble &Q) 34 | { 35 | const unsigned N = SIZE(v); 36 | Q.Min = 0.0f; 37 | Q.LoQ = 0.0f; 38 | Q.Med = 0.0f; 39 | Q.HiQ = 0.0f; 40 | Q.Max = 0.0f; 41 | Q.Total = 0.0f; 42 | Q.Avg = 0.0f; 43 | if (N == 0) 44 | return; 45 | 46 | vector v2 = v; 47 | double *vs = v2.data(); 48 | QuickSortInPlace(vs, N); 49 | 50 | for (unsigned i = 0; i < N; 
++i) 51 | Q.Total += vs[i]; 52 | 53 | double Mean = double(Q.Total)/N; 54 | double Sumd = 0.0f; 55 | for (unsigned i = 0; i < N; ++i) 56 | { 57 | double x = vs[i]; 58 | double d = (x - Mean)*(x - Mean); 59 | Sumd += d; 60 | } 61 | double StdDev = (double) sqrt(Sumd/N); 62 | 63 | Q.Min = vs[0]; 64 | Q.LoQ = vs[N/4]; 65 | Q.Med = vs[N/2]; 66 | Q.HiQ = vs[(3*N)/4]; 67 | Q.Max = vs[N-1]; 68 | Q.Avg = Mean; 69 | Q.StdDev = StdDev; 70 | } 71 | -------------------------------------------------------------------------------- /src/taxq2.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | void GetFractConfs(const TreeN &T, vector &Confs); 6 | 7 | double GetNodeToAnyConflict(FILE *fTsv, const TreeN &T, 8 | const vector &FractConfs, 9 | const FeatureTable &FT, vector &NodeToAnyConflict); 10 | 11 | void SetFeatureTable(const TreeN &T, FeatureTable &FT); 12 | 13 | void cmd_taxq2() 14 | { 15 | const string &TreeFileName = opt(taxq2); 16 | const string &OutputFileName = opt(output); 17 | FILE *fTsv = CreateStdioFile(OutputFileName); 18 | 19 | TreeN T; 20 | T.FromNewickFile(TreeFileName); 21 | asserta(T.IsNormalized()); 22 | const uint NodeCount = T.GetNodeCount(); 23 | asserta(T.IsNormalized()); 24 | 25 | FeatureTable FT; 26 | SetFeatureTable(T, FT); 27 | 28 | vector FractConfs; 29 | GetFractConfs(T, FractConfs); 30 | asserta(SIZE(FractConfs) == NodeCount); 31 | 32 | vector NodeToAnyConflict; 33 | double Q = GetNodeToAnyConflict(fTsv, T, FractConfs, FT, NodeToAnyConflict); 34 | for (uint Node = 0; Node < NodeCount; ++Node) 35 | { 36 | if (T.IsLeaf(Node) || T.IsRoot(Node)) 37 | continue; 38 | 39 | double FractConf = FractConfs[Node]; 40 | bool AnyConflict = NodeToAnyConflict[Node]; 41 | Pf(fTsv, "eval"); 42 | Pf(fTsv, "\tnode=%u", Node); 43 | if (FractConf < 0) 44 | Pf(fTsv, "\tconf=."); 45 | else 46 | Pf(fTsv, "\tconf=%.3f", FractConf); 47 | Pf(fTsv, 
"\tconflict=%c", yon(AnyConflict)); 48 | Pf(fTsv, "\n"); 49 | } 50 | 51 | CloseStdioFile(fTsv); 52 | } 53 | -------------------------------------------------------------------------------- /src/rofo3.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "biparter.h" 3 | 4 | double RofoPair(const TreeN &T1, const TreeN &T2) 5 | { 6 | uint LeafCount = T1.GetLeafCount(); 7 | 8 | if (T2.GetLeafCount() != LeafCount) 9 | Die("Different leaf counts"); 10 | 11 | BiParter BP1; 12 | BiParter BP2; 13 | 14 | BP1.Init(T1); 15 | BP2.Init(T2); 16 | 17 | #if TRACE 18 | Log("_____________________1____________________\n"); 19 | BP1.LogMe(); 20 | 21 | Log("\n"); 22 | Log("_____________________2____________________\n"); 23 | BP2.LogMe(); 24 | #endif 25 | 26 | uint SameCount = 0; 27 | uint DiffCount = 0; 28 | uint UniquePartCount2 = SIZE(BP2.m_UniquePartNodes); 29 | for (uint k = 0; k < UniquePartCount2; ++k) 30 | { 31 | uint Node2 = BP2.m_UniquePartNodes[k]; 32 | const vector &Part2 = BP2.m_PartVec[Node2]; 33 | uint Node1 = BP1.Search(Part2); 34 | if (Node1 == UINT_MAX) 35 | { 36 | ++DiffCount; 37 | #if TRACE 38 | Log("Node2 %u (not found)\n", Node2, Node1); 39 | #endif 40 | } 41 | else 42 | { 43 | #if TRACE 44 | Log("Node2 %u == %u\n", Node2, Node1); 45 | #endif 46 | ++SameCount; 47 | } 48 | } 49 | double RF = double(DiffCount)/UniquePartCount2; 50 | ProgressLog("RF = %u / %u, %.4f\n", DiffCount, UniquePartCount2, RF); 51 | return RF; 52 | } 53 | 54 | void cmd_rofo() 55 | { 56 | const string &FileName1 = opt(rofo); 57 | const string &FileName2 = opt(tree2); 58 | 59 | TreeN T1; 60 | TreeN T2; 61 | T1.FromNewickFile(FileName1); 62 | T2.FromNewickFile(FileName2); 63 | 64 | RofoPair(T1, T2); 65 | } 66 | -------------------------------------------------------------------------------- /src/subset.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include 
"treen.h" 3 | 4 | void StringsFromFile(const string &FileName, vector &Strings); 5 | 6 | static void Subset(const string &InputFileName, bool UseAccs) 7 | { 8 | const string &LabelsFileName = opt(labels); 9 | 10 | vector Labels; 11 | StringsFromFile(LabelsFileName, Labels); 12 | 13 | TreeN T; 14 | T.FromNewickFile(InputFileName); 15 | 16 | set LeafNodeIndexes; 17 | const uint LabelCount = SIZE(Labels); 18 | vector NotLeaves; 19 | for (uint i = 0; i < LabelCount; ++i) 20 | { 21 | const string &Label = Labels[i]; 22 | uint NodeIndex = UINT_MAX; 23 | if (UseAccs) 24 | { 25 | string Acc; 26 | GetAccFromLabel(Label, Acc); 27 | NodeIndex = T.GetNodeByAcc(Acc, false); 28 | if (NodeIndex == UINT_MAX) 29 | Log("Not found >%s (acc=%s)\n", 30 | Label.c_str(), Acc.c_str()); 31 | } 32 | else 33 | NodeIndex = T.GetNodeByLabel(Label, false); 34 | if (NodeIndex == UINT_MAX || !T.IsLeaf(NodeIndex)) 35 | { 36 | NotLeaves.push_back(Label); 37 | continue; 38 | } 39 | LeafNodeIndexes.insert(NodeIndex); 40 | } 41 | 42 | if (!NotLeaves.empty()) 43 | Warning("%u / %u labels not found", 44 | SIZE(NotLeaves), SIZE(Labels)); 45 | 46 | T.Subset(LeafNodeIndexes); 47 | T.CollapseConfidenceUnary(); 48 | T.CollapseUnary(); 49 | Progress("Writing output... 
"); 50 | T.ToNewickFile(opt(output)); 51 | Progress("done.\n"); 52 | } 53 | 54 | void cmd_subset() 55 | { 56 | Subset(opt(subset), false); 57 | } 58 | 59 | void cmd_subsetacc() 60 | { 61 | Subset(opt(subsetacc), true); 62 | } 63 | -------------------------------------------------------------------------------- /src/deleteleaves.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | #include "tree2.h" 4 | 5 | void StringsFromFile(const string &FileName, vector &Strings); 6 | 7 | void cmd_deleteleaves() 8 | { 9 | Die("doesn't work internal nodes become leaves"); 10 | const string &InputFileName = opt(deleteleaves); 11 | 12 | vector Labels; 13 | if (optset_labels) 14 | { 15 | asserta(!optset_label); 16 | StringsFromFile(opt(labels), Labels); 17 | } 18 | else if (optset_label) 19 | { 20 | asserta(!optset_labels); 21 | Labels.push_back(opt(label)); 22 | } 23 | else 24 | Die("Must set -label or -labels"); 25 | const uint LabelCount = SIZE(Labels); 26 | asserta(LabelCount > 0); 27 | 28 | FILE *fOut = CreateStdioFile(opt(output)); 29 | 30 | vector Trees; 31 | TreesFromFile(InputFileName, Trees); 32 | const uint TreeCount = SIZE(Trees); 33 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 34 | { 35 | TreeN T = *Trees[TreeIndex]; 36 | for (uint i = 0; i < SIZE(Labels); ++i) 37 | { 38 | const string &Label = Labels[i]; 39 | uint Node = (opt(accs) ? 
T.GetNodeByAcc(Label, false) 40 | : T.GetNodeByLabel(Label, false)); 41 | if (Node == UINT_MAX) 42 | { 43 | //Log("Tree %u, not found >%s\n", TreeIndex, Label.c_str()); 44 | continue; 45 | } 46 | if (!T.IsLeaf(Node)) 47 | { 48 | Warning("Not leaf >%s", Label.c_str()); 49 | continue; 50 | } 51 | T.DeleteLeaf(Node); 52 | } 53 | 54 | T.CollapseUnary(); 55 | T.FixEmptyLeafLabels(); 56 | T.SetDerived(); 57 | T.ToNewickFile(fOut, false); 58 | } 59 | CloseStdioFile(fOut); 60 | } 61 | -------------------------------------------------------------------------------- /src/divconker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "treex.h" 4 | #include "featuretable.h" 5 | 6 | class DivConker 7 | { 8 | public: 9 | vector m_Trees; 10 | map m_LabelToIndex; 11 | map, uint> m_EdgeToCount; 12 | vector m_SelectedFroms; 13 | vector m_SelectedTos; 14 | vector m_Labels; 15 | uint m_LabelCount = 0; 16 | double m_MinTPFract = 0.5; 17 | double m_MinMedianLeafDist = 1.5; 18 | double m_MaxMedianLeafDist = 2.3; 19 | double m_MinBootstrapFract = 0.85; 20 | uint m_MinClusterSize = 8; 21 | double m_MaxPct = 95; 22 | uint m_MaxClusters = 50; 23 | 24 | vector > m_CCs; 25 | vector m_SortedCCIndexes; 26 | vector m_SortedCCSizes; 27 | 28 | public: 29 | void Reset() 30 | { 31 | m_Trees.clear(); 32 | m_LabelToIndex.clear(); 33 | m_EdgeToCount.clear(); 34 | m_SelectedFroms.clear(); 35 | m_SelectedTos.clear(); 36 | m_Labels.clear(); 37 | m_CCs.clear(); 38 | m_SortedCCIndexes.clear(); 39 | m_SortedCCSizes.clear(); 40 | } 41 | 42 | void Run(const vector &Trees); 43 | uint GetTreeCount() const { return SIZE(m_Trees); } 44 | uint GetLabelIndex(const string &Label, bool Init); 45 | const string &GetLabel(uint LabelIndex) const; 46 | double GetNodeScore(const TreeX &T, uint Node) const; 47 | void AddTreeToIndex(const TreeX &T, uint TreeIndex); 48 | void AddEdges(const vector &Labels); 49 | void AddTree(uint TreeIndex); 50 | void 
SelectEdges(); 51 | void MakeCCs(); 52 | void CCsToTsv(FILE *f) const; 53 | void CCToTsv(FILE *f, uint k, uint CCindex) const; 54 | void GetCCLabelVec(vector > &LabelVec) const; 55 | }; 56 | -------------------------------------------------------------------------------- /src/biparterx.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "treex.h" 4 | #include 5 | #include 6 | 7 | class BiParterX 8 | { 9 | public: 10 | TreeX m_T; 11 | 12 | // Leaf labels sorted alphabetically, this so that 13 | // any tree with same set of labels will have 14 | // same label indexes. 15 | vector m_Labels; 16 | map m_LabelToIndex; 17 | vector m_NodeToLabelIndex; 18 | vector m_LabelIndexToNode; 19 | 20 | // PartVec[Node] is boolean vector. 21 | // Vector has one true/false for each label index. 22 | // Labels are indexed as above. 23 | // True/false is disambiguated by requiring that first 24 | // entry is always false. 25 | vector > m_PartVec; 26 | vector m_UniquePartNodes; 27 | 28 | // One vector of node indexes for every entry 29 | // in m_UniquePartNodes. 30 | vector > m_NodeToDupeNodes; 31 | 32 | // Hash table entries are node indexes. 33 | // Collisions should be rare. 
34 | vector > m_HashTable; 35 | 36 | public: 37 | void Init(const TreeX &T); 38 | void LogMe() const; 39 | void LogPatterns() const; 40 | uint Search(const vector &Part) const; 41 | void SearchBestMatch(const vector &Labels, 42 | uint &Node, vector &MissingLabels, uint &OtherCount) const; 43 | void GetPartInternalNodeLabels(uint PartIndex, vector &Labels) const; 44 | void GetPartInternalNodes(uint PartIndex, vector &Nodes) const; 45 | void ToTSV(const string &Name, FILE *f) const; 46 | uint GetLabelIndex(const string &Label) const; 47 | uint GetOtherCount(const vector &Part, const vector &Query) const; 48 | 49 | private: 50 | void SetHashTable(); 51 | uint GetHash(const vector &Part) const; 52 | uint GetHash(uint Node) const; 53 | }; 54 | -------------------------------------------------------------------------------- /src/cladogram.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | 4 | uint TreeN::GetNodeCountToFurthestLeaf(uint Node) const 5 | { 6 | if (IsLeaf(Node)) 7 | return 1; 8 | uint Left = GetLeft(Node); 9 | uint Right = GetRight(Node); 10 | uint n = 1 + max(GetNodeCountToFurthestLeaf(Left), 11 | GetNodeCountToFurthestLeaf(Right)); 12 | return n; 13 | } 14 | 15 | static void SetL(TreeN &T, uint Node) 16 | { 17 | if (T.IsRoot(Node)) 18 | return; 19 | 20 | if (opt(unitlengths)) 21 | { 22 | T.m_NodeToLength[Node] = 1; 23 | return; 24 | } 25 | 26 | uint Parent = T.GetParent(Node); 27 | uint NodeN = T.GetNodeCountToFurthestLeaf(Node); 28 | uint ParentN = T.GetNodeCountToFurthestLeaf(Parent); 29 | asserta(NodeN < ParentN); 30 | uint d = ParentN - NodeN; 31 | T.m_NodeToLength[Node] = d; 32 | } 33 | 34 | static void ConvertToCladogram(TreeN &T) 35 | { 36 | asserta(T.IsNormalized()); 37 | T.m_NodeToLength.clear(); 38 | const uint NodeCount = T.GetNodeCount(); 39 | for (uint Node = 0; Node < NodeCount; ++Node) 40 | { 41 | if (T.IsRoot(Node)) 42 | { 43 | T.m_NodeToLength[Node] = 
MISSING_LENGTH; 44 | continue; 45 | } 46 | 47 | SetL(T, Node); 48 | } 49 | } 50 | 51 | void cmd_clado() 52 | { 53 | const string &InputFileName = opt(clado); 54 | const string &OutputFileName = opt(output); 55 | FILE *f = CreateStdioFile(OutputFileName); 56 | 57 | vector Trees; 58 | TreesFromFile(InputFileName, Trees); 59 | uint TreeCount = SIZE(Trees); 60 | 61 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 62 | { 63 | ProgressStep(TreeIndex, TreeCount, "Processing"); 64 | TreeN &T = *Trees[TreeIndex]; 65 | ConvertToCladogram(T); 66 | T.ToNewickFile(f, false); 67 | } 68 | 69 | CloseStdioFile(f); 70 | } 71 | -------------------------------------------------------------------------------- /src/deletegroup.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | void StringsFromFile(const string &FileName, vector &Strings); 6 | void TreesFromFile(const string &FileName, vector &Trees); 7 | void TreesToFile(const vector &Trees, const string &FileName); 8 | 9 | void cmd_deletegroup() 10 | { 11 | const string &InputFileName = opt(deletegroup); 12 | 13 | vector Trees; 14 | TreesFromFile(InputFileName, Trees); 15 | const uint TreeCount = SIZE(Trees); 16 | 17 | vector GroupLabelsVec; 18 | set GroupLabels; 19 | string GroupName; 20 | string LabelSubstr; 21 | 22 | if (optset_labels) 23 | { 24 | StringsFromFile(opt(labels), GroupLabelsVec); 25 | for (uint i = 0; i < SIZE(GroupLabelsVec); ++i) 26 | GroupLabels.insert(GroupLabelsVec[i]); 27 | } 28 | else if (optset_label) 29 | GroupName = opt(label); 30 | else 31 | Die("Must specify -labels, -label or -label_substr"); 32 | 33 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 34 | { 35 | TreeN &T = *Trees[TreeIndex]; 36 | 37 | vector LeafNodes; 38 | T.GetLeafNodes(LeafNodes); 39 | const uint LeafCount = SIZE(LeafNodes); 40 | 41 | set GroupLeafNodes; 42 | if (optset_labels) 43 | 
// Extract the name recorded for one rank in a taxonomy string such as
//   superkingdom:Bacteria,clade:Terrabacteria group,phylum:Actinobacteria,class:Actin...
//
// TaxStr    comma-separated list of "rank:name" fields.
// RankName  rank to look up, without the colon.
// Name      receives the text between "RankName:" and the next comma (or the
//           end of the string); empty if the rank is not present.
//
// The match is anchored to the start of a field: "RankName:" must be at the
// beginning of TaxStr or follow a comma (optionally with blanks in between).
// An unanchored search would find "class:" inside "superclass:" or
// "kingdom:" inside "superkingdom:" and return the wrong taxon.
static void GetTaxName(const std::string &TaxStr, const std::string &RankName,
  std::string &Name)
{
	Name.clear();
	const std::string RankNameColon = RankName + ":";

	std::size_t Pos = TaxStr.find(RankNameColon);
	for (; Pos != std::string::npos; Pos = TaxStr.find(RankNameColon, Pos + 1))
	{
		// Walk back over blanks; the hit is a field start only if we then
		// reach the beginning of the string or a comma.
		std::size_t k = Pos;
		while (k > 0 && (TaxStr[k-1] == ' ' || TaxStr[k-1] == '\t'))
			--k;
		if (k == 0 || TaxStr[k-1] == ',')
			break;
	}
	if (Pos == std::string::npos)
		return;

	const std::size_t Start = Pos + RankNameColon.size();
	const std::size_t End = TaxStr.find(',', Start);
	if (End == std::string::npos)
		Name = TaxStr.substr(Start);
	else
		Name = TaxStr.substr(Start, End - Start);
}
Names.push_back(Name); 51 | } 52 | Accs.push_back(Acc); 53 | NamesVec.push_back(Names); 54 | } 55 | const uint AccCount = SIZE(Accs); 56 | 57 | for (uint i = 0; i < RankCount; ++i) 58 | { 59 | vector Values; 60 | for (uint j = 0; j < AccCount; ++j) 61 | { 62 | const string &Value = NamesVec[j][i]; 63 | Values.push_back(Value); 64 | } 65 | FeatureTable *FT = new FeatureTable; 66 | FT->FromVecs(Accs, Values); 67 | FTs.push_back(FT); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/lcalabel.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | void SetFeatureTable(const TreeN &T, FeatureTable &FT); 6 | void GetLCAs(const TreeN &T, const FeatureTable &FT, bool AllowInvert, 7 | vector &Values, vector &LCAs, vector &GroupSizes, 8 | vector &SubtreeSizes, vector &FPs, vector &FNs, 9 | vector &Inverts, vector &MonoFs); 10 | 11 | void UpdateLCALabel(const string &OldLabel, const string &Value, 12 | bool IsLeaf, string &NewLabel) 13 | { 14 | if (!opt(lcaconfs)) 15 | { 16 | NewLabel = Value; 17 | return; 18 | } 19 | 20 | if (EndsWith(OldLabel, "<")) 21 | { 22 | NewLabel = Value + "+" + OldLabel; 23 | return; 24 | } 25 | 26 | if (OldLabel == "") 27 | NewLabel = Value; 28 | else 29 | NewLabel = Value + "<" + OldLabel + ">"; 30 | } 31 | 32 | void cmd_lcalabel() 33 | { 34 | TreeN T; 35 | T.FromNewickFile(opt(lcalabel)); 36 | 37 | FeatureTable FT; 38 | SetFeatureTable(T, FT); 39 | 40 | vector Values; 41 | vector LCAs; 42 | vector GroupSizes; 43 | vector SubtreeSizes; 44 | vector FPs; 45 | vector FNs; 46 | vector Inverts; 47 | vector MonoFs; 48 | GetLCAs(T, FT, false, Values, LCAs, GroupSizes, SubtreeSizes, 49 | FPs, FNs, Inverts, MonoFs); 50 | 51 | const uint FoundCount = SIZE(LCAs); 52 | ProgressLog("%u LCAs found\n", FoundCount); 53 | for (uint i = 0; i < FoundCount; ++i) 54 | { 55 | const string &Value = 
Values[i]; 56 | uint LCA = LCAs[i]; 57 | if (LCA == UINT_MAX) 58 | continue; 59 | asserta(T.IsNode(LCA)); 60 | const string &Label = T.GetLabel(LCA); 61 | string NewLabel; 62 | bool IsLeaf = T.IsLeaf(LCA); 63 | UpdateLCALabel(Label, Value, IsLeaf, NewLabel); 64 | T.UpdateLabel(LCA, NewLabel); 65 | } 66 | 67 | T.ToNewickFile(opt(output)); 68 | T.Ladderize(opt(right)); 69 | } 70 | -------------------------------------------------------------------------------- /src/biparter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "treen.h" 4 | #include "featuretable.h" 5 | #include 6 | 7 | class BiParter 8 | { 9 | public: 10 | TreeN m_T; 11 | 12 | // Leaf labels sorted alphabetically, this so that 13 | // any tree with same set of labels will have 14 | // same label indexes. 15 | vector m_Labels; 16 | map m_LabelToIndex; 17 | vector m_NodeToLabelIndex; 18 | vector m_LabelIndexToNode; 19 | 20 | // PartVec[Node] is boolean vector. 21 | // Vector has one true/false for each label index. 22 | // Labels are indexed as above. 23 | // True/false is disambiguated by requiring that first 24 | // entry is always false. 25 | vector > m_PartVec; 26 | vector m_UniquePartNodes; 27 | 28 | // One vector of node indexes for every entry 29 | // in m_UniquePartNodes. 30 | vector > m_NodeToDupeNodes; 31 | 32 | // Hash table entries are node indexes. 33 | // Collisions should be rare. 
34 | vector > m_HashTable; 35 | 36 | public: 37 | void Init(const TreeN &T); 38 | void LogMe() const; 39 | void LogPatterns() const; 40 | uint Search(const vector &Part) const; 41 | void SearchBestMatch(const vector &Labels, 42 | uint &Node, vector &MissingLabels, uint &OtherCount) const; 43 | void CountFeatures(const FeatureTable &FT, 44 | vector &ValueIndexToTotal, 45 | vector > &NodeToValueCounts); 46 | void GetPartInternalNodeLabels(uint PartIndex, vector &Labels) const; 47 | void GetPartInternalNodes(uint PartIndex, vector &Nodes) const; 48 | void ToTSV(const string &Name, FILE *f) const; 49 | uint GetLabelIndex(const string &Label) const; 50 | uint GetOtherCount(const vector &Part, const vector &Query) const; 51 | 52 | private: 53 | void SetHashTable(); 54 | uint GetHash(const vector &Part) const; 55 | uint GetHash(uint Node) const; 56 | }; 57 | -------------------------------------------------------------------------------- /src/rootbyoutgroupx.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treex.h" 4 | 5 | void StringsFromFile(const string &FileName, vector &Strings); 6 | void TreesFromFile(const string &FileName, vector &Trees); 7 | void TreesToFile(const vector &Trees, const string &FileName); 8 | 9 | void TreeX::SetRootByGroupLabels(const vector &GroupLabels) 10 | { 11 | set LeafNodes; 12 | for (uint i = 0; i < SIZE(GroupLabels); ++i) 13 | { 14 | uint Node = GetNodeByLabel(GroupLabels[i], false); 15 | if (Node != UINT_MAX) 16 | LeafNodes.insert(Node); 17 | } 18 | asserta(LeafNodes.size() > 0); 19 | 20 | uint FromNode, ToNode; 21 | GetLCAEdge(LeafNodes, FromNode, ToNode); 22 | if (GetParent(ToNode) == FromNode) 23 | { 24 | Log("Root edge %u --> %u\n", FromNode, ToNode); 25 | InsertRootAbove(ToNode); 26 | } 27 | else if (GetParent(FromNode) == ToNode) 28 | { 29 | Log("Root edge %u <-- %u\n", ToNode, FromNode); 30 | InsertRootAbove(FromNode); 31 | } 
32 | else 33 | asserta(false); 34 | 35 | Validate(); 36 | } 37 | 38 | void cmd_rootbyoutgroupx() 39 | { 40 | const string &InputFileName = opt(rootbyoutgroup); 41 | const string &TsvFileName = opt(tsvout); 42 | FILE *fTsv = CreateStdioFile(TsvFileName); 43 | 44 | vector Trees; 45 | TreesFromFile(InputFileName, Trees); 46 | const uint TreeCount = SIZE(Trees); 47 | 48 | const string NewRootLabel = optset_root_label ? opt(root_label) : ""; 49 | const string OldRootLabel = optset_old_root_label ? opt(old_root_label) : "-"; 50 | 51 | vector GroupLabels; 52 | string GroupName; 53 | 54 | asserta(optset_labels); 55 | StringsFromFile(opt(labels), GroupLabels); 56 | 57 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 58 | { 59 | TreeX &T = *Trees[TreeIndex]; 60 | T.SetRootByGroupLabels(GroupLabels); 61 | } 62 | 63 | TreesToFile(Trees, opt(output)); 64 | CloseStdioFile(fTsv); 65 | } 66 | -------------------------------------------------------------------------------- /src/layout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | class Svg; 4 | class FeatureTable; 5 | 6 | const uint TREE_WIDTH = 1000; 7 | const uint TREE_HEIGHT = 1000; 8 | const uint TREE_SPACING = 300; 9 | 10 | class Layout 11 | { 12 | public: 13 | const Tree2 *m_T = 0; 14 | double m_LeafSpacingY = 10; // spacing between leaves 15 | double m_ScaleX = 10; // scaling factor for branch length 16 | 17 | vector m_Xs; 18 | vector m_Ys; 19 | vector m_Midxs; 20 | vector m_Rxs; 21 | vector m_Rys; 22 | vector m_Lxs; 23 | vector m_Lys; 24 | 25 | double m_CurrentLeafY = 0; 26 | double m_MaxRootDist = 0; 27 | double m_Margin = 100; 28 | double m_Width = TREE_WIDTH; 29 | double m_Height = TREE_HEIGHT; 30 | double m_StrokeWidth = 1; 31 | double m_OffsetX = 0; 32 | double m_OffsetY = 0; 33 | 34 | double m_TriangleWidth = 0; 35 | double m_TriangleHeight = 0; 36 | double m_RectangleWidth = 0; 37 | double m_RectangleHeight = 0; 38 | 39 | const FeatureTable 
*m_FT = 0; 40 | uint m_ValueCount = 0; 41 | vector m_NodeToValueIndex; 42 | vector > m_NodeToValueToCount; 43 | string m_DefaultColor = "gray"; 44 | string m_LeafLabelColor = "black"; 45 | vector m_ValueToColor; 46 | double m_MajorityFract = 1.0; 47 | 48 | string m_Title = ""; 49 | double m_TitleFontSize = 10; 50 | double m_LabelFontSize = 0; 51 | 52 | double m_MaxY = 0; 53 | 54 | public: 55 | void Run(const Tree2 &T); 56 | void SetFeatures(const FeatureTable &FT); 57 | void RenderNode(Svg &S, uint Node); 58 | void Render(Svg &S); 59 | void Render(const string &SvgFileName); 60 | void RenderLegend(const string &SvgFileName) const; 61 | uint GetMajorityValueIndex(uint Node) const; 62 | void SetColors(const string &DefaultColor, 63 | const vector &ValueToColor); 64 | void GetColor(uint Node, string &Color) const; 65 | double GetEstimatedMaxLeafLabelPx() const; 66 | 67 | private: 68 | double SetY(uint Node); 69 | void SetValueCounts(uint Node); 70 | }; 71 | -------------------------------------------------------------------------------- /src/syncft.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | #include 4 |
5 | // Synchronize a tab-separated feature table with the leaves of a tree.
6 | // Reads the tree from -tree and the table from -syncft, and writes to
7 | // -output: the header line unchanged, every row whose first field is a
8 | // leaf label of the tree, and one row of "." placeholders for each leaf
9 | // label that has no row in the table. Dropped and inserted labels are
10 | // reported through ProgressLog.
11 | void cmd_syncft()
12 | {
13 |     const string &FN = opt(syncft);
14 |     FILE *fOut = CreateStdioFile(opt(output));
15 |     if (fOut == 0)
16 |         Die("Failed to create output file");
17 |
18 |     Tree2 T;
19 |     T.FromNewickFile(opt(tree));
20 |     set TreeLabels;
21 |
22 |     // Collect the leaf labels; a repeated label is fatal.
23 |     const uint NodeCount = T.GetNodeCount();
24 |     for (uint NodeIndex = 0; NodeIndex < NodeCount; ++NodeIndex)
25 |     {
26 |         if (!T.IsLeaf(NodeIndex))
27 |             continue;
28 |         const string Label = T.GetLabel(NodeIndex);
29 |         if (TreeLabels.find(Label) != TreeLabels.end())
30 |             Die("Dupe label >%s", Label.c_str());
31 |         TreeLabels.insert(Label);
32 |     }
33 |
34 |     FILE *fIn = OpenStdioFile(FN);
35 |
36 |     string Line;
37 |     vector Fields;
38 |     set FoundLabels;
39 |     uint Found = 0;
40 |     uint NotFound = 0;
41 |     uint CatCount = UINT_MAX;
42 |
43 |     string HdrLine;
44 |     ReadLineStdioFile(fIn, HdrLine);
45 |     fprintf(fOut, "%s\n", HdrLine.c_str());
46 |     while (ReadLineStdioFile(fIn, Line))
47 |     {
48 |         Split(Line, Fields, '\t');
49 |         uint FieldCount = SIZE(Fields);
50 |         // Without at least one field there is no label to look up, and
51 |         // FieldCount - 1 below would wrap around.
52 |         if (FieldCount == 0)
53 |             continue;
54 |         if (CatCount == UINT_MAX)
55 |             CatCount = FieldCount - 1;
56 |         else
57 |             asserta(FieldCount == CatCount + 1);
58 |
59 |         const string &Label = Fields[0];
60 |         if (TreeLabels.find(Label) == TreeLabels.end())
61 |         {
62 |             ++NotFound;
63 |             ProgressLog("Deleted >%s\n", Label.c_str());
64 |             continue;
65 |         }
66 |         ++Found;
67 |         FoundLabels.insert(Label);
68 |         fprintf(fOut, "%s\n", Line.c_str());
69 |     }
70 |     // Fails if the same label was kept more than once.
71 |     asserta(SIZE(FoundLabels) == Found);
72 |
73 |     // With no usable data rows CatCount is still the UINT_MAX sentinel and
74 |     // the loop below would emit that many placeholder columns per leaf.
75 |     // Fall back to the number of tab separators in the header line.
76 |     if (CatCount == UINT_MAX)
77 |     {
78 |         CatCount = 0;
79 |         for (size_t i = 0; i < HdrLine.size(); ++i)
80 |             if (HdrLine[i] == '\t')
81 |                 ++CatCount;
82 |     }
83 |
84 |     // Append a placeholder row for every leaf label missing from the table.
85 |     for (set::const_iterator p = TreeLabels.begin();
86 |       p != TreeLabels.end(); ++p)
87 |     {
88 |         const string &TreeLabel = *p;
89 |         if (FoundLabels.find(TreeLabel) != FoundLabels.end())
90 |             continue;
91 |         fprintf(fOut, "%s", TreeLabel.c_str());
92 |         for (uint i = 0; i < CatCount; ++i)
93 |             fprintf(fOut, "\t.");
94 |         fprintf(fOut, "\n");
95 |         ProgressLog("Inserted >%s\n", TreeLabel.c_str());
96 |     }
97 |
98 |     CloseStdioFile(fIn);
99 |     // Close the output too, as the other commands do for files they create.
100 |     CloseStdioFile(fOut);
101 | }
102 | -------------------------------------------------------------------------------- /src/conf.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | #include "biparter.h" 4 | 5 | void TreesFromFile(const string &FileName, vector &Trees); 6 | 7 | void cmd_conf() 8 | { 9 | const string &TreeFileName = opt(conf); 10 | const string &TreesFileName = opt(trees); 11 | const string &OutputFileName = opt(output); 12 | const string &TSVFileName = opt(tsvout); 13 | 14 | TreeN T; 15 | T.FromNewickFile(TreeFileName); 16 | 17 | vector Trees; 18 | TreesFromFile(TreesFileName, Trees); 19 | if (opt(self)) 20 | Trees.push_back(&T); 21 | const uint TreeCount = SIZE(Trees); 22 | 23 | FILE *fTSV = CreateStdioFile(TSVFileName); 24 | 25 | BiParter BP1; 26 | BP1.Init(T); 27 | 28 | const vector >
&Parts = BP1.m_PartVec; 29 | const uint PartCount = SIZE(Parts); 30 | vector ReplicateCounts(PartCount, 0); 31 | 32 | const uint N = SIZE(Trees); 33 | BiParter BP2; 34 | for (uint i = 0; i < N; ++i) 35 | { 36 | const TreeN &T = *Trees[i]; 37 | BP2.Init(T); 38 | 39 | string Name; 40 | Ps(Name, "tree%u", i); 41 | BP2.ToTSV(Name, fTSV); 42 | 43 | for (uint j = 0; j < PartCount; ++j) 44 | { 45 | const vector &Part = Parts[j]; 46 | uint Node = BP2.Search(Part); 47 | if (Node != UINT_MAX) 48 | ++ReplicateCounts[j]; 49 | } 50 | } 51 | 52 | for (uint j = 0; j < PartCount; ++j) 53 | { 54 | const vector &Part = Parts[j]; 55 | if (T.IsRoot(j) || T.IsLeaf(j)) 56 | { 57 | asserta(Part.empty()); 58 | continue; 59 | } 60 | uint RC = ReplicateCounts[j]; 61 | uint Pct = (RC*100)/TreeCount; 62 | string sPct; 63 | Ps(sPct, "%u", Pct); 64 | 65 | vector Nodes; 66 | BP1.GetPartInternalNodes(j, Nodes); 67 | for (uint k = 0; k < SIZE(Nodes); ++k) 68 | { 69 | uint Node = Nodes[k]; 70 | T.UpdateLabel(Node, sPct); 71 | } 72 | } 73 | 74 | //////////////////////////////////////////// 75 | // Do NOT unroot -- loses a bootstrap value! 
76 | //////////////////////////////////////////// 77 | //if (!T.IsRooted()) 78 | // T1->Unroot(); 79 | T.ToNewickFile(OutputFileName); 80 | 81 | CloseStdioFile(fTSV); 82 | } 83 | -------------------------------------------------------------------------------- /src/tax_usage.txt: -------------------------------------------------------------------------------- 1 | newick 2 | -tax trees.newick \ # INPUT one or more rooted binary trees, suggest just one to KISS 3 | -features feature_table.tsv \ # INPUT feature table (see below) 4 | -tsv results.tsv \ # OUTPUT results in tab-separated text format 5 | -fev results.fev \ # OUTPUT results in "field=value" format 6 | -out annotated.newick \ # OUTPUT annotated tree -- I forget what this is :-) 7 | 8 | All OUTPUT options are optional :-) 9 | 10 | Each taxon is matched to the tree by finding a best-fit LCA 11 | node which minimizes FP+FN errors. 12 | FP = leaf in LCA's subtree belongs to a different taxon. 13 | FN = species in the taxon not in the LCA's subtree. 14 | 15 | Output files report TPs and FPs for each taxon, and summary for the whole tree 16 | 17 | Feature table: 18 | Specifies taxon for each leaf label in the tree. 19 | Format is tab-separated text with two fields: 20 | 1. leaf_label, 2. taxon 21 | 22 | Putting taxon names in labels: 23 | It can be nice to put taxon names in the tree label, e.g. A1234_Pisuviricota, 24 | where A1234 = Genbank ID and Pisuviricota = taxon. Then you can quickly 25 | visualize in a tree viewer such as Dendroscope to see whether taxa are 26 | grouping together. If you do this, then you don't need to give a feature 27 | table, instead you tell newick where the taxon name is by using the -ff 28 | option. This has two characters: 1. separator character, 2. field 29 | number (a single digit, 1-9). For labels formatted like A1234_Pisuviricota, you would 30 | use -ff _2 (fields separated by underscore, second field). 31 | 32 | All of this is for a single rank e.g. phylum or family.
Typically, one 33 | rank is the "sweet spot" -- lower ranks are too easy, higher ranks are 34 | too hard. To measure multiple ranks, you will need different feature 35 | tables. 36 | 37 | You need a rooted tree, there is a -rootbyoutgroup command to do rooting by taxonomic outgroup 38 | newick -rootbyoutgroup trees.newick -output rooted.newick 39 | If the outgroup is a single leaf, you can specify it by -label outgroupname 40 | If the group is one or more leaves, specify by -labels outgrouplabels.txt -------------------------------------------------------------------------------- /src/relabel.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | 4 | void GetAccFromLabel(const string &Label, string &Acc); 5 | 6 | static void Relabel1(TreeN &T, const map &OldLabelToNewLabel, 7 | set &FoundLabels) 8 | { 9 | uint NotFound = 0; 10 | uint Replaced = 0; 11 | asserta(T.IsNormalized()); 12 | const uint NodeCount = T.GetNodeCount(); 13 | for (uint NodeIndex = 0; NodeIndex < NodeCount; ++NodeIndex) 14 | { 15 | if (!T.IsLeaf(NodeIndex)) 16 | continue; 17 | string OldLabel = T.GetLabel(NodeIndex); 18 | if (opt(accs)) 19 | GetAccFromLabel(OldLabel, OldLabel); 20 | map::const_iterator p = 21 | OldLabelToNewLabel.find(OldLabel); 22 | if (p == OldLabelToNewLabel.end()) 23 | { 24 | ++NotFound; 25 | if (NotFound < 10) 26 | ProgressLog("Not found >%s\n", OldLabel.c_str()); 27 | else if (NotFound == 10) 28 | ProgressLog("10+ Not found\n"); 29 | continue; 30 | } 31 | FoundLabels.insert(OldLabel); 32 | const string &NewLabel = p->second; 33 | T.UpdateLabel(NodeIndex, NewLabel); 34 | ++Replaced; 35 | } 36 | } 37 | 38 | void cmd_relabel() 39 | { 40 | vector Trees; 41 | TreesFromFile(opt(relabel), Trees); 42 | 43 | FILE *f = OpenStdioFile(opt(labels2)); 44 | string Line; 45 | vector Fields; 46 | map OldLabelToNewLabel; 47 | uint LabelCount = 0; 48 | while (ReadLineStdioFile(f, Line)) 49 | { 50 | Split(Line, 
Fields, '\t'); 51 | if (SIZE(Fields) != 2) 52 | Die("Expected 2 fields in line '%s'", Line.c_str()); 53 | string OldLabel = Fields[0]; 54 | if (opt(accs)) 55 | GetAccFromLabel(OldLabel, OldLabel); 56 | const string &NewLabel = Fields[1]; 57 | if (OldLabelToNewLabel.find(OldLabel) != OldLabelToNewLabel.end()) 58 | Die("Dupe label >%s", OldLabel.c_str()); 59 | OldLabelToNewLabel[OldLabel] = NewLabel; 60 | ++LabelCount; 61 | } 62 | CloseStdioFile(f); 63 | 64 | set FoundLabels; 65 | const uint TreeCount = SIZE(Trees); 66 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 67 | { 68 | ProgressStep(TreeIndex, TreeCount, "Processing"); 69 | TreeN *T = Trees[TreeIndex]; 70 | Relabel1(*T, OldLabelToNewLabel, FoundLabels); 71 | } 72 | TreesToFile(Trees, opt(output)); 73 | 74 | ProgressLog("%u / %u labels found\n", SIZE(FoundLabels), LabelCount); 75 | } 76 | -------------------------------------------------------------------------------- /src/getlcasx.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treex.h" 4 | 5 | const double MIN_TP_FRACT = 0.5; 6 | 7 | void GetGroupLeafNodeSet(const TreeX &T, const FeatureTable &FT, 8 | uint ValueIndex, set &LeafNodeSet) 9 | { 10 | LeafNodeSet.clear(); 11 | asserta(!optset_accs); 12 | vector FeatureLabels; 13 | FT.GetLabels_ByValueIndex(ValueIndex, FeatureLabels); 14 | for (uint i = 0; i < SIZE(FeatureLabels); ++i) 15 | { 16 | const string &Label = FeatureLabels[i]; 17 | uint Node = T.GetNodeByLabel(Label, false); 18 | if (Node == UINT_MAX) 19 | continue; 20 | if (T.IsLeaf(Node)) 21 | LeafNodeSet.insert(Node); 22 | } 23 | } 24 | 25 | void GetLCAs(TreeX &T, const FeatureTable &FT, bool AllowInvert, 26 | vector &Values, vector &LCAs, vector &GroupSizes, 27 | vector &SubtreeSizes, vector &FPs, vector &FNs, 28 | vector &MonoFs) 29 | { 30 | asserta(SIZE(FT.m_LeafNodeSet) > 0); 31 | 32 | uint ValueCount = FT.GetValueCount(); 33 | 
for (uint ValueIndex = 0; ValueIndex < ValueCount; ++ValueIndex) 34 | { 35 | const string &Value = FT.GetValue(ValueIndex); 36 | 37 | set GroupLeafNodes; 38 | GetGroupLeafNodeSet(T, FT, ValueIndex, GroupLeafNodes); 39 | 40 | uint TP = UINT_MAX; 41 | uint FP = UINT_MAX; 42 | uint FN = UINT_MAX; 43 | uint GroupSize = SIZE(GroupLeafNodes); 44 | uint SubtreeSize = SIZE(GroupLeafNodes); 45 | uint LCA = UINT_MAX; 46 | if (GroupSize > 0) 47 | { 48 | asserta(SIZE(FT.m_LeafNodeSet) > SIZE(GroupLeafNodes)); 49 | LCA = T.GetBestFitSubtree(FT.m_LeafNodeSet, MIN_TP_FRACT, 50 | TP, FP, FN); 51 | if (LCA == UINT_MAX) 52 | Warning("No LCA for %s", Value.c_str()); 53 | } 54 | 55 | double MonoF = 1.0 - double(FP + FN)/(GroupSize + FP); 56 | 57 | Values.push_back(Value); 58 | LCAs.push_back(LCA); 59 | GroupSizes.push_back(GroupSize); 60 | SubtreeSizes.push_back(SubtreeSize); 61 | FPs.push_back(FP); 62 | FNs.push_back(FN); 63 | MonoFs.push_back(MonoF); 64 | 65 | Log("Value=%s", Value.c_str()); 66 | Log(", GroupSize=%u", GroupSize); 67 | Log(", SubtreeSize=%u", SubtreeSize); 68 | Log(", LCA=%u", LCA); 69 | Log(", FPs=%u", FP); 70 | Log(", FNs=%u", FN); 71 | Log(", Errs=%u", FP + FN); 72 | Log(", MonoF=%.4f", MonoF); 73 | Log("\n"); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/getlcas.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | void GetGroupLeafNodeSet(const TreeN &T, const FeatureTable &FT, 6 | uint ValueIndex, set &LeafNodeSet) 7 | { 8 | vector FeatureLabels; 9 | FT.GetLabels_ByValueIndex(ValueIndex, FeatureLabels); 10 | for (uint i = 0; i < SIZE(FeatureLabels); ++i) 11 | { 12 | const string &Label = FeatureLabels[i]; 13 | uint Node = (opt(accs) ? 
14 | T.GetNodeByAcc(Label, false) : T.GetNodeByLabel(Label, false)); 15 | if (Node == UINT_MAX) 16 | continue; 17 | if (T.IsLeaf(Node)) 18 | LeafNodeSet.insert(Node); 19 | } 20 | } 21 | 22 | void GetLCAs(const TreeN &T, const FeatureTable &FT, bool AllowInvert, 23 | vector &Values, vector &LCAs, vector &GroupSizes, 24 | vector &SubtreeSizes, vector &FPs, vector &FNs, 25 | vector &Inverts, vector &MonoFs) 26 | { 27 | asserta(SIZE(FT.m_LeafNodeSet) > 0); 28 | 29 | uint ValueCount = FT.GetValueCount(); 30 | for (uint ValueIndex = 0; ValueIndex < ValueCount; ++ValueIndex) 31 | { 32 | const string &Value = FT.GetValue(ValueIndex); 33 | 34 | set GroupLeafNodes; 35 | GetGroupLeafNodeSet(T, FT, ValueIndex, GroupLeafNodes); 36 | 37 | uint FP = UINT_MAX; 38 | uint FN = UINT_MAX; 39 | bool Invert = false; 40 | uint GroupSize = SIZE(GroupLeafNodes); 41 | uint SubtreeSize = SIZE(GroupLeafNodes); 42 | uint LCA = UINT_MAX; 43 | if (GroupSize > 0) 44 | { 45 | asserta(SIZE(FT.m_LeafNodeSet) > SIZE(GroupLeafNodes)); 46 | LCA = T.GetBestFitSubtree2(FT.m_LeafNodeSet, GroupLeafNodes, 47 | AllowInvert, FP, FN, Invert); 48 | if (LCA == UINT_MAX) 49 | Warning("No LCA for %s", Value.c_str()); 50 | } 51 | 52 | double MonoF = 1.0 - double(FP + FN)/(GroupSize + FP); 53 | 54 | Values.push_back(Value); 55 | LCAs.push_back(LCA); 56 | GroupSizes.push_back(GroupSize); 57 | SubtreeSizes.push_back(SubtreeSize); 58 | FPs.push_back(FP); 59 | FNs.push_back(FN); 60 | Inverts.push_back(Invert); 61 | MonoFs.push_back(MonoF); 62 | 63 | Log("Value=%s", Value.c_str()); 64 | Log(", GroupSize=%u", GroupSize); 65 | Log(", SubtreeSize=%u", SubtreeSize); 66 | Log(", LCA=%u", LCA); 67 | Log(", Invert %c", tof(Invert)); 68 | Log(", FPs=%u", FP); 69 | Log(", FNs=%u", FN); 70 | Log(", Errs=%u", FP + FN); 71 | Log(", MonoF=%.4f", MonoF); 72 | Log("\n"); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/bootq.cpp: 
-------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "biparter.h" 3 | 4 | static FILE *g_fTab; 5 | 6 | static void BootQPair(const TreeN &T1, const TreeN &T2) 7 | { 8 | uint LeafCount = T1.GetLeafCount(); 9 | 10 | if (T2.GetLeafCount() != LeafCount) 11 | Die("Different leaf counts"); 12 | 13 | BiParter BP1; 14 | BiParter BP2; 15 | 16 | BP1.Init(T1); 17 | BP2.Init(T2); 18 | const uint NodeCount1 = T1.GetNodeCount(); 19 | 20 | #if TRACE 21 | Log("_____________________1____________________\n"); 22 | BP1.LogMe(); 23 | 24 | Log("\n"); 25 | Log("_____________________2____________________\n"); 26 | BP2.LogMe(); 27 | #endif 28 | 29 | uint SameCount = 0; 30 | uint DiffCount = 0; 31 | const vector > &PartVec1 = BP1.m_PartVec; 32 | for (uint Node1 = 0; Node1 < NodeCount1; ++Node1) 33 | { 34 | if (T1.IsLeaf(Node1) || T1.IsRoot(Node1)) 35 | continue; 36 | 37 | asserta(Node1 < SIZE(PartVec1)); 38 | const vector &Part1 = PartVec1[Node1]; 39 | 40 | vector Labels; 41 | BP1.GetPartInternalNodeLabels(Node1, Labels); 42 | const uint LabelCount = SIZE(Labels); 43 | 44 | uint Node2 = BP2.Search(Part1); 45 | bool Found = (Node2 != UINT_MAX); 46 | if (Found) 47 | ++SameCount; 48 | else 49 | ++DiffCount; 50 | 51 | Pf(g_fTab, "%u", Node1); 52 | Pf(g_fTab, "\t%c", tof(Found)); 53 | bool First = true; 54 | for (uint LabelIndex = 0; LabelIndex < LabelCount; ++LabelIndex) 55 | { 56 | const string &Label = Labels[LabelIndex]; 57 | if (Label == "") 58 | continue; 59 | if (First) 60 | Pf(g_fTab, "\t"); 61 | else 62 | Pf(g_fTab, ","); 63 | Pf(g_fTab, "%s", Label.c_str()); 64 | } 65 | Pf(g_fTab, "\n"); 66 | #if TRACE 67 | Log("Node1=%u Found=%c\n", Node1, tof(Found)); 68 | #endif 69 | } 70 | 71 | const uint UniquePartCount1 = SIZE(BP1.m_UniquePartNodes); 72 | double RF = double(DiffCount)/UniquePartCount1; 73 | double Q = double(SameCount)/UniquePartCount1; 74 | ProgressLog("RF = %u / %u, %.4f Q = %.4f\n", DiffCount, UniquePartCount1, 
RF, Q); 75 | } 76 | 77 | // "Q"-score of tree compared to correct reference tree 78 | // plus bootstrap assessment. 79 | void cmd_bootq() 80 | { 81 | const string &FileName1 = opt(bootq); 82 | const string &FileName2 = opt(ref); 83 | const string &OutputFileName = opt(output); 84 | g_fTab = CreateStdioFile(OutputFileName); 85 | 86 | TreeN T1; 87 | TreeN T2; 88 | T1.FromNewickFile(FileName1); 89 | T2.FromNewickFile(FileName2); 90 | 91 | BootQPair(T1, T2); 92 | 93 | CloseStdioFile(g_fTab); 94 | } 95 | -------------------------------------------------------------------------------- /src/taxer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "treex.h" 4 | #include "featuretable.h" 5 | 6 | class Taxer 7 | { 8 | public: 9 | uint m_TreeIndex = UINT_MAX; 10 | double m_MinTPFract = DBL_MAX; 11 | TreeX *m_T = 0; 12 | FeatureTable m_FT; 13 | uint m_ValueCount = 0; 14 | uint m_NodeIndexCount = 0; 15 | uint m_KFold = 0; 16 | set m_HoldOutSet; 17 | vector > m_NodeToValueCountVec; 18 | map m_BestNodeToValueIndex; 19 | 20 | vector m_BestNodes; 21 | vector m_Ns; 22 | vector m_TPs; 23 | vector m_FPs; 24 | vector m_FNs; 25 | vector m_Accs; 26 | 27 | vector m_NodeToTrueValueIndex; 28 | vector m_NodeToPredictedValueIndex; 29 | vector m_NodeToTrueValue; 30 | vector m_NodeToPredictedValue; 31 | vector m_NodeToResult; 32 | 33 | uint m_QN = 0; 34 | uint m_QTP = 0; 35 | uint m_QFP = 0; 36 | uint m_QFN = 0; 37 | uint m_RN = 0; 38 | uint m_RTP = 0; 39 | uint m_RFP = 0; 40 | uint m_RFN = 0; 41 | 42 | public: 43 | void Reset() 44 | { 45 | m_TreeIndex = UINT_MAX; 46 | m_T = 0; 47 | m_FT.Clear(); 48 | m_ValueCount = 0; 49 | m_NodeIndexCount = 0; 50 | m_KFold = 0; 51 | m_HoldOutSet.clear(); 52 | m_NodeToValueCountVec.clear(); 53 | m_BestNodeToValueIndex.clear(); 54 | m_BestNodes.clear(); 55 | m_Ns.clear(); 56 | m_TPs.clear(); 57 | m_FPs.clear(); 58 | m_FNs.clear(); 59 | m_Accs.clear(); 60 | m_NodeToPredictedValueIndex.clear(); 61 
| m_NodeToTrueValueIndex.clear(); 62 | m_NodeToPredictedValue.clear(); 63 | m_NodeToTrueValue.clear(); 64 | m_NodeToResult.clear(); 65 | m_QN = 0; 66 | m_QTP = 0; 67 | m_QFP = 0; 68 | m_QFN = 0; 69 | m_RN = 0; 70 | m_RTP = 0; 71 | m_RFP = 0; 72 | m_RFN = 0; 73 | } 74 | 75 | void Init(uint TreeIndex, TreeX &T, 76 | uint K, double MinTPFract); 77 | void SetHoldOut(uint K); 78 | void SetValueCountVec(); 79 | void SetBestNode(uint ValueIndex); 80 | void SetBestNodes(); 81 | void SetResults(); 82 | void SetResult(uint Node); 83 | uint GetTrueValueIndex(uint Node) const; 84 | uint GetPredictedValueIndex(uint Node) const; 85 | bool IsQuery(uint Node) const; 86 | 87 | void ToFev(FILE *f) const; 88 | void ToTsv(FILE *f) const; 89 | void ToNewick(FILE *f) const; 90 | void TreeToFev(FILE *f) const; 91 | void ValueToFev(FILE *f, uint ValueIndex) const; 92 | void NodeToFev(FILE *f, uint Node) const; 93 | void NodeToTsv(FILE *f, uint Node) const; 94 | 95 | public: 96 | static double GetAcc(uint N, uint TP, uint FP, uint FN); 97 | }; 98 | -------------------------------------------------------------------------------- /src/condense.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | void SetFeatureTable(const TreeN &T, FeatureTable &FT); 6 | void GetLCAs(const TreeN &T, const FeatureTable &FT, bool AllowInvert, 7 | vector &Values, vector &LCAs, vector &GroupSizes, 8 | vector &SubtreeSizes, vector &FPs, vector &FNs, 9 | vector &Inverts, vector &MonoFs); 10 | void UpdateLCALabel(const string &OldLabel, const string &Value, 11 | bool IsLeaf, string &NewLabel); 12 | 13 | void cmd_condense() 14 | { 15 | vector Trees; 16 | TreesFromFile(opt(condense), Trees); 17 | FILE *fTsv = CreateStdioFile(opt(tsvout)); 18 | 19 | const uint TreeCount = SIZE(Trees); 20 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 21 | { 22 | TreeN &T = *Trees[TreeIndex]; 23 | 24 | 
FeatureTable FT; 25 | SetFeatureTable(T, FT); 26 | 27 | vector Values; 28 | vector LCAs; 29 | vector GroupSizes; 30 | vector SubtreeSizes; 31 | vector FPs; 32 | vector FNs; 33 | vector Inverts; 34 | vector MonoFs; 35 | GetLCAs(T, FT, false, Values, LCAs, GroupSizes, SubtreeSizes, 36 | FPs, FNs, Inverts, MonoFs); 37 | 38 | if (fTsv != 0) 39 | { 40 | const uint ValueCount = SIZE(Values); 41 | asserta(SIZE(LCAs) == ValueCount); 42 | asserta(SIZE(FPs) == ValueCount); 43 | asserta(SIZE(FNs) == ValueCount); 44 | for (uint ValueIndex = 0; ValueIndex < ValueCount; ++ValueIndex) 45 | { 46 | fprintf(fTsv, "tree=%u", TreeIndex); 47 | fprintf(fTsv, "\tvalue=%s", Values[ValueIndex].c_str()); 48 | fprintf(fTsv, "\tsize=%u", GroupSizes[ValueIndex]); 49 | fprintf(fTsv, "\tFP=%u", FPs[ValueIndex]); 50 | fprintf(fTsv, "\tFN=%u", FNs[ValueIndex]); 51 | fprintf(fTsv, "\n"); 52 | } 53 | } 54 | 55 | set LCASet; 56 | const uint FoundCount = SIZE(LCAs); 57 | Progress("Tree %u / %u, %u LCAs\n", TreeIndex+1, TreeCount, FoundCount); 58 | for (uint i = 0; i < FoundCount; ++i) 59 | { 60 | const string &Value = Values[i]; 61 | uint LCA = LCAs[i]; 62 | if (LCA == UINT_MAX) 63 | continue; 64 | asserta(T.IsNode(LCA)); 65 | LCASet.insert(LCA); 66 | const string &Label = T.GetLabel(LCA); 67 | string NewLabel; 68 | bool IsLeaf = T.IsLeaf(LCA); 69 | UpdateLCALabel(Label, Value, IsLeaf, NewLabel); 70 | T.UpdateLabel(LCA, NewLabel); 71 | } 72 | 73 | T.Subset(LCASet); 74 | T.CollapseConfidenceUnary(); 75 | T.CollapseUnary(); 76 | T.Ladderize(opt(right)); 77 | } 78 | 79 | TreesToFile(Trees, opt(output)); 80 | 81 | CloseStdioFile(fTsv); 82 | } 83 | -------------------------------------------------------------------------------- /src/testdeletesubtree.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treex.h" 3 | #include "consmakerx.h" 4 | 5 | #define TEST 1 6 | 7 | #if TEST 8 | 9 | void Shuffle(vector &v); 10 | void 
GenerateRandomTree(const vector &LeafLabels, bool Rooted, 11 | double MinLength, double MaxLength, TreeX &T); 12 | 13 | static uint g_OkCount = 0; 14 | 15 | static void TestT1n(uint Node, const string &Str) 16 | { 17 | TreeX T; 18 | T.FromNewickStr(Str); 19 | asserta(T.IsRootedBinary()); 20 | 21 | asserta(!T.IsLeaf(Node)); 22 | asserta(Node != T.m_Origin); 23 | asserta(T.IsNode(Node)); 24 | 25 | Log("\n"); 26 | Log("%s\n", Str.c_str()); 27 | Log("Delete subtree under %u\n", Node); 28 | T.LogMe(); 29 | T.DeleteSubtree(Node, "DEL", false); 30 | Log("\nAfter delete:\n"); 31 | T.LogMe(); 32 | T.Validate(); 33 | ++g_OkCount; 34 | ProgressLog("Ok\n"); 35 | } 36 | 37 | static void TestT1(TreeX &T) 38 | { 39 | asserta(T.IsRootedBinary()); 40 | 41 | string Str; 42 | T.ToNewickStr(Str, false); 43 | const uint NIC = T.GetNodeIndexCount(); 44 | vector Nodes; 45 | for (uint i = 0; i < NIC; ++i) 46 | Nodes.push_back(i); 47 | Shuffle(Nodes); 48 | for (uint i = 0; i < NIC; ++i) 49 | { 50 | uint Node = Nodes[i]; 51 | if (T.IsLeaf(Node)) 52 | continue; 53 | if (Node == T.m_Origin) 54 | continue; 55 | TestT1n(Node, Str); 56 | } 57 | } 58 | 59 | static void Test1(const string &Str) 60 | { 61 | TreeX T; 62 | T.FromNewickStr(Str); 63 | TestT1(T); 64 | } 65 | 66 | static void TestHand() 67 | { 68 | Test1("((A,B),(C,D));"); 69 | Test1("((A,B),(C,D));"); 70 | Test1("((A,B),(C,D));"); 71 | Test1("((X,Y),(C,D));"); 72 | } 73 | 74 | static void TestR(uint N) 75 | { 76 | vector Labels; 77 | 78 | for (uint i = 0; i < N; ++i) 79 | { 80 | string Label; 81 | Ps(Label, "A%u", i); 82 | Labels.push_back(Label); 83 | } 84 | 85 | TreeX T; 86 | GenerateRandomTree(Labels, true, 1.0, 1.0, T); 87 | 88 | TestT1(T); 89 | } 90 | 91 | static void TestRs() 92 | { 93 | ResetRand(1); 94 | const uint ITERS = 100; 95 | const uint K = 100; 96 | for (uint Iter = 0; Iter < ITERS; ++Iter) 97 | { 98 | ProgressStep(Iter, ITERS, "Testing"); 99 | uint N = randu32()%K + K; 100 | TestR(N); 101 | } 102 | } 103 | 104 | void 
cmd_testdeletesubtree() 105 | { 106 | opt(testdeletesubtree); 107 | //TestT1n(12, 108 | //"(A0:1,(((A4:1,A8:1):1,(A2:1,A5:1):1):1,(A1:1,(A7:1,(A3:1,A6:1):1):1):1):1);"); 109 | // TestHand(); 110 | TestRs(); 111 | return; 112 | } 113 | 114 | #else // TEST 115 | void cmd_testrandomssubtree() {} 116 | #endif // TEST 117 | -------------------------------------------------------------------------------- /src/getlcasubtrees.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | void SetFeatureTable(const TreeN &T, FeatureTable &FT); 6 | void GetLCAs(const TreeN &T, const FeatureTable &FT, bool AllowInvert, 7 | vector &Values, vector &LCAs, vector &GroupSizes, 8 | vector &SubtreeSizes, vector &FPs, vector &FNs, 9 | vector &Inverts, vector &MonoFs); 10 | void UpdateLCALabel(const string &OldLabel, const string &Value, 11 | bool IsLeaf, string &NewLabel); 12 | 13 | void cmd_getlcasubtrees() 14 | { 15 | vector Trees; 16 | TreesFromFile(opt(getlcasubtrees), Trees); 17 | FILE *fFev = CreateStdioFile(opt(fevout)); 18 | 19 | const uint TreeCount = SIZE(Trees); 20 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 21 | { 22 | TreeN &T = *Trees[TreeIndex]; 23 | 24 | FeatureTable FT; 25 | SetFeatureTable(T, FT); 26 | 27 | vector Values; 28 | vector LCAs; 29 | vector GroupSizes; 30 | vector SubtreeSizes; 31 | vector FPs; 32 | vector FNs; 33 | vector Inverts; 34 | vector MonoFs; 35 | GetLCAs(T, FT, false, Values, LCAs, GroupSizes, SubtreeSizes, 36 | FPs, FNs, Inverts, MonoFs); 37 | 38 | if (fFev != 0) 39 | { 40 | const uint ValueCount = SIZE(Values); 41 | asserta(SIZE(LCAs) == ValueCount); 42 | asserta(SIZE(FPs) == ValueCount); 43 | asserta(SIZE(FNs) == ValueCount); 44 | for (uint ValueIndex = 0; ValueIndex < ValueCount; ++ValueIndex) 45 | { 46 | const string &Value = Values[ValueIndex]; 47 | uint LCA = LCAs[ValueIndex]; 48 | if (LCA == UINT_MAX) 49 | { 50 
| fprintf(fFev, "feature=%s\tsubtree_size=0\n", Value.c_str()); 51 | continue; 52 | } 53 | uint SubtreeSize = SubtreeSizes[ValueIndex]; 54 | uint N = GroupSizes[ValueIndex]; 55 | uint FP = FPs[ValueIndex]; 56 | uint FN = FNs[ValueIndex]; 57 | fprintf(fFev, "feature=%s\tsubtree_size=%u\tN=%u\tFP=%u\tFN=%u\n", 58 | Value.c_str(), SubtreeSize, N, FP, FN); 59 | vector Labels; 60 | T.GetSubtreeSortedLeafLabels(LCA, Labels); 61 | for (uint i = 0; i < SIZE(Labels); ++i) 62 | { 63 | const string &Label = Labels[i]; 64 | string LabelValue; 65 | FT.GetValue_ByLabel(Label, LabelValue, false); 66 | fprintf(fFev, "feature_label=%s\tleaf=%u\tlabel=%s\tvalue=%s", 67 | Value.c_str(), i, Labels[i].c_str(), LabelValue.c_str()); 68 | if (LabelValue == Value) 69 | fprintf(fFev, "\tcorrect=TP"); 70 | else 71 | fprintf(fFev, "\tcorrect=FP"); 72 | fprintf(fFev, "\n"); 73 | } 74 | } 75 | } 76 | } 77 | 78 | CloseStdioFile(fFev); 79 | } 80 | -------------------------------------------------------------------------------- /src/svg.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "svg.h" 3 | 4 | void Svg::Triangle(double x1, double y1, double x2, double y2, 5 | double x3, double y3, double LineWidth, 6 | const string &LineColor, const string &FillColor) 7 | { 8 | fprintf(m_f, 9 | "", 18 | x1, y1, x2, y2, x3, y3, LineWidth, LineColor.c_str(), FillColor.c_str()); 19 | } 20 | 21 | void Svg::Rect(double x, double y, double w, double h, 22 | double LineWidth, const string &LineColor, 23 | const string &FillColor) 24 | { 25 | if (m_f == 0) 26 | return; 27 | 28 | fprintf(m_f, 29 | "\n", 38 | x, y, w, h, LineWidth, LineColor.c_str(), FillColor.c_str()); 39 | } 40 | 41 | void Svg::Text(double x, double y, const string &FontFamily, 42 | double FontSize, const string &FontWeight, 43 | const string &FillColor, const string &TextAnchor, 44 | const string &Str) 45 | { 46 | if (m_f == 0) 47 | return; 48 | 49 | fprintf(m_f, 50 | "" 59 
| "%s" 60 | "\n", 61 | x, y, FontFamily.c_str(), 62 | FontSize, FontWeight.c_str(), FillColor.c_str(), 63 | TextAnchor.c_str(), Str.c_str()); 64 | } 65 | 66 | void Svg::Line(double x1, double y1, double x2, double y2, 67 | double StrokeWidth, const string &Color) 68 | { 69 | if (m_f == 0) 70 | return; 71 | 72 | fprintf(m_f, 73 | "\n", 81 | x1, y1, x2, y2, StrokeWidth, Color.c_str()); 82 | } 83 | 84 | void Svg::Open(const string &FileName, double Width, double Height) 85 | { 86 | m_f = CreateStdioFile(FileName); 87 | m_Width = Width; 88 | m_Height = Height; 89 | fprintf(m_f, "\n"); 90 | fprintf(m_f, "\n", 91 | m_Width, m_Height); 92 | } 93 | 94 | void Svg::Close() 95 | { 96 | if (m_f == 0) 97 | return; 98 | fprintf(m_f, "\n"); 99 | CloseStdioFile(m_f); 100 | m_f = 0; 101 | } 102 | -------------------------------------------------------------------------------- /src/deleteoutgroup.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | void StringsFromFile(const string &FileName, vector &Strings); 6 | void TreesFromFile(const string &FileName, vector &Trees); 7 | void TreesToFile(const vector &Trees, const string &FileName); 8 | 9 | void cmd_deleteoutgroup() 10 | { 11 | const string &InputFileName = opt(deleteoutgroup); 12 | 13 | vector Trees; 14 | TreesFromFile(InputFileName, Trees); 15 | const uint TreeCount = SIZE(Trees); 16 | 17 | vector GroupLabelsVec; 18 | set GroupLabels; 19 | string GroupName; 20 | string LabelSubstr; 21 | 22 | if (optset_labels) 23 | { 24 | StringsFromFile(opt(labels), GroupLabelsVec); 25 | for (uint i = 0; i < SIZE(GroupLabelsVec); ++i) 26 | GroupLabels.insert(GroupLabelsVec[i]); 27 | } 28 | else if (optset_outgroup) 29 | GroupName = opt(outgroup); 30 | else 31 | Die("Must specify -labels or -outgroup"); 32 | 33 | vector NewTrees; 34 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 35 | { 36 | TreeN &T = 
*Trees[TreeIndex]; 37 | 38 | bool Rooted = false; 39 | bool Binary = T.IsBinary(Rooted); 40 | asserta(Rooted && Binary); 41 | 42 | vector LeafNodes; 43 | T.GetLeafNodes(LeafNodes); 44 | const uint LeafCount = SIZE(LeafNodes); 45 | 46 | set GroupLeafNodes; 47 | if (optset_labels) 48 | T.GetGroupLeafNodes(GroupLabels, GroupLeafNodes); 49 | else if (optset_outgroup) 50 | T.GetGroupLeafNodes(GroupName, GroupLeafNodes); 51 | else 52 | asserta(false); 53 | 54 | if (GroupLeafNodes.empty()) 55 | Die("Group not found"); 56 | 57 | uint Root = T.GetRoot(); 58 | uint RootLeft = T.GetLeft(Root); 59 | uint RootRight = T.GetRight(Root); 60 | 61 | vector LeftLeaves; 62 | vector RightLeaves; 63 | T.AppendSubtreeLeafNodes(RootLeft, LeftLeaves); 64 | T.AppendSubtreeLeafNodes(RootRight, RightLeaves); 65 | 66 | uint LeftCount = 0; 67 | uint RightCount = 0; 68 | for (uint i = 0; i < SIZE(LeftLeaves); ++i) 69 | { 70 | uint LeafNode = LeftLeaves[i]; 71 | if (GroupLeafNodes.find(LeafNode) != GroupLeafNodes.end()) 72 | ++LeftCount; 73 | } 74 | 75 | for (uint i = 0; i < SIZE(RightLeaves); ++i) 76 | { 77 | uint LeafNode = RightLeaves[i]; 78 | if (GroupLeafNodes.find(LeafNode) != GroupLeafNodes.end()) 79 | ++RightCount; 80 | } 81 | asserta(LeftCount > 0 || RightCount > 0); 82 | 83 | TreeN &NewTree = *new TreeN; 84 | if (LeftCount > RightCount) 85 | NewTree.FromSubtree(T, RootRight); 86 | else 87 | NewTree.FromSubtree(T, RootLeft); 88 | 89 | if (optset_root_label) 90 | NewTree.UpdateLabel(NewTree.m_Root, opt(root_label)); 91 | 92 | NewTrees.push_back(&NewTree); 93 | } 94 | 95 | TreesToFile(NewTrees, opt(output)); 96 | } 97 | -------------------------------------------------------------------------------- /src/featuretablefromtree.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "tree2.h" 4 | #include "treen.h" 5 | #include "treex.h" 6 | 7 | void SetFeatureTable(const TreeN &T, FeatureTable 
&FT) 8 | { 9 | if (optset_ff) 10 | { 11 | const string &FF = opt(ff); 12 | if (SIZE(FF) != 2 || !isdigit(FF[1])) 13 | Die("Invalid ff"); 14 | 15 | char Sep = FF[0]; 16 | char Digit = FF[1]; 17 | if (Digit == '0') 18 | Die("Invalid ff (field must be >0)"); 19 | uint FieldIndex = uint(Digit - '0') - 1; 20 | FT.FromTree(T, Sep, FieldIndex); 21 | } 22 | else if (optset_features) 23 | { 24 | FT.FromFile(opt(features)); 25 | FT.SetLeafNodeSet(T); 26 | } 27 | else 28 | Die("Must set -ff or -features"); 29 | } 30 | 31 | void SetFeatureTable(const TreeX &T, FeatureTable &FT) 32 | { 33 | if (optset_ff) 34 | { 35 | const string &FF = opt(ff); 36 | if (SIZE(FF) != 2 || !isdigit(FF[1])) 37 | Die("Invalid ff"); 38 | 39 | char Sep = FF[0]; 40 | char Digit = FF[1]; 41 | if (Digit == '0') 42 | Die("Invalid ff (field must be >0)"); 43 | uint FieldIndex = uint(Digit - '0') - 1; 44 | FT.FromTree(T, Sep, FieldIndex); 45 | } 46 | else if (optset_features) 47 | { 48 | FT.FromFile(opt(features)); 49 | FT.SetLeafNodeSet(T); 50 | } 51 | else 52 | Die("Must set -ff or -features"); 53 | } 54 | 55 | void SetFeatureTable2(const Tree2 &T2, FeatureTable &FT) 56 | { 57 | TreeN TN; 58 | TN.FromTree2(T2); 59 | SetFeatureTable(TN, FT); 60 | } 61 | 62 | void AppendValuesFromTree(const TreeN &T, char Sep, uint FieldIndex, 63 | const string &MissingValue, set &Values) 64 | { 65 | const uint NodeCount = T.GetNodeCount(); 66 | vector Nodes; 67 | T.GetNodes(Nodes); 68 | 69 | for (uint k = 0; k < NodeCount; ++k) 70 | { 71 | uint Node = Nodes[k]; 72 | if (!T.IsLeaf(Node)) 73 | continue; 74 | const string &FullLabel = T.GetLabel(Node); 75 | if (FullLabel.empty()) 76 | continue; 77 | 78 | vector Fields; 79 | Split(FullLabel, Fields, Sep); 80 | if (SIZE(Fields) <= FieldIndex) 81 | continue; 82 | 83 | const string &Value = Fields[FieldIndex]; 84 | if (Value.empty() || Value == MissingValue) 85 | continue; 86 | 87 | Values.insert(Value); 88 | } 89 | } 90 | 91 | void GetValuesFromTrees(vector &Trees, char Sep, 
uint FieldIndex, 92 | const string &MissingValue, vector &Values) 93 | { 94 | Values.clear(); 95 | 96 | set setValues; 97 | const uint N = SIZE(Trees); 98 | for (uint i = 0; i < N; ++i) 99 | AppendValuesFromTree(*Trees[i], Sep, FieldIndex, MissingValue, 100 | setValues); 101 | 102 | for (set::const_iterator p = setValues.begin(); 103 | p != setValues.end(); ++p) 104 | Values.push_back(*p); 105 | } 106 | -------------------------------------------------------------------------------- /src/bestfitsubtree.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | void SetFeatureTable(const TreeN &T, FeatureTable &FT); 6 | void GetGroupLeafNodeSet(const TreeN &T, const FeatureTable &FT, 7 | uint ValueIndex, set &LeafNodeSet); 8 | 9 | void cmd_bestfitsubtree() 10 | { 11 | TreeN T; 12 | T.FromNewickFile(opt(bestfitsubtree)); 13 | 14 | asserta(optset_label); 15 | const string &TheValue = opt(label); 16 | 17 | bool Rooted; 18 | bool Binary = T.IsBinary(Rooted); 19 | asserta(Rooted); 20 | const uint NodeCount = T.GetNodeCount(); 21 | 22 | FeatureTable FT; 23 | SetFeatureTable(T, FT); 24 | uint ValueCount = FT.GetValueCount(); 25 | 26 | const uint TheValueIndex = FT.GetValueIndex(TheValue); 27 | asserta(TheValueIndex < ValueCount); 28 | 29 | vector NodeToValue(NodeCount, UINT_MAX); 30 | uint TheValueSize = 0; 31 | for (uint Node = 0; Node < NodeCount; ++Node) 32 | { 33 | if (!T.IsLeaf(Node)) 34 | continue; 35 | const string &Label = T.GetLabel(Node); 36 | uint ValueIndex = FT.GetValueIndex_ByLabel(Label); 37 | if (ValueIndex == TheValueIndex) 38 | ++TheValueSize; 39 | NodeToValue[Node] = ValueIndex; 40 | } 41 | 42 | Log(" Node Label TP FP FN Errs\n"); 43 | // 12345 123456789012 12345 12345 12345 12345 44 | 45 | uint BestNode = UINT_MAX; 46 | uint BestErrs = UINT_MAX; 47 | for (uint SubtreeNode = 0; SubtreeNode < NodeCount; ++SubtreeNode) 48 | { 49 | 
asserta(T.IsNode(SubtreeNode)); 50 | 51 | const string &SubtreeLabel = T.GetLabel(SubtreeNode); 52 | 53 | vector SubtreeLeafNodes; 54 | T.AppendSubtreeLeafNodes(SubtreeNode, SubtreeLeafNodes); 55 | const uint N = SIZE(SubtreeLeafNodes); 56 | 57 | vector ValueToCount(ValueCount, 0); 58 | uint SubtreeSize = 0; 59 | uint TP = 0; 60 | uint FP = 0; 61 | for (uint i = 0; i < N; ++i) 62 | { 63 | uint Leaf = SubtreeLeafNodes[i]; 64 | uint Value = NodeToValue[Leaf]; 65 | if (Value != UINT_MAX) 66 | { 67 | asserta(Value < ValueCount); 68 | ++(ValueToCount[Value]); 69 | ++SubtreeSize; 70 | if (Value == TheValueIndex) 71 | ++TP; 72 | else 73 | ++FP; 74 | } 75 | } 76 | 77 | if (TP < TheValueSize/4) 78 | continue; 79 | 80 | asserta(TP == ValueToCount[TheValueIndex]); 81 | asserta(TP <= TheValueSize); 82 | uint FN = TheValueSize - TP; 83 | uint Errs = FP + FN; 84 | if (BestNode == UINT_MAX || Errs < BestErrs) 85 | { 86 | BestNode = SubtreeNode; 87 | BestErrs = Errs; 88 | } 89 | 90 | Log("%5u", SubtreeNode); 91 | Log(" %12.12s", SubtreeLabel.c_str()); 92 | Log(" %5u", TP); 93 | Log(" %5u", FP); 94 | Log(" %5u", FN); 95 | Log(" %5u", Errs); 96 | 97 | for (uint ValueIndex = 0; ValueIndex < ValueCount; ++ValueIndex) 98 | { 99 | if (ValueIndex == TheValueIndex) 100 | continue; 101 | const string &Value = FT.GetValue(ValueIndex); 102 | uint n = ValueToCount[ValueIndex]; 103 | if (n > 0) 104 | Log(" %s=%u", Value.c_str(), n); 105 | } 106 | 107 | Log("\n"); 108 | } 109 | 110 | Log("BestNode %u\n", BestNode); 111 | } 112 | -------------------------------------------------------------------------------- /src/myopts.h: -------------------------------------------------------------------------------- 1 | #ifndef MY_VERSION 2 | #define MY_VERSION "1.0" 3 | #endif 4 | 5 | #define PROGRAM_NAME "newick" 6 | 7 | #define A(x) STR_OPT(x) 8 | #include "cmds.h" 9 | 10 | #ifndef VECTOR_OPT 11 | #define VECTOR_OPT(x) /* empty */ 12 | #endif 13 | 14 | STR_OPT(log) 15 | STR_OPT(tsvout) 16 | 
STR_OPT(fevout) 17 | STR_OPT(label) 18 | STR_OPT(labels) 19 | STR_OPT(labels2) 20 | STR_OPT(tree) 21 | STR_OPT(tree2) 22 | STR_OPT(report) 23 | STR_OPT(output) 24 | STR_OPT(features) 25 | STR_OPT(nodes) 26 | STR_OPT(cat) 27 | STR_OPT(value) 28 | STR_OPT(edge) 29 | STR_OPT(svg) 30 | STR_OPT(colors) 31 | STR_OPT(legend) 32 | STR_OPT(prefix) 33 | STR_OPT(pattern) 34 | STR_OPT(labelsout) 35 | STR_OPT(default_color) 36 | STR_OPT(title) 37 | STR_OPT(ref) 38 | STR_OPT(trees) 39 | STR_OPT(root_label) 40 | STR_OPT(old_root_label) 41 | STR_OPT(outgroup) 42 | STR_OPT(ff) 43 | STR_OPT(taxtable) 44 | STR_OPT(triangles) 45 | STR_OPT(squares) 46 | STR_OPT(titles) 47 | STR_OPT(labeldx) 48 | STR_OPT(order) 49 | STR_OPT(input2) 50 | 51 | UNS_OPT(node, 0, 0, UINT_MAX) 52 | UNS_OPT(bif, 0, 0, UINT_MAX) 53 | UNS_OPT(threads, 0, 0, UINT_MAX) 54 | UNS_OPT(randseed, 0, 0, UINT_MAX) 55 | UNS_OPT(strokewidth, 0, 0, UINT_MAX) 56 | UNS_OPT(n, 0, 0, UINT_MAX) 57 | UNS_OPT(subsetsize, 0, 0, UINT_MAX) 58 | UNS_OPT(maxpercluster, 0, 0, UINT_MAX) 59 | UNS_OPT(offsetx, 0, 0, UINT_MAX) 60 | UNS_OPT(offsety, 0, 0, UINT_MAX) 61 | UNS_OPT(trees_per_row, 0, 0, UINT_MAX) 62 | UNS_OPT(tree_width, 0, 0, UINT_MAX) 63 | UNS_OPT(tree_height, 0, 0, UINT_MAX) 64 | UNS_OPT(tree_spacing, 0, 0, UINT_MAX) 65 | UNS_OPT(title_font_size, 0, 0, UINT_MAX) 66 | UNS_OPT(label_font_size, 0, 0, UINT_MAX) 67 | UNS_OPT(min_group_size, 0, 0, UINT_MAX) 68 | UNS_OPT(treeix, 0, 0, UINT_MAX) 69 | UNS_OPT(nodeix, 0, 0, UINT_MAX) 70 | UNS_OPT(scalex, 0, 0, UINT_MAX) 71 | UNS_OPT(scaley, 0, 0, UINT_MAX) 72 | UNS_OPT(pixelsperunit, 0, 0, UINT_MAX) 73 | UNS_OPT(kfold, 0, 0, UINT_MAX) 74 | 75 | FLT_OPT(maxdist, 0.2, 0.0, 1.0) 76 | FLT_OPT(minlength, 0.0, 0.0, FLT_MAX) 77 | FLT_OPT(maxlength, 0.0, 0.0, FLT_MAX) 78 | FLT_OPT(length, 0.0, 0.0, FLT_MAX) 79 | FLT_OPT(minconf, 0.0, 0.0, FLT_MAX) 80 | FLT_OPT(majorityfract, 0.0, 0.0, FLT_MAX) 81 | FLT_OPT(mintpfract, 0.0, 0.0, FLT_MAX) 82 | FLT_OPT(minmedleafdist, 0.0, 0.0, FLT_MAX) 83 | 
FLT_OPT(maxmedleafdist, 0.0, 0.0, FLT_MAX) 84 | FLT_OPT(minboot, 0.0, 0.0, FLT_MAX) 85 | 86 | FLAG_OPT(quiet) 87 | FLAG_OPT(log_used_opts) 88 | FLAG_OPT(compilerinfo) 89 | FLAG_OPT(strict_newick) 90 | FLAG_OPT(right) 91 | FLAG_OPT(accs) 92 | FLAG_OPT(trace_parse) 93 | FLAG_OPT(trace_lex) 94 | FLAG_OPT(rooted) 95 | FLAG_OPT(self) 96 | FLAG_OPT(delete_labels) 97 | FLAG_OPT(lcaconfs) 98 | FLAG_OPT(unitlengths) 99 | FLAG_OPT(draw_leaf_labels) 100 | FLAG_OPT(draw_internal_labels) 101 | FLAG_OPT(internal_labels_pct) 102 | FLAG_OPT(title16x) 103 | FLAG_OPT(log_tree) 104 | FLAG_OPT(savetrees) 105 | FLAG_OPT(allow_blank_labels) 106 | 107 | VECTOR_OPT(_notused_vector) 108 | 109 | #undef FLAG_OPT 110 | #undef UNS_OPT 111 | #undef FLT_OPT 112 | #undef STR_OPT 113 | #undef VECTOR_OPT 114 | -------------------------------------------------------------------------------- /src/syncftacc.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | #include 4 | 5 | void MakeAccMaps(const vector &Labels, 6 | map &LabelToAcc, 7 | map &AccToLabel) 8 | { 9 | LabelToAcc.clear(); 10 | AccToLabel.clear(); 11 | 12 | for (uint i = 0; i < SIZE(Labels); ++i) 13 | { 14 | const string &Label = Labels[i]; 15 | string Acc; 16 | GetAccFromLabel(Label, Acc); 17 | if (LabelToAcc.find(Label) != LabelToAcc.end()) 18 | Die("Dup label >%s", Label.c_str()); 19 | if (AccToLabel.find(Acc) != AccToLabel.end()) 20 | Die("Dup acc >%s", Acc.c_str()); 21 | AccToLabel[Acc] = Label; 22 | LabelToAcc[Label] = Acc; 23 | } 24 | } 25 | 26 | void cmd_syncftacc() 27 | { 28 | const string &FN = opt(syncftacc); 29 | FILE *fOut = CreateStdioFile(opt(output)); 30 | if (fOut == 0) 31 | Die("Failed to create output file"); 32 | 33 | Tree2 T; 34 | T.FromNewickFile(opt(tree)); 35 | set TreeLabels; 36 | vector TreeLabelVec; 37 | 38 | const uint NodeCount = T.GetNodeCount(); 39 | for (uint NodeIndex = 0; NodeIndex < NodeCount; ++NodeIndex) 40 | { 41 | if 
(!T.IsLeaf(NodeIndex)) 42 | continue; 43 | const string Label = T.GetLabel(NodeIndex); 44 | if (TreeLabels.find(Label) != TreeLabels.end()) 45 | Die("Dupe label >%s", Label.c_str()); 46 | TreeLabels.insert(Label); 47 | TreeLabelVec.push_back(Label); 48 | } 49 | 50 | map TreeAccToLabel; 51 | map TreeLabelToAcc; 52 | MakeAccMaps(TreeLabelVec, TreeLabelToAcc, TreeAccToLabel); 53 | 54 | FILE *fIn = OpenStdioFile(FN); 55 | 56 | string Line; 57 | vector Fields; 58 | set FoundLabels; 59 | uint Found = 0; 60 | uint NotFound = 0; 61 | uint CatCount = UINT_MAX; 62 | string HdrLine; 63 | ReadLineStdioFile(fIn, HdrLine); 64 | fprintf(fOut, "%s\n", HdrLine.c_str()); 65 | while (ReadLineStdioFile(fIn, Line)) 66 | { 67 | Split(Line, Fields, '\t'); 68 | uint FieldCount = SIZE(Fields); 69 | if (CatCount == UINT_MAX) 70 | CatCount = FieldCount - 1; 71 | else 72 | asserta(FieldCount == CatCount + 1); 73 | 74 | const string &Label = Fields[0]; 75 | string Acc; 76 | GetAccFromLabel(Label, Acc); 77 | map::const_iterator p = TreeAccToLabel.find(Acc); 78 | if (p == TreeAccToLabel.end()) 79 | { 80 | ++NotFound; 81 | ProgressLog("Deleted >%s\n", Label.c_str()); 82 | continue; 83 | } 84 | ++Found; 85 | const string &TreeLabel = p->second; 86 | FoundLabels.insert(TreeLabel); 87 | fprintf(fOut, "%s", TreeLabel.c_str()); 88 | for (uint i = 1; i < FieldCount; ++i) 89 | fprintf(fOut, "\t%s", Fields[i].c_str()); 90 | fprintf(fOut, "\n"); 91 | } 92 | asserta(SIZE(FoundLabels) == Found); 93 | 94 | set Missing; 95 | for (set::const_iterator p = TreeLabels.begin(); 96 | p != TreeLabels.end(); ++p) 97 | { 98 | const string &TreeLabel = *p; 99 | if (FoundLabels.find(TreeLabel) != FoundLabels.end()) 100 | continue; 101 | fprintf(fOut, "%s", TreeLabel.c_str()); 102 | for (uint i = 0; i < CatCount; ++i) 103 | fprintf(fOut, "\t."); 104 | fprintf(fOut, "\n"); 105 | ProgressLog("Inserted >%s\n", TreeLabel.c_str()); 106 | } 107 | 108 | CloseStdioFile(fIn); 109 | } 110 | 
-------------------------------------------------------------------------------- /src/consensusx.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treex.h" 3 | #include "consmakerx.h" 4 | 5 | #define TEST 0 6 | 7 | #if TEST 8 | 9 | void GenerateRandomTree(const vector &LeafLabels, bool Rooted, 10 | double MinLength, double MaxLength, TreeX &T); 11 | 12 | static void TestT12(const TreeX &T1, const TreeX &T2) 13 | { 14 | const bool Verbose = false; 15 | if (Verbose) 16 | { 17 | Log("\n________ T1 _____________\n"); 18 | T1.DrawText(); 19 | 20 | Log("\n________ T2 _____________\n"); 21 | T2.DrawText(); 22 | } 23 | 24 | ConsMakerX CM; 25 | CM.MakeConsensus(T1, T2); 26 | 27 | if (Verbose) 28 | { 29 | Log("\n________ CT _____________\n"); 30 | CM.m_ConsTree.DrawText(); 31 | } 32 | } 33 | 34 | static void Test12(const string &Str1, const string &Str2) 35 | { 36 | TreeX T1; 37 | TreeX T2; 38 | 39 | T1.FromNewickStr(Str1); 40 | T2.FromNewickStr(Str2); 41 | 42 | TestT12(T1, T2); 43 | } 44 | 45 | static void Test1(const string &Str1, const string &Str2) 46 | { 47 | Test12(Str1, Str2); 48 | Test12(Str2, Str1); 49 | } 50 | 51 | static void TestHand() 52 | { 53 | Test1("((A,B),(C,D));", "((A,B),(C,E));"); 54 | Test1("((A,B),(C,D));", "((E,F),(G,H));"); 55 | Test1("((A,B),(C,D));", "((E,F),(G,H));"); 56 | Test1("((X,Y),(C,D));", "(X,Y);"); 57 | } 58 | 59 | static void TestR(uint N, uint M1, uint M2) 60 | { 61 | vector Labels1; 62 | vector Labels2; 63 | 64 | for (uint i = 0; i < N; ++i) 65 | { 66 | string Label; 67 | Ps(Label, "A%u", i); 68 | Labels1.push_back(Label); 69 | Labels2.push_back(Label); 70 | } 71 | 72 | for (uint i = 0; i < M1; ++i) 73 | { 74 | string Label; 75 | Ps(Label, "b%u", i); 76 | Labels1.push_back(Label); 77 | } 78 | 79 | for (uint i = 0; i < M2; ++i) 80 | { 81 | string Label; 82 | Ps(Label, "c%u", i); 83 | Labels2.push_back(Label); 84 | } 85 | 86 | TreeX T1; 87 | TreeX T2; 88 | 
GenerateRandomTree(Labels1, true, 1.0, 1.0, T1); 89 | GenerateRandomTree(Labels2, true, 2.0, 2.0, T2); 90 | 91 | TestT12(T1, T2); 92 | TestT12(T2, T1); 93 | } 94 | 95 | static void TestRs() 96 | { 97 | ResetRand(1); 98 | const uint ITERS = 100; 99 | const uint K = 512; 100 | for (uint Iter = 0; Iter < ITERS; ++Iter) 101 | { 102 | ProgressStep(Iter, ITERS, "Testing"); 103 | uint N = randu32()%K + K; 104 | uint M1 = randu32()%K; 105 | uint M2 = randu32()%K; 106 | TestR(N, M1, M2); 107 | 108 | N = 0; 109 | M1 = 3 + randu32()%K; 110 | M2 = 3 + randu32()%K; 111 | TestR(N, M1, M2); 112 | 113 | N = 3 + randu32()%K; 114 | M1 = randu32()%K; 115 | M2 = randu32()%K; 116 | TestR(N, M1, M2); 117 | } 118 | } 119 | 120 | void cmd_consensus2x() 121 | { 122 | opt(consensus2x); 123 | TestRs(); 124 | return; 125 | } 126 | 127 | #else // TEST 128 | 129 | void cmd_consensus2x() 130 | { 131 | const string &InputFileName = opt(consensus2x); 132 | const string &Input2FileName = opt(input2); 133 | const string &OutputFileName = opt(output); 134 | 135 | TreeX Tree1; 136 | TreeX Tree2; 137 | 138 | Tree1.FromNewickFile(InputFileName); 139 | Tree2.FromNewickFile(Input2FileName); 140 | 141 | ConsMakerX CM; 142 | CM.MakeConsensus(Tree1, Tree2); 143 | 144 | CM.m_ConsTree.ToNewickFile(OutputFileName); 145 | } 146 | 147 | #endif // TEST 148 | -------------------------------------------------------------------------------- /src/cluster2.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "clustermaker.h" 3 | 4 | void Shuffle(vector &v); 5 | 6 | uint ClusterMaker::GetSubsetSize(uint MaxPerCluster) const 7 | { 8 | uint Size = 0; 9 | const uint SelectedNodeCount = SIZE(m_SubtreeNodes); 10 | for (uint i = 0; i < SelectedNodeCount; ++i) 11 | { 12 | uint Node = m_SubtreeNodes[i]; 13 | const vector &LeafNodes = m_SubtreeLeafNodesVec[i]; 14 | const uint SubtreeLeafCount = SIZE(LeafNodes); 15 | if (SubtreeLeafCount >= 3) 16 | Size += 3; 
17 | else 18 | { 19 | asserta(SubtreeLeafCount == 1 || SubtreeLeafCount == 2); 20 | Size += SubtreeLeafCount; 21 | } 22 | } 23 | return Size; 24 | } 25 | 26 | void cmd_cluster2() 27 | { 28 | const string &TreeFileName = opt(cluster2); 29 | Tree2 T; 30 | T.FromFile(TreeFileName); 31 | 32 | asserta(optset_subsetsize); 33 | asserta(optset_maxpercluster); 34 | const uint TargetSubsetSize = opt(subsetsize); 35 | const uint MaxPerCluster = opt(maxpercluster); 36 | 37 | ClusterMaker CM; 38 | uint SizeLo = UINT_MAX; 39 | uint SizeHi = UINT_MAX; 40 | double DistLo = -1; 41 | double DistHi = -1; 42 | 43 | const uint ITERS1 = 100; 44 | const float FACTOR = 1.1; 45 | double d = 0.01; 46 | for (uint Iter = 0; Iter < ITERS1; ++Iter) 47 | { 48 | CM.Run(T, d); 49 | uint Size = CM.GetSubsetSize(MaxPerCluster); 50 | ProgressStep(Iter, ITERS1, "Pass 1 d=%.3g size=%u", d, Size); 51 | if (Size < TargetSubsetSize) 52 | { 53 | DistLo = d; 54 | SizeLo = Size; 55 | ProgressStep(ITERS1-1, ITERS1, "Pass 1 d=%.3g size=%u", d, Size); 56 | break; 57 | } 58 | if (Size > TargetSubsetSize) 59 | { 60 | DistHi = d; 61 | SizeHi = Size; 62 | } 63 | d *= FACTOR; 64 | } 65 | 66 | asserta(DistLo > DistHi); 67 | asserta(DistLo != DistHi); 68 | const uint ITERS2 = 100; 69 | double Delta = (DistLo - DistHi)/100; 70 | uint BestSize = SizeLo; 71 | uint BestAbsDiff = UINT_MAX; 72 | double Bestd = DistLo; 73 | for (uint Iter = 0; Iter < ITERS2; ++Iter) 74 | { 75 | ProgressStep(Iter, ITERS2, "Pass 2 d=%.3g size=%u", Bestd, BestSize); 76 | double d = DistHi + Iter*Delta; 77 | CM.Run(T, d); 78 | uint Size = CM.GetSubsetSize(MaxPerCluster); 79 | uint AbsDiff = (Size > TargetSubsetSize ? 
80 | Size - TargetSubsetSize : 81 | TargetSubsetSize - Size); 82 | if (BestSize == UINT_MAX || AbsDiff < BestAbsDiff) 83 | { 84 | BestSize = Size; 85 | BestAbsDiff = AbsDiff; 86 | Bestd = d; 87 | if (BestAbsDiff == 0) 88 | { 89 | ProgressStep(ITERS2-1, ITERS2, "Pass 2 d=%.3g size=%u", Bestd, BestSize); 90 | break; 91 | } 92 | } 93 | } 94 | 95 | CM.Run(T, Bestd); 96 | uint Size = CM.GetSubsetSize(MaxPerCluster); 97 | ProgressLog("Bestd %.3g size %u\n", Bestd, Size); 98 | CM.ToTSV(opt(tsvout)); 99 | CM.ToNewick(opt(output)); 100 | 101 | if (!optset_labelsout) 102 | return; 103 | 104 | FILE *f = CreateStdioFile(opt(labelsout)); 105 | const uint N = SIZE(CM.m_SubtreeLeafNodesVec); 106 | for (uint i = 0; i < N; ++i) 107 | { 108 | vector LeafNodes = CM.m_SubtreeLeafNodesVec[i]; 109 | Shuffle(LeafNodes); 110 | uint n = SIZE(LeafNodes); 111 | asserta(n > 0); 112 | if (n > MaxPerCluster) 113 | n = MaxPerCluster; 114 | for (uint j = 0; j < n; ++j) 115 | { 116 | uint Node = LeafNodes[j]; 117 | const string &Label = T.GetLabel(Node); 118 | fprintf(f, "%s\n", Label.c_str()); 119 | } 120 | } 121 | CloseStdioFile(f); 122 | } 123 | -------------------------------------------------------------------------------- /src/diameter.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | #include "treen.h" 4 | 5 | double CalcDiameter(const Tree2 &T, string &Label1, string &Label2) 6 | { 7 | Tree2 C; 8 | C.FromTree(T); 9 | uint NodeCount = C.GetNodeCount(); 10 | if (!C.IsRooted()) 11 | { 12 | for (uint Node = 0; Node < NodeCount; ++Node) 13 | { 14 | if (!C.IsLeaf(Node)) 15 | { 16 | uint Node2 = C.GetEdge1(Node); 17 | C.SetRoot(Node, Node2); 18 | break; 19 | } 20 | } 21 | } 22 | asserta(C.IsRooted()); 23 | NodeCount = C.GetNodeCount(); 24 | uint NegativeLengthCount = 0; 25 | for (map, double>::iterator p = C.m_EdgeToLength.begin(); 26 | p != C.m_EdgeToLength.end(); ++p) 27 | { 28 | double Length = p->second; 
29 | if (Length != MISSING_LENGTH && Length < 0) 30 | { 31 | ++NegativeLengthCount; 32 | p->second = 0; 33 | } 34 | } 35 | if (NegativeLengthCount > 0) 36 | Warning("%u negative edge lengths set to zero", NegativeLengthCount); 37 | 38 | vector RootDists; 39 | C.GetRootDists(RootDists); 40 | asserta(SIZE(RootDists) == NodeCount); 41 | double MaxRootDist = 0; 42 | uint MaxNode = UINT_MAX; 43 | for (uint Node = 0; Node < NodeCount; ++Node) /* first sweep: node farthest from the current root */ 44 | { 45 | double RootDist = RootDists[Node]; 46 | if (RootDist > MaxRootDist) 47 | { 48 | MaxNode = Node; 49 | MaxRootDist = RootDist; 50 | } 51 | } 52 | if (MaxRootDist == 0) 53 | return 0; 54 | 55 | asserta(!C.IsRoot(MaxNode)); 56 | asserta(C.IsLeaf(MaxNode)); 57 | const string MaxLabel = C.GetLabel(MaxNode); 58 | asserta(MaxLabel != ""); 59 | 60 | C.Validate(); 61 | C.Unroot(); 62 | 63 | uint UnrootedMaxNode = C.GetNodeByLabel(MaxLabel, true); 64 | uint UnrootedMaxEdge1 = C.GetEdge1(UnrootedMaxNode); 65 | uint UnrootedMaxEdge2 = C.GetEdge2(UnrootedMaxNode); 66 | uint UnrootedMaxEdge3 = C.GetEdge3(UnrootedMaxNode); 67 | 68 | asserta(UnrootedMaxEdge1 != UINT_MAX); 69 | asserta(UnrootedMaxEdge2 == UINT_MAX); 70 | asserta(UnrootedMaxEdge3 == UINT_MAX); 71 | 72 | C.SetRoot(UnrootedMaxNode, UnrootedMaxEdge1); 73 | 74 | const uint UnrootedNodeCount = C.GetNodeCount(); 75 | C.GetRootDists(RootDists); 76 | asserta(SIZE(RootDists) == UnrootedNodeCount); 77 | 78 | double MaxRootDist2 = 0; 79 | uint MaxNode2 = UINT_MAX; 80 | for (uint Node = 0; Node < UnrootedNodeCount; ++Node) /* second sweep: node farthest from the first sweep's leaf */ 81 | { 82 | double RootDist = RootDists[Node]; 83 | if (RootDist > MaxRootDist2) /* compare with this sweep's running maximum, not the first sweep's MaxRootDist */ 84 | { 85 | MaxNode2 = Node; 86 | MaxRootDist2 = RootDist; 87 | } 88 | } 89 | if (MaxNode2 == UINT_MAX) 90 | return 0; 91 | asserta(MaxRootDist > 0); 92 | double Diameter = MaxRootDist2; 93 | Label1 = MaxLabel; 94 | Label2 = C.GetLabel(MaxNode2); 95 | return Diameter; 96 | } 97 | 98 | double CalcDiameter(const TreeN &T, string &Label1, string &Label2) 99 | { 100 | Tree2 T2; 101 | 
T2.FromTreeN(T); 102 | double Diameter = CalcDiameter(T2, Label1, Label2); 103 | return Diameter; 104 | } 105 | 106 | static void Test1(const string &NewickStr) 107 | { 108 | Tree2 T; 109 | T.FromStr(NewickStr); 110 | T.LogMe(); 111 | 112 | string Label1; 113 | string Label2; 114 | double d = CalcDiameter(T, Label1, Label2); 115 | ProgressLog("\n"); 116 | ProgressLog(" Tree %s\n", NewickStr.c_str()); 117 | ProgressLog(" Diam %.3g\n", d); 118 | ProgressLog("Leaf1 %s\n", Label1.c_str()); 119 | ProgressLog("Leaf2 %s\n", Label2.c_str()); 120 | } 121 | 122 | static void _cmd_test() 123 | { 124 | opt(test); 125 | Test1("((A:0.1,B:0.2):0.25,C:0.3);"); 126 | Test1("((A:0.1,B:0.2):0.25,(C:0.3,D:0.4):0.5);"); 127 | } 128 | -------------------------------------------------------------------------------- /src/findgroups.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | #include "sort.h" 4 | 5 | #if 0 6 | 7 | static double g_IdealUpperDist = 2.0; 8 | static uint g_MinGroupSize = 10; 9 | static const vector *g_RootDists; 10 | 11 | static double GetUpperLeafDist(TreeN &T, uint Node) 12 | { 13 | const vector &RootDists = *g_RootDists; 14 | vector LeafNodes; 15 | T.GetSubtreeLeafNodes(Node, LeafNodes); 16 | asserta(Node < SIZE(RootDists)); 17 | double ThisRootDist = RootDists[Node]; 18 | vector LeafDists; 19 | const uint N = SIZE(LeafNodes); 20 | for (uint i = 0; i < N; ++i) 21 | { 22 | uint Leaf = LeafNodes[i]; 23 | asserta(Leaf < SIZE(RootDists)); 24 | double d = RootDists[Leaf]; 25 | LeafDists.push_back(d - ThisRootDist); 26 | } 27 | vector Order; 28 | QuickSortInPlace(LeafDists.data(), N); 29 | double UpperDist = LeafDists[3*N/4]; 30 | return UpperDist; 31 | } 32 | 33 | static double GetScore_Size(uint N) 34 | { 35 | double Score = double(N)/(N + 10); 36 | return Score; 37 | } 38 | 39 | static double GetScore_UpperDist(double d) 40 | { 41 | const double ID = g_IdealUpperDist; 42 | const 
double ID4 = ID/4; 43 | 44 | double Err = 0; 45 | if (d < ID - ID4) 46 | Err = ID - ID4 - d; 47 | else if (d > ID - ID4) 48 | Err = d - (ID + ID4); 49 | double Score = 1.0 - Err/ID; 50 | if (Score < 0) 51 | Score = 0; 52 | return Score; 53 | } 54 | 55 | static double GetScore_Length(double Length) 56 | { 57 | double Score = Length/(Length + 0.5); 58 | return Score; 59 | } 60 | 61 | static double GetScore_Balance(uint NL, uint NR) 62 | { 63 | double MaxN = (double) max(NL, NR); 64 | double MinN = (double) min(NL, NR); 65 | 66 | double Score = (MaxN + 5.0)/(MinN + 5.0); 67 | return Score; 68 | } 69 | 70 | static double GetScore(TreeN &T, uint Node) 71 | { 72 | double UpperDist = GetUpperLeafDist(T, Node); 73 | double Length = T.GetLength(Node); 74 | uint Left = T.GetLeft(Node); 75 | uint Right = T.GetRight(Node); 76 | uint NL = T.GetSubtreeLeafCount(Left); 77 | uint NR = T.GetSubtreeLeafCount(Left); 78 | uint N = NL + NR; 79 | asserta(N >= g_MinGroupSize); 80 | 81 | double Score_Size = GetScore_Size(N); 82 | double Score_UpperDist = GetScore_UpperDist(UpperDist); 83 | double Score_Length = GetScore_Length(Length); 84 | double Score_Balance = GetScore_Balance(NL, NR); 85 | 86 | double Score1 = (Score_Size + Score_Length + Score_Balance)/3; 87 | double Score = Score1*Score_UpperDist; 88 | 89 | return Score; 90 | } 91 | 92 | static void DoTree(TreeN &T) 93 | { 94 | const uint NodeCount = T.GetNodeCount(); 95 | vector RootDists; 96 | T.GetRootDists(RootDists); 97 | g_RootDists = &RootDists; 98 | for (uint Node= 0; Node < NodeCount; ++Node) 99 | { 100 | if (T.IsLeaf(Node)) 101 | continue; 102 | uint SubtreeLeafNodeCount = T.GetSubtreeLeafCount(Node); 103 | if (SubtreeLeafNodeCount < g_MinGroupSize) 104 | continue; 105 | double Score = GetScore(T, Node); 106 | 107 | string sScore; 108 | Ps(sScore, "%u=%.3g", Node, Score); 109 | T.UpdateLabel(Node, sScore); 110 | } 111 | } 112 | 113 | void cmd_findgroups() 114 | { 115 | const string &InputFileName = opt(findgroups); 116 | 
const string &OutputFileName = opt(output); 117 | FILE *fOut = CreateStdioFile(OutputFileName); 118 | 119 | vector Trees; 120 | TreesFromFile(InputFileName, Trees); 121 | const uint TreeCount = SIZE(Trees); 122 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 123 | { 124 | TreeN &T = *Trees[TreeIndex]; 125 | DoTree(T); 126 | T.ToNewickFile(fOut); 127 | } 128 | CloseStdioFile(fOut); 129 | } 130 | #else 131 | // kind of a dead end 132 | void cmd_findgroups() {} 133 | #endif 134 | -------------------------------------------------------------------------------- /src/collapse.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treen.h" 4 | 5 | double GetConf(const string &Label, bool UsePcts) 6 | { 7 | double Conf = 0; 8 | if (Label.empty()) 9 | return -1.0; 10 | 11 | if (UsePcts) 12 | { 13 | if (!IsValidIntStr(Label)) 14 | Warning("Invalid integer '%s', set to zero", 15 | Label.c_str()); 16 | else 17 | { 18 | uint Pct = StrToUint(Label); 19 | if (Pct > 100) 20 | { 21 | Warning("Invalid percent '%s', set to zero", 22 | Label.c_str()); 23 | Pct = 0; 24 | } 25 | Conf = Pct/100.0; 26 | } 27 | } 28 | else 29 | { 30 | if (!IsValidFloatStr(Label)) 31 | { 32 | Warning("Invalid float '%s', set to zero", 33 | Label.c_str()); 34 | Conf = 0; 35 | } 36 | else 37 | { 38 | Conf = StrToFloat(Label); 39 | if (Conf < 0 || Conf > 1) 40 | { 41 | Warning("Invalid fraction '%s', set to zero", 42 | Label.c_str()); 43 | Conf = 0; 44 | } 45 | } 46 | } 47 | return Conf; 48 | } 49 | 50 | void GetFractConfs(const TreeN &T, vector &Confs) 51 | { 52 | Confs.clear(); 53 | 54 | vector Nodes; 55 | T.GetNodes(Nodes); 56 | const uint NodeCount = SIZE(Nodes); 57 | uint IntNodeCount = 0; 58 | uint PctCount = 0; 59 | uint FractCount = 0; 60 | uint MissingCount = 0; 61 | uint OtherCount = 0; 62 | for (uint k = 0; k < NodeCount; ++k) 63 | { 64 | uint Node = Nodes[k]; 65 | if (T.IsRoot(Node) || 
T.IsLeaf(Node)) 66 | continue; 67 | const string &Label = T.GetLabel(Node); 68 | ++IntNodeCount; 69 | if (Label.empty()) 70 | ++MissingCount; 71 | else if (IsValidIntStr(Label)) 72 | { 73 | uint Pct = StrToUint(Label); 74 | if (Pct == 0 || Pct == 1) 75 | { 76 | ++PctCount; 77 | ++FractCount; 78 | } 79 | else if (Pct > 1 && Pct <= 100) 80 | ++PctCount; 81 | else 82 | ++OtherCount; 83 | } 84 | else if (IsValidFloatStr(Label)) 85 | { 86 | double Fract = StrToFloat(Label); 87 | if (Fract >= 0 && Fract <= 1) 88 | ++FractCount; 89 | else 90 | ++OtherCount; 91 | } 92 | else 93 | ++OtherCount; 94 | } 95 | 96 | bool UsePcts = false; 97 | if (PctCount > IntNodeCount/2) 98 | UsePcts = true; 99 | else if (FractCount > IntNodeCount/2) 100 | UsePcts = false; 101 | else 102 | Die("Internal node labels not recognized pcts %u, fracts %u, other %u", 103 | PctCount, FractCount, IntNodeCount - PctCount - FractCount); 104 | 105 | for (uint k = 0; k < NodeCount; ++k) 106 | { 107 | uint Node = Nodes[k]; 108 | if (T.IsRoot(Node) || T.IsLeaf(Node)) 109 | { 110 | Confs.push_back(-1); 111 | continue; 112 | } 113 | const string &Label = T.GetLabel(Node); 114 | double Conf = GetConf(Label, UsePcts); 115 | Confs.push_back(Conf); 116 | } 117 | } 118 | 119 | void cmd_collapse() 120 | { 121 | TreeN T; 122 | T.FromNewickFile(opt(collapse)); 123 | 124 | double MinConf = 0.70; 125 | if (optset_minconf) 126 | MinConf = opt(minconf); 127 | if (MinConf < 0 || MinConf > 1) 128 | Die("minconf must be in range 0 .. 
1"); 129 | 130 | vector Confs; 131 | GetFractConfs(T, Confs); 132 | 133 | vector Nodes; 134 | T.GetNodes(Nodes); 135 | const uint NodeCount = SIZE(Nodes); 136 | asserta(SIZE(Confs) == NodeCount); 137 | uint CollapseCount = 0; 138 | for (uint k = 0; k < NodeCount; ++k) 139 | { 140 | uint Node = Nodes[k]; 141 | if (T.IsRoot(Node) || T.IsLeaf(Node)) 142 | continue; 143 | double Conf = Confs[k]; /* Confs is positional: GetFractConfs pushes one entry per k in Nodes order, so index by k, not by node id */ 144 | if (Conf < 0) 145 | continue; 146 | if (Conf < MinConf) 147 | { 148 | ++CollapseCount; 149 | T.CollapseNode(Node); 150 | } 151 | } 152 | 153 | ProgressLog("%u / %u nodes collapsed\n", 154 | CollapseCount, NodeCount); 155 | 156 | T.ToNewickFile(opt(output)); 157 | } 158 | -------------------------------------------------------------------------------- /src/condensex.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "treex.h" 4 | 5 | static void Condense(TreeX &T, double MinTPFract) 6 | { 7 | FeatureTable FT; 8 | SetFeatureTable(T, FT); 9 | 10 | vector ValueToBestFitNode; 11 | const uint ValueCount = FT.GetValueCount(); 12 | for (uint ValueIndex = 0; ValueIndex < ValueCount; ++ValueIndex) 13 | { 14 | set LabelSet; 15 | FT.GetLabels_ByValueIndex(ValueIndex, LabelSet); 16 | 17 | set LeafNodeSet; 18 | T.LabelSetToLeafNodeSet(LabelSet, LeafNodeSet); 19 | 20 | uint TP, FP, FN; 21 | uint BestFitNode = 22 | T.GetBestFitSubtree(LeafNodeSet, MinTPFract, TP, FP, FN); 23 | 24 | ValueToBestFitNode.push_back(BestFitNode); 25 | } 26 | 27 | for (uint i = 0; i < ValueCount; ++i) 28 | { 29 | uint Nodei = ValueToBestFitNode[i]; 30 | if (Nodei == UINT_MAX) 31 | continue; 32 | for (uint j = 0; j < ValueCount; ++j) 33 | { 34 | if (i == j) 35 | continue; 36 | uint Nodej = ValueToBestFitNode[j]; 37 | if (T.IsInSubtree(Nodei, Nodej)) 38 | { 39 | const string &Valuei = FT.GetValue(i); 40 | const string &Valuej = FT.GetValue(j); 41 | Warning("%s > %s\n", Valuei.c_str(), Valuej.c_str()); 42 | 43 
| ValueToBestFitNode[j] = UINT_MAX; 44 | } 45 | } 46 | } 47 | 48 | for (uint i = 0; i < ValueCount; ++i) 49 | { 50 | uint Node = ValueToBestFitNode[i]; 51 | if (Node == UINT_MAX) 52 | continue; 53 | const string &Value = FT.GetValue(i); 54 | string NewLabel = "cond_" + Value; 55 | T.DeleteSubtree(Node, NewLabel, false); 56 | #if DEBUG 57 | T.Validate(); 58 | #endif 59 | } 60 | 61 | uint DeleteCount = 0; 62 | for (;;) 63 | { 64 | bool AnyDel = false; 65 | const uint NodeCount = T.GetNodeIndexCount(); 66 | for (uint Node = 0; Node < NodeCount; ++Node) 67 | { 68 | if (!T.IsNode(Node)) 69 | continue; 70 | const string &Label = T.GetLabel(Node); 71 | vector SubtreeLabels; 72 | T.GetSubtreeLeafLabels_Rooted(Node, SubtreeLabels); 73 | if (SIZE(SubtreeLabels) == 1 && 74 | SubtreeLabels[0].substr(0, 3) == "DEL") 75 | continue; 76 | bool Found = false; 77 | for (uint i = 0; i < SIZE(SubtreeLabels); ++i) 78 | { 79 | const string &Label = SubtreeLabels[i]; 80 | if (Label.substr(0, 5) == "cond_") 81 | { 82 | Found = true; 83 | break; 84 | } 85 | } 86 | if (!Found) 87 | { 88 | AnyDel = true; 89 | string NewLabel; 90 | Ps(NewLabel, "DEL%u", ++DeleteCount); 91 | T.DeleteSubtree(Node, NewLabel, false); 92 | #if DEBUG 93 | T.Validate(); 94 | #endif 95 | } 96 | } 97 | if (!AnyDel) 98 | break; 99 | } 100 | 101 | const uint NodeCount = T.GetNodeIndexCount(); 102 | uint DelCount = 0; 103 | for (uint Node = 0; Node < NodeCount; ++Node) 104 | { 105 | if (!T.IsLeaf(Node)) 106 | continue; 107 | const string &Label = T.GetLabel(Node); 108 | if (Label.substr(0, 3) == "DEL") 109 | { 110 | #if DEBUG 111 | T.Validate(); 112 | #endif 113 | T.DeleteLeaf(Node); 114 | #if DEBUG 115 | T.Validate(); 116 | #endif 117 | ++DelCount; 118 | } 119 | } 120 | //T.Normalize(); 121 | #if DEBUG 122 | T.Validate(); 123 | #endif 124 | } 125 | 126 | void cmd_condensex() 127 | { 128 | vector Trees; 129 | TreesFromFile(opt(condensex), Trees); 130 | FILE *fTsv = CreateStdioFile(opt(tsvout)); 131 | 132 | double 
MinTPFract = 0.5; 133 | if (optset_mintpfract) 134 | MinTPFract = opt(mintpfract); 135 | 136 | const uint TreeCount = SIZE(Trees); 137 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 138 | { 139 | TreeX &T = *Trees[TreeIndex]; 140 | Condense(T, MinTPFract); 141 | T.CollapseUnary(); 142 | T.Ladderize(); 143 | T.Validate(); 144 | } 145 | 146 | TreesToFile(Trees, opt(output)); 147 | 148 | CloseStdioFile(fTsv); 149 | } 150 | -------------------------------------------------------------------------------- /src/treesfromdata.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "newicklexer.h" 3 | #include "treen.h" 4 | #include "tree2.h" 5 | #include "treex.h" 6 | 7 | void StringsFromFile(const string &FileName, vector &Strings); 8 | 9 | void TreesFromData(const char *Data, uint DataBytes, 10 | vector &Trees) 11 | { 12 | Trees.clear(); 13 | 14 | NewickLexer NL; 15 | NL.FromData(Data, DataBytes); 16 | vector > TokensVec; 17 | NL.SplitTokens(TokensVec); 18 | const uint TreeCount = SIZE(TokensVec); 19 | for (uint i = 0; i < TreeCount; ++i) 20 | { 21 | ProgressStep(i, TreeCount, "Trees from tokens"); 22 | const vector &Tokens = TokensVec[i]; 23 | TreeN *T = new TreeN; 24 | T->FromTokens(Tokens); 25 | Trees.push_back(T); 26 | } 27 | } 28 | 29 | void TreesFromData(const char *Data, uint DataBytes, 30 | vector &Trees) 31 | { 32 | Trees.clear(); 33 | 34 | NewickLexer NL; 35 | NL.FromData(Data, DataBytes); 36 | vector > TokensVec; 37 | NL.SplitTokens(TokensVec); 38 | NewickParser2 NP; 39 | const uint TreeCount = SIZE(TokensVec); 40 | for (uint i = 0; i < TreeCount; ++i) 41 | { 42 | ProgressStep(i, TreeCount, "Trees from tokens"); 43 | const vector &Tokens = TokensVec[i]; 44 | TreeX *T = new TreeX; 45 | NP.FromTokens(Tokens); 46 | T->FromNewickParser(NP); 47 | Trees.push_back(T); 48 | } 49 | } 50 | 51 | void TreesFromFile3(const string &FileName, vector &Trees) 52 | { 53 | Trees.clear(); 54 | 
vector PathNames; 55 | StringsFromFile(FileName, PathNames); 56 | const uint N = SIZE(PathNames); 57 | for (uint i = 0; i < N; ++i) 58 | { 59 | const string &PathName = PathNames[i]; 60 | Tree2 *T2 = new Tree2; 61 | T2->FromNewickFile(PathName); 62 | Trees.push_back(T2); 63 | } 64 | } 65 | 66 | void TreesFromFile(const string &FileName, vector &Trees) 67 | { 68 | FILE *f = OpenStdioFile(FileName); 69 | uint32 FileSize; 70 | const byte *Data = ReadAllStdioFile(f, FileSize); 71 | CloseStdioFile(f); 72 | TreesFromData((const char *) Data, FileSize, Trees); 73 | } 74 | 75 | void TreesFromFile(const string &FileName, vector &Trees) 76 | { 77 | FILE *f = OpenStdioFile(FileName); 78 | uint32 FileSize; 79 | const byte *Data = ReadAllStdioFile(f, FileSize); 80 | CloseStdioFile(f); 81 | TreesFromData((const char *) Data, FileSize, Trees); 82 | } 83 | 84 | void TreesFromFile2(const string &FileName, vector &Trees) 85 | { 86 | vector TreesN; 87 | TreesFromFile(FileName, TreesN); 88 | const uint N = SIZE(TreesN); 89 | for (uint i = 0; i < N; ++i) 90 | { 91 | const TreeN *TN = TreesN[i]; 92 | Tree2 *T2 = new Tree2; 93 | TN->ToTree2(*T2); 94 | Trees.push_back(T2); 95 | } 96 | } 97 | 98 | void TreesToFile(const vector &Trees, const string &FileName) 99 | { 100 | FILE *f = CreateStdioFile(FileName); 101 | const uint TreeCount = SIZE(Trees); 102 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 103 | { 104 | const TreeN &T = *Trees[TreeIndex]; 105 | T.ToNewickFile(f, false); 106 | } 107 | CloseStdioFile(f); 108 | } 109 | 110 | //void TreesToFile(const vector &Trees, const string &FileName) 111 | // { 112 | // FILE *f = CreateStdioFile(FileName); 113 | // const uint TreeCount = SIZE(Trees); 114 | // for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 115 | // { 116 | // const TreeX &T = *Trees[TreeIndex]; 117 | // T.ToNewickFile(f, false); 118 | // } 119 | // CloseStdioFile(f); 120 | // } 121 | 122 | void TreesToFile(const vector &Trees, const string &FileName) 
123 | { 124 | FILE *f = CreateStdioFile(FileName); 125 | const uint TreeCount = SIZE(Trees); 126 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 127 | { 128 | const TreeX &T = *Trees[TreeIndex]; 129 | T.ToNewickFile(f, false); 130 | } 131 | CloseStdioFile(f); 132 | } 133 | -------------------------------------------------------------------------------- /src/newicklexer.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "newicklexer.h" 3 | #include "newick_token.h" 4 | 5 | char NewickLexer::GetCharFailOnEof() 6 | { 7 | int c = GetChar(); 8 | if (c == EOF) 9 | Die("Parsing tree, unexpected end-of-file"); 10 | return (char) c; 11 | } 12 | 13 | int NewickLexer::GetChar() 14 | { 15 | if (m_DataPos >= m_DataBytes) 16 | return EOF; 17 | char Ch = m_Data[m_DataPos++]; 18 | return Ch; 19 | } 20 | 21 | void NewickLexer::SkipWhite() 22 | { 23 | for (;;) 24 | { 25 | int c = GetChar(); 26 | if (c == EOF) 27 | return; 28 | if (!isspace(c)) 29 | { 30 | asserta(m_DataPos > 0); 31 | --m_DataPos; 32 | return; 33 | } 34 | } 35 | } 36 | 37 | bool NewickLexer::GetToken(string &Token) 38 | { 39 | Token.clear(); 40 | if (m_DataPos >= m_DataBytes) 41 | return false; 42 | 43 | // Skip leading white space 44 | SkipWhite(); 45 | if (m_DataPos >= m_DataBytes) 46 | return false; 47 | 48 | char c = GetCharFailOnEof(); 49 | 50 | // In case a single-character token 51 | Token = c; 52 | 53 | uint uBytesCopied = 0; 54 | NEWICK_TOKEN_TYPE TT; 55 | switch (c) 56 | { 57 | case '(': 58 | Token = "("; 59 | return true; 60 | 61 | case ')': 62 | Token = ")"; 63 | return true; 64 | 65 | case ':': 66 | Token = ":"; 67 | return true; 68 | 69 | case ';': 70 | Token = ";"; 71 | return true; 72 | 73 | case ',': 74 | Token = ","; 75 | return true; 76 | 77 | case '\'': 78 | TT = NTT_SingleQuotedString; 79 | c = GetCharFailOnEof(); 80 | break; 81 | 82 | case '"': 83 | TT = NTT_DoubleQuotedString; 84 | c = GetCharFailOnEof(); 85 
| break; 86 | 87 | case '[': 88 | TT = NTT_Comment; 89 | break; 90 | 91 | default: 92 | TT = NTT_String; 93 | break; 94 | } 95 | 96 | // Discard char already added 97 | Token.clear(); 98 | for (;;) 99 | { 100 | if (TT != NTT_Comment) 101 | Token += c; 102 | int ic = GetChar(); 103 | if (ic == EOF) 104 | Die("Unexpected end of Newick file in %s", NTTToStr(TT)); 105 | c = (char) ic; 106 | 107 | switch (TT) 108 | { 109 | case NTT_String: 110 | if (0 != strchr("():;,", c)) 111 | { 112 | asserta(m_DataPos > 0); 113 | --m_DataPos; 114 | return true; 115 | } 116 | if (isspace(c)) 117 | return true; 118 | break; 119 | 120 | case NTT_SingleQuotedString: 121 | if ('\'' == c) 122 | return true; 123 | break; 124 | 125 | case NTT_DoubleQuotedString: 126 | if ('"' == c) 127 | return true; 128 | break; 129 | 130 | case NTT_Comment: 131 | if (']' == c) 132 | { 133 | bool Ok = GetToken(Token); 134 | return Ok; 135 | } 136 | break; 137 | 138 | default: 139 | Die("NewickParser::GetToken, invalid TT=%u", TT); 140 | } 141 | } 142 | } 143 | 144 | void NewickLexer::LogTokens() const 145 | { 146 | const uint N = SIZE(m_Tokens); 147 | Log("\n"); 148 | Log("%u tokens\n", N); 149 | for (uint i = 0; i < N; ++i) 150 | Log("[%5u] '%s'\n", i, m_Tokens[i].c_str()); 151 | } 152 | 153 | void NewickLexer::FromData(const char *Data, uint DataBytes) 154 | { 155 | Clear(); 156 | 157 | m_Data = Data; 158 | m_DataBytes = DataBytes; 159 | m_DataPos = 0; 160 | 161 | for (;;) 162 | { 163 | string Token; 164 | bool Ok = GetToken(Token); 165 | if (!Ok) 166 | break; 167 | m_Tokens.push_back(Token); 168 | } 169 | } 170 | 171 | void NewickLexer::FromStr(const string &Str) 172 | { 173 | uint DataBytes = SIZE(Str); 174 | FromData(Str.c_str(), DataBytes); 175 | } 176 | 177 | void NewickLexer::SplitTokens(vector > &TokenVec) const 178 | { 179 | TokenVec.clear(); 180 | const uint N = SIZE(m_Tokens); 181 | vector Split; 182 | for (uint i = 0; i < N; ++i) 183 | { 184 | const string &Token = m_Tokens[i]; 185 | 
Split.push_back(Token); 186 | if (Token == ";") 187 | { 188 | TokenVec.push_back(Split); 189 | Split.clear(); 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/supermaker.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "supermaker.h" 3 | #include "sort.h" 4 | 5 | void SuperMaker::Load(const string &FileName) 6 | { 7 | asserta(m_Trees.empty()); 8 | 9 | TreesFromFile(FileName, m_Trees); 10 | 11 | const uint TreeCount = GetTreeCount(); 12 | 13 | m_RootDists.resize(TreeCount); 14 | m_LeafCounts.resize(TreeCount); 15 | m_Bootstraps.resize(TreeCount); 16 | 17 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 18 | { 19 | const TreeN &T = *m_Trees[TreeIndex]; 20 | 21 | T.GetRootDists(m_RootDists[TreeIndex]); 22 | T.GetSubtreeLeafCounts(m_LeafCounts[TreeIndex]); 23 | T.GetBootstraps(m_Bootstraps[TreeIndex]); 24 | 25 | vector Nodes; 26 | T.GetNodes(Nodes); 27 | const uint N = SIZE(Nodes); 28 | for (uint i = 0; i < N; ++i) 29 | { 30 | uint Node = Nodes[i]; 31 | if (T.IsLeaf(Node)) 32 | { 33 | const string &Label = T.GetLabel(Node); 34 | AddLabel(TreeIndex, Node, Label); 35 | } 36 | } 37 | } 38 | } 39 | 40 | void SuperMaker::AddLabel(uint TreeIndex, uint Node, const string &Label) 41 | { 42 | uint Index = UINT_MAX; 43 | if (m_LabelToIndex.find(Label) == m_LabelToIndex.end()) 44 | { 45 | Index = SIZE(m_Labels); 46 | m_Labels.push_back(Label); 47 | m_LabelToIndex[Label] = Index; 48 | 49 | vector Empty; 50 | m_LabelIndexToTreeIndexes.push_back(Empty); 51 | m_LabelIndexToLeafNodeIndexes.push_back(Empty); 52 | } 53 | else 54 | Index = m_LabelToIndex[Label]; 55 | 56 | m_LabelIndexToTreeIndexes[Index].push_back(TreeIndex); 57 | m_LabelIndexToLeafNodeIndexes[Index].push_back(Node); 58 | } 59 | 60 | const TreeN &SuperMaker::GetTree(uint TreeIndex) const 61 | { 62 | asserta(TreeIndex < SIZE(m_Trees)); 63 | return *m_Trees[TreeIndex]; 
64 | } 65 | 66 | const vector &SuperMaker::GetRootDists(uint TreeIndex) const 67 | { 68 | asserta(TreeIndex < SIZE(m_RootDists)); 69 | return m_RootDists[TreeIndex]; 70 | } 71 | 72 | const vector &SuperMaker::GetBootstraps(uint TreeIndex) const 73 | { 74 | asserta(TreeIndex < SIZE(m_Bootstraps)); 75 | return m_Bootstraps[TreeIndex]; 76 | } 77 | 78 | const vector &SuperMaker::GetLeafCounts(uint TreeIndex) const 79 | { 80 | asserta(TreeIndex < SIZE(m_LeafCounts)); 81 | return m_LeafCounts[TreeIndex]; 82 | } 83 | 84 | double SuperMaker::CalcUpperLeafDist(uint TreeIndex, uint NodeIndex) const 85 | { 86 | const vector &RootDists = GetRootDists(TreeIndex); 87 | const TreeN &T = GetTree(TreeIndex); 88 | vector LeafNodes; 89 | T.GetSubtreeLeafNodes(NodeIndex, LeafNodes); 90 | asserta(NodeIndex < SIZE(RootDists)); 91 | double ThisRootDist = RootDists[NodeIndex]; 92 | vector LeafDists; 93 | const uint N = SIZE(LeafNodes); 94 | for (uint i = 0; i < N; ++i) 95 | { 96 | uint Leaf = LeafNodes[i]; 97 | asserta(Leaf < SIZE(RootDists)); 98 | double d = RootDists[Leaf]; 99 | LeafDists.push_back(d - ThisRootDist); 100 | } 101 | vector Order; 102 | QuickSortInPlace(LeafDists.data(), N); 103 | double UpperDist = LeafDists[3*N/4]; 104 | return UpperDist; 105 | } 106 | 107 | double SuperMaker::GetBootstrap(uint TreeIndex, uint NodeIndex) const 108 | { 109 | const vector& v = GetBootstraps(TreeIndex); 110 | asserta(NodeIndex < SIZE(v)); 111 | return v[NodeIndex]; 112 | } 113 | 114 | double SuperMaker::GetRootDist(uint TreeIndex, uint NodeIndex) const 115 | { 116 | const vector& v = GetRootDists(TreeIndex); 117 | asserta(NodeIndex < SIZE(v)); 118 | return v[NodeIndex]; 119 | } 120 | 121 | uint SuperMaker::GetLeafCount(uint TreeIndex, uint NodeIndex) const 122 | { 123 | const vector &v = GetLeafCounts(TreeIndex); 124 | asserta(NodeIndex < SIZE(v)); 125 | return v[NodeIndex]; 126 | } 127 | 128 | double SuperMaker::CalcScore(uint TreeIndex, uint NodeIndex) const 129 | { 130 | uint LeafCount = 
GetLeafCount(TreeIndex, NodeIndex); 131 | if (LeafCount < m_MinLeafCount) 132 | return 0; 133 | double Bootstrap = GetBootstrap(TreeIndex, NodeIndex); 134 | double RootDist = GetRootDist(TreeIndex, NodeIndex); 135 | return 0; 136 | } 137 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | ###################################################### 2 | # Makefile is generated by ./vcxproj_make.py 3 | # Don't edit the Makefile -- update the python script 4 | ###################################################### 5 | 6 | BINDIR := ../bin 7 | OBJDIR := o 8 | BINPATH := $(BINDIR)/newick 9 | 10 | CXX = g++ 11 | CXXFLAGS := -ffast-math -march=native -O3 -DNDEBUG -fopenmp 12 | 13 | UNAME_S := $(shell uname -s) 14 | LDFLAGS := $(LDFLAGS) -ffast-math -march=native -O3 -fopenmp 15 | 16 | HDRS = \ 17 | biparter.h \ 18 | biparterx.h \ 19 | clustermaker.h \ 20 | cmds.h \ 21 | consmaker.h \ 22 | consmakerx.h \ 23 | consorder.h \ 24 | countsort.h \ 25 | divconker.h \ 26 | featuretable.h \ 27 | gobuff.h \ 28 | layout.h \ 29 | murmur.h \ 30 | myopts.h \ 31 | myutils.h \ 32 | newick_token.h \ 33 | newicklexer.h \ 34 | newickparser2.h \ 35 | newicktree.h \ 36 | quarts.h \ 37 | sort.h \ 38 | splitter.h \ 39 | supermaker.h \ 40 | svg.h \ 41 | taxer.h \ 42 | tree2.h \ 43 | treen.h \ 44 | treex.h \ 45 | usage.h \ 46 | 47 | OBJS = \ 48 | $(OBJDIR)/addrootlabel.o \ 49 | $(OBJDIR)/bestfitsubtree.o \ 50 | $(OBJDIR)/biparterx.o \ 51 | $(OBJDIR)/cladeq.o \ 52 | $(OBJDIR)/colors.o \ 53 | $(OBJDIR)/condensex.o \ 54 | $(OBJDIR)/conf2.o \ 55 | $(OBJDIR)/confcmps.o \ 56 | $(OBJDIR)/consensus.o \ 57 | $(OBJDIR)/consensusx.o \ 58 | $(OBJDIR)/consmaker.o \ 59 | $(OBJDIR)/consmakerx.o \ 60 | $(OBJDIR)/consorder.o \ 61 | $(OBJDIR)/correl2.o \ 62 | $(OBJDIR)/deletegroup.o \ 63 | $(OBJDIR)/deleteoutgroup.o \ 64 | $(OBJDIR)/divconker.o \ 65 | $(OBJDIR)/getlcasubtrees.o \ 66 | 
$(OBJDIR)/olcs.o \ 67 | $(OBJDIR)/dq.o \ 68 | $(OBJDIR)/featuretablefromtree.o \ 69 | $(OBJDIR)/findgroups.o \ 70 | $(OBJDIR)/getcc.o \ 71 | $(OBJDIR)/getlcas.o \ 72 | $(OBJDIR)/getlcasx.o \ 73 | $(OBJDIR)/lcalabel.o \ 74 | $(OBJDIR)/mono.o \ 75 | $(OBJDIR)/rootbyoutgroup.o \ 76 | $(OBJDIR)/bootq.o \ 77 | $(OBJDIR)/cladogram.o \ 78 | $(OBJDIR)/cluster2.o \ 79 | $(OBJDIR)/clustermaker.o \ 80 | $(OBJDIR)/collapse.o \ 81 | $(OBJDIR)/condense.o \ 82 | $(OBJDIR)/conf.o \ 83 | $(OBJDIR)/deleteleaves.o \ 84 | $(OBJDIR)/diameter.o \ 85 | $(OBJDIR)/drawfs.o \ 86 | $(OBJDIR)/featuretable.o \ 87 | $(OBJDIR)/fixft.o \ 88 | $(OBJDIR)/getlabels.o \ 89 | $(OBJDIR)/intlabel.o \ 90 | $(OBJDIR)/ladderize.o \ 91 | $(OBJDIR)/draw.o \ 92 | $(OBJDIR)/drawf.o \ 93 | $(OBJDIR)/layout.o \ 94 | $(OBJDIR)/murmurhash.o \ 95 | $(OBJDIR)/myutils.o \ 96 | $(OBJDIR)/newicklexer.o \ 97 | $(OBJDIR)/newickparser2.o \ 98 | $(OBJDIR)/newicktree.o \ 99 | $(OBJDIR)/newick_main.o \ 100 | $(OBJDIR)/newick_token.o \ 101 | $(OBJDIR)/biparter.o \ 102 | $(OBJDIR)/quarts.o \ 103 | $(OBJDIR)/randtree.o \ 104 | $(OBJDIR)/relabel.o \ 105 | $(OBJDIR)/relabelacc.o \ 106 | $(OBJDIR)/relabelf.o \ 107 | $(OBJDIR)/relabeln.o \ 108 | $(OBJDIR)/reroot.o \ 109 | $(OBJDIR)/rofo3.o \ 110 | $(OBJDIR)/rofos.o \ 111 | $(OBJDIR)/rootbyhalves.o \ 112 | $(OBJDIR)/rootbyoutgroupx.o \ 113 | $(OBJDIR)/rotate.o \ 114 | $(OBJDIR)/shrink.o \ 115 | $(OBJDIR)/supercluster.o \ 116 | $(OBJDIR)/supermaker.o \ 117 | $(OBJDIR)/tax.o \ 118 | $(OBJDIR)/taxer.o \ 119 | $(OBJDIR)/taxq2.o \ 120 | $(OBJDIR)/taxqx.o \ 121 | $(OBJDIR)/taxtable.o \ 122 | $(OBJDIR)/testdeletesubtree.o \ 123 | $(OBJDIR)/testx.o \ 124 | $(OBJDIR)/topo.o \ 125 | $(OBJDIR)/split.o \ 126 | $(OBJDIR)/stringsfromfile.o \ 127 | $(OBJDIR)/subset.o \ 128 | $(OBJDIR)/subsetnodes.o \ 129 | $(OBJDIR)/subtree.o \ 130 | $(OBJDIR)/svg.o \ 131 | $(OBJDIR)/syncft.o \ 132 | $(OBJDIR)/syncftacc.o \ 133 | $(OBJDIR)/stats.o \ 134 | $(OBJDIR)/taxq.o \ 135 | $(OBJDIR)/tree2.o \ 136 | 
$(OBJDIR)/treen.o \ 137 | $(OBJDIR)/treesfromdata.o \ 138 | $(OBJDIR)/treex.o \ 139 | $(OBJDIR)/tsv.o \ 140 | $(OBJDIR)/usage.o \ 141 | 142 | .PHONY: clean 143 | 144 | $(BINPATH) : $(BINDIR)/ $(OBJDIR)/ $(OBJS) 145 | $(CXX) $(LDFLAGS) $(OBJS) -o $(BINPATH) 146 | strip $(BINPATH) 147 | 148 | $(OBJDIR)/ : 149 | mkdir -p $(OBJDIR)/ 150 | 151 | $(BINDIR)/ : 152 | mkdir -p $(BINDIR)/ 153 | 154 | $(OBJDIR)/%.o : %.cpp $(HDRS) 155 | $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< 156 | -------------------------------------------------------------------------------- /src/clustermaker.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "clustermaker.h" 3 | #include "sort.h" 4 | #include 5 | 6 | void ClusterMaker::Run(const Tree2 &T, double MaxDistFromFarthestLeaf) 7 | { 8 | Clear(); 9 | 10 | m_T = &T; 11 | m_MaxDistFromFarthestLeaf = MaxDistFromFarthestLeaf; 12 | 13 | vector Dists; 14 | const uint NodeCount = m_T->GetNodeCount(); 15 | for (uint Node = 0; Node < NodeCount; ++Node) 16 | { 17 | double d = m_T->GetMaxLeafDist(Node); 18 | if (d <= MaxDistFromFarthestLeaf) 19 | { 20 | uint Parent = m_T->GetParent(Node); 21 | double dp = m_T->GetMaxLeafDist(Parent); 22 | if (dp > MaxDistFromFarthestLeaf) 23 | { 24 | Dists.push_back(d); 25 | m_SubtreeNodes.push_back(Node); 26 | 27 | vector SubtreeLeafNodes; 28 | m_T->GetSubtreeLeafNodes(Node, SubtreeLeafNodes); 29 | m_SubtreeLeafNodesVec.push_back(SubtreeLeafNodes); 30 | } 31 | } 32 | } 33 | Validate(); 34 | } 35 | 36 | void ClusterMaker::Validate() const 37 | { 38 | set NodeSet; 39 | const uint SelectedNodeCount = SIZE(m_SubtreeNodes); 40 | asserta(SIZE(m_SubtreeLeafNodesVec) == SelectedNodeCount); 41 | for (uint i = 0; i < SelectedNodeCount; ++i) 42 | { 43 | uint Node = m_SubtreeNodes[i]; 44 | asserta(NodeSet.find(Node) == NodeSet.end()); 45 | NodeSet.insert(Node); 46 | } 47 | 48 | const uint LeafCount = m_T->GetLeafCount(); 49 | uint SelectedLeafCount = 0; 50 | 
for (uint i = 0; i < SelectedNodeCount; ++i) 51 | { 52 | uint Node = m_SubtreeNodes[i]; 53 | vector Path; 54 | m_T->GetPathToRoot(Node, Path); 55 | for (uint j = 0; j < SIZE(Path); ++j) 56 | { 57 | uint Node2 = Path[j]; 58 | if (j == 0) 59 | asserta(Node2 == Node); 60 | else 61 | asserta(NodeSet.find(Node2) == NodeSet.end()); 62 | } 63 | const vector &SubtreeLeafNodes = m_SubtreeLeafNodesVec[i]; 64 | uint SubtreeLeafCount = SIZE(SubtreeLeafNodes); 65 | SelectedLeafCount += SubtreeLeafCount; 66 | } 67 | asserta(SelectedLeafCount = LeafCount); 68 | } 69 | 70 | void ClusterMaker::ToTSV(const string &FileName) const 71 | { 72 | if (FileName.empty()) 73 | return; 74 | FILE *f = CreateStdioFile(FileName); 75 | 76 | const uint SelectedNodeCount = SIZE(m_SubtreeNodes); 77 | for (uint i = 0; i < SelectedNodeCount; ++i) 78 | { 79 | uint Node = m_SubtreeNodes[i]; 80 | const vector &LeafNodes = m_SubtreeLeafNodesVec[i]; 81 | const uint SubtreeLeafCount = SIZE(LeafNodes); 82 | fprintf(f, "C\t%u\t%u\n", i+1, SubtreeLeafCount); 83 | } 84 | 85 | for (uint i = 0; i < SelectedNodeCount; ++i) 86 | { 87 | uint Node = m_SubtreeNodes[i]; 88 | const vector &LeafNodes = m_SubtreeLeafNodesVec[i]; 89 | const uint SubtreeLeafCount = SIZE(LeafNodes); 90 | for (uint j = 0; j < SubtreeLeafCount; ++j) 91 | { 92 | const uint LeafNode = LeafNodes[j]; 93 | const string &Label = m_T->GetLabel(LeafNode); 94 | double Dist = m_T->GetDistance(LeafNode, Node); 95 | asserta(Dist <= m_MaxDistFromFarthestLeaf); 96 | 97 | fprintf(f, "L"); 98 | fprintf(f, "\t%u", i+1); 99 | fprintf(f, "\t%u", Node); 100 | fprintf(f, "\t%u", LeafNode); 101 | fprintf(f, "\t%.4g", Dist); 102 | fprintf(f, "\t%s", Label.c_str()); 103 | fprintf(f, "\n"); 104 | } 105 | } 106 | CloseStdioFile(f); 107 | ProgressLog("MaxDist=%.4g, Clusters=%u\n", 108 | m_MaxDistFromFarthestLeaf, SelectedNodeCount); 109 | } 110 | 111 | void ClusterMaker::ToNewick(const string &FileName) const 112 | { 113 | if (FileName.empty()) 114 | return; 115 | 116 | 
vector NewLabels; 117 | const uint N = SIZE(m_SubtreeNodes); 118 | for (uint i = 0; i < N; ++i) 119 | { 120 | string NewLabel; 121 | Ps(NewLabel, "Cluster%u", i+1); 122 | NewLabels.push_back(NewLabel); 123 | } 124 | 125 | Tree2 SubsetTree; 126 | MakeSubsetNodes(*m_T, m_SubtreeNodes, NewLabels, SubsetTree); 127 | 128 | SubsetTree.ToNewickFile(FileName); 129 | } 130 | 131 | void cmd_cluster() 132 | { 133 | const string &TreeFileName = opt(cluster); 134 | Tree2 T; 135 | T.FromFile(TreeFileName); 136 | 137 | double MaxDistFromFarthestLeaf = opt(maxdist); 138 | 139 | ClusterMaker CM; 140 | CM.Run(T, MaxDistFromFarthestLeaf); 141 | CM.ToTSV(opt(tsvout)); 142 | CM.ToNewick(opt(output)); 143 | } 144 | -------------------------------------------------------------------------------- /src/cladeq.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "featuretable.h" 3 | #include "biparter.h" 4 | 5 | void SetFeatureTable(const TreeN &T, FeatureTable &FT); 6 | 7 | static void GetThisOther(uint ValueIndex, uint Node, 8 | const vector &ValueIndexToTotal, const vector &ValueCounts, 9 | uint &LeftThis, uint &LeftOther, uint &RightThis, uint &RightOther) 10 | { 11 | LeftThis = 0; 12 | LeftOther = 0; 13 | RightThis = 0; 14 | RightOther = 0; 15 | 16 | const uint ValueCount = SIZE(ValueIndexToTotal); 17 | for (uint ValueIndex2 = 0; ValueIndex2 < ValueCount; ++ValueIndex2) 18 | { 19 | uint Total = ValueIndexToTotal[ValueIndex2]; 20 | uint n = ValueCounts[ValueIndex2]; 21 | asserta(n <= Total); 22 | if (ValueIndex2 == ValueIndex) 23 | { 24 | LeftThis += n; 25 | RightThis += (Total - n); 26 | } 27 | else 28 | { 29 | LeftOther += n; 30 | RightOther += (Total - n); 31 | } 32 | } 33 | } 34 | 35 | static uint GetWrongCount(uint Node, uint ValueIndex, 36 | const vector &ValueIndexToTotal, 37 | const vector > &NodeToValueCounts) 38 | { 39 | uint N = ValueIndexToTotal[ValueIndex]; 40 | const vector &ValueCounts = 
NodeToValueCounts[Node]; 41 | if (SIZE(ValueCounts) == 0) 42 | return UINT_MAX; 43 | 44 | uint LeftThis, LeftOther, RightThis, RightOther; 45 | GetThisOther(ValueIndex, Node, ValueIndexToTotal, ValueCounts, 46 | LeftThis, LeftOther, RightThis, RightOther); 47 | asserta(LeftThis + RightThis == N); 48 | uint WrongLeft = RightThis + LeftOther; 49 | uint WrongRight = LeftThis + RightOther; 50 | uint Wrong = min(WrongLeft, WrongRight); 51 | #if 0//TRACE 52 | Log("Node %u Value %u Lt %u Rt %u Lo %u Ro %u WL %u WR %u N %u\n", 53 | Node, ValueIndex, LeftThis, RightThis, LeftOther, RightOther, 54 | WrongLeft, WrongRight, N); 55 | #endif 56 | return Wrong; 57 | } 58 | 59 | static void DoValue(uint ValueIndex, const string &Value, 60 | const vector &ValueIndexToTotal, 61 | const vector > &NodeToValueCounts, 62 | uint &BestNode, uint &BestWrongCount) 63 | { 64 | const uint NodeCount = SIZE(NodeToValueCounts); 65 | BestWrongCount = UINT_MAX; 66 | BestNode = UINT_MAX; 67 | for (uint Node = 0; Node < NodeCount; ++Node) 68 | { 69 | uint WrongCount = GetWrongCount(Node, ValueIndex, 70 | ValueIndexToTotal, NodeToValueCounts); 71 | if (WrongCount < BestWrongCount) 72 | { 73 | BestWrongCount = WrongCount; 74 | BestNode = Node; 75 | } 76 | } 77 | uint N = ValueIndexToTotal[ValueIndex]; 78 | #if 0//TRACE 79 | Log(">>> Value %u %s WC %u / %u BestNode %u\n", 80 | ValueIndex, Value.c_str(), BestWrongCount, N, BestNode); 81 | #endif 82 | 83 | // Special case because leaf+other partitions are not considered 84 | if (BestWrongCount >= N) 85 | BestWrongCount = N - 1; 86 | } 87 | 88 | // CladeQ = Correct/N 89 | // Correct = sum_over_groups (Size-Wrong) 90 | // N = sum_over_groups (Size) 91 | // Wrong = minimum_over_edges (NrOther_in_BestSplit + NrThis_out_BestSplit) 92 | void cmd_cladeq() 93 | { 94 | const string &TreeFileName = opt(cladeq); 95 | 96 | TreeN T; 97 | T.FromNewickFile(TreeFileName); 98 | 99 | FeatureTable FT; 100 | SetFeatureTable(T, FT); 101 | 102 | BiParter BP; 103 | BP.Init(T); 
104 | 105 | vector ValueIndexToTotal; 106 | vector > NodeToValueCounts; 107 | BP.CountFeatures(FT, ValueIndexToTotal, NodeToValueCounts); 108 | 109 | uint N = 0; 110 | uint SumWrong = 0; 111 | const uint ValueCount = FT.GetValueCount(); 112 | for (uint ValueIndex = 0; ValueIndex < ValueCount; ++ValueIndex) 113 | { 114 | uint BestNode = UINT_MAX; 115 | uint WrongCount = UINT_MAX; 116 | uint Total = ValueIndexToTotal[ValueIndex]; 117 | if (Total <= 1) 118 | continue; 119 | N += Total; 120 | const string &Value = FT.GetValue(ValueIndex); 121 | DoValue(ValueIndex, Value, ValueIndexToTotal, NodeToValueCounts, 122 | BestNode, WrongCount); 123 | SumWrong += WrongCount; 124 | } 125 | 126 | if (N == 0) 127 | { 128 | asserta(SumWrong == 0); 129 | ProgressLog("Wrong = %u / %u, CladeQ = undefined\n", SumWrong, N); 130 | } 131 | else 132 | { 133 | asserta(N > 0); 134 | asserta(SumWrong < N); 135 | uint Correct = N - SumWrong; 136 | double CladeQ = double(Correct)/N; 137 | ProgressLog("Wrong = %u / %u, CladeQ = %.4f\n", SumWrong, N, CladeQ); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/featuretable.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | class TreeN; 7 | class TreeX; 8 | 9 | class FeatureTable 10 | { 11 | public: 12 | vector m_Labels; 13 | vector m_FullLabels; 14 | vector m_Values; 15 | map m_LabelToIndex; 16 | map m_ValueToIndex; 17 | vector m_LabelIndexToValueIndex; 18 | string m_NA = "."; 19 | bool m_UseAccs = opt(accs); 20 | 21 | // Subset of leaf nodes with features 22 | set m_LeafNodeSet; 23 | 24 | public: 25 | void Clear() 26 | { 27 | m_Labels.clear(); 28 | m_FullLabels.clear(); 29 | m_Values.clear(); 30 | m_LabelToIndex.clear(); 31 | m_ValueToIndex.clear(); 32 | m_LabelIndexToValueIndex.clear(); 33 | m_NA = "."; 34 | m_LeafNodeSet.clear(); 35 | } 36 | 37 | void FromFile(const string &FileName); 38 | void 
FromVecs(const vector &Labels, 39 | const vector &Values); 40 | void DataFromFile(const string &FileName, 41 | vector &Labels, 42 | vector &Values); 43 | void FromTree(const TreeN &T, char Sep, uint FieldIndex); 44 | void FromTree(const TreeX &T, char Sep, uint FieldIndex); 45 | uint GetLabelCount() const { return SIZE(m_Labels); } 46 | uint GetValueCount() const { return SIZE(m_Values); } 47 | 48 | const string &GetLabel(uint LabelIndex) const 49 | { 50 | asserta(LabelIndex < SIZE(m_Labels)); 51 | return m_Labels[LabelIndex]; 52 | } 53 | 54 | const string &GetValue(uint ValueIndex) const 55 | { 56 | if (ValueIndex == UINT_MAX) 57 | return m_NA; 58 | asserta(ValueIndex < SIZE(m_Values)); 59 | return m_Values[ValueIndex]; 60 | } 61 | 62 | uint GetValueIndex(const string &Value) const 63 | { 64 | map::const_iterator p = m_ValueToIndex.find(Value); 65 | if (p == m_ValueToIndex.end()) 66 | return UINT_MAX; 67 | return p->second; 68 | } 69 | 70 | uint GetValueIndex_ByLabel(const string &Label) const 71 | { 72 | uint LabelIndex = GetLabelIndex(Label); 73 | if (LabelIndex == UINT_MAX) 74 | return UINT_MAX; 75 | assert(LabelIndex < SIZE(m_LabelIndexToValueIndex)); 76 | uint ValueIndex = m_LabelIndexToValueIndex[LabelIndex]; 77 | asserta(ValueIndex == UINT_MAX || ValueIndex < SIZE(m_Values)); 78 | return ValueIndex; 79 | } 80 | 81 | void GetValue_ByLabel(const string &Label, string &Value, 82 | bool ErrorIfNotFound) const 83 | { 84 | Value.clear(); 85 | uint LabelIndex = GetLabelIndex(Label); 86 | if (LabelIndex == UINT_MAX) 87 | { 88 | if (ErrorIfNotFound) 89 | Die("No value for label '%s'", Label.c_str()); 90 | return; 91 | } 92 | assert(LabelIndex < SIZE(m_LabelIndexToValueIndex)); 93 | uint ValueIndex = m_LabelIndexToValueIndex[LabelIndex]; 94 | asserta(ValueIndex == UINT_MAX || ValueIndex < SIZE(m_Values)); 95 | if (ValueIndex == UINT_MAX) 96 | { 97 | if (ErrorIfNotFound) 98 | Die("No value for label '%s'", Label.c_str()); 99 | return; 100 | } 101 | Value = 
m_Values[ValueIndex]; 102 | } 103 | 104 | uint GetValueIndex_ByLabelIndex(uint LabelIndex) const 105 | { 106 | asserta(LabelIndex < SIZE(m_LabelIndexToValueIndex)); 107 | uint ValueIndex = m_LabelIndexToValueIndex[LabelIndex]; 108 | asserta(ValueIndex == UINT_MAX || ValueIndex < SIZE(m_Values)); 109 | return ValueIndex; 110 | } 111 | 112 | uint GetLabelIndex_FailOnError(const string &Label) const 113 | { 114 | map::const_iterator p = m_LabelToIndex.find(Label); 115 | if (p == m_LabelToIndex.end()) 116 | Die("Unknown label '%s'", Label.c_str()); 117 | return p->second; 118 | } 119 | 120 | uint GetLabelIndex(const string &Label) const; 121 | uint GetLabelCount_ByValueIndex(uint ValueIndex) const; 122 | void GetLabels_ByValueIndex(uint ValueIndex, vector &Labels) const; 123 | void GetLabels_ByValueIndex(uint ValueIndex, set &Labels) const; 124 | 125 | void SetLeafNodeSet(const TreeN &T); 126 | void SetLeafNodeSet(const TreeX &T); 127 | 128 | public: 129 | static void GetValueFromLabel(const string &Label, char Sep, 130 | uint FieldIndex, string &Value) 131 | { 132 | Value.clear(); 133 | vector Fields; 134 | Split(Label, Fields, Sep); 135 | if (SIZE(Fields) <= FieldIndex) 136 | return; 137 | Value = Fields[FieldIndex]; 138 | } 139 | }; 140 | 141 | void SetFeatureTable(const TreeX &T, FeatureTable &FT); 142 | -------------------------------------------------------------------------------- /src/usage.h: -------------------------------------------------------------------------------- 1 | "Make a subset tree given file with leaf labels, one per line (labels\n" 2 | "do not need to be a subtree, the tree is collapsed as needed):\n" 3 | " newick -input tree.newick -labels labels.txt -output subset.newick\n" 4 | "\n" 5 | "Get leaf labels:\n" 6 | " newick -getlabels tree.newick -output labels.txt\n" 7 | "\n" 8 | "Report miscellaneous information about a Newick file:\n" 9 | " newick -stats trees.newick\n" 10 | "\n" 11 | "Calculate Robinson-Foulds (R-F) distance between two 
trees:\n" 12 | " newick -rofo tree1.newick -tree2 tree2.newick -log rofo.log\n" 13 | "\n" 14 | "Calculate all-vs-all R-F distances between trees in Newick file:\n" 15 | " newick -rofos trees.newick -log rofos.log\n" 16 | "\n" 17 | "Re-label trees, labels.tsv tab-separated with #1=old_label #2=new_label:\n" 18 | " newick -relabel trees.newick -labels2 labels.tsv -output relabeled_trees.newick\n" 19 | "\n" 20 | "Add integer node number labels to internal nodes:\n" 21 | " newick -intlabel tree.newick -output intlabel.newick\n" 22 | "\n" 23 | "Root by outgroup, specify labels.txt with leaf labels of outgroup or GroupName which\n" 24 | "is a substring of the outgroup labels, e.g. phylum name if format is A1234.Phylum:\n" 25 | " newick trees.newick [-labels labels.txt | -outgroup GroupName] -output rooted.newick\n" 26 | "\n" 27 | "Convert tab-separated to Newick:\n" 28 | " newick -tsv2newick tree.tsv -output tree.newick\n" 29 | "\n" 30 | "Convert Newick to tab-separated:\n" 31 | " newick -newick2tsv tree.newick -output tree.tsv\n" 32 | "\n" 33 | "Ladderize trees by rotating internal nodes so that larger subtree is always the \n" 34 | "left (default) or right subtree:\n" 35 | " newick -ladderize trees.newick -output ladderized.newick [-right]\n" 36 | "\n" 37 | "Split tree into N roughly equal-sized subtrees (clusters), output is N files\n" 38 | "named prefixi, i=1..N containing labels for each subtree:\n" 39 | " newick -split tree.newick -n N -prefix prefix\n" 40 | "\n" 41 | "Convert trees to cladograms (leaves equidistant from root):\n" 42 | " newick -clado trees.newick -output clado.newick\n" 43 | "\n" 44 | "Calculate edge confidence values from set of bootstrapped trees:\n" 45 | " newick -conf tree.newick -trees replicates.newick -output conftree.newick\n" 46 | "\n" 47 | "Condense a tree by identifying best-fit nodes for each feature group and making\n" 48 | "a tree of just those nodes; unary edges are collapsed by summing lengths and \n" 49 | "taking max confidence, 
leaves are labeled with features (e.g. phylum names):\n" 50 | " newick -condense trees.newick -features features.tsv -output condensed.newick\n" 51 | "\n" 52 | "Extract just the branching order by collapsing unary nodes, deleting all edge lengths\n" 53 | "and deleting all confidence values (all internal node labels removed):\n" 54 | " newick -topo trees.newick -output topos.newick\n" 55 | "\n" 56 | "Delete one or more leaves and collapse any resulting unary nodes, useful e.g. for\n" 57 | "deleting outgroup to simplify figure:\n" 58 | " newick -deleteleaves trees.newick [-label OutgroupName | -labels labels.txt] -output .newick\n" 59 | "\n" 60 | "Draw one tree or several trees with optional coloring of edges:\n" 61 | " newick -draw tree.newick -svg figure.svg\n" 62 | " newick -drawf tree.newick -features features.tsv -colors colors.tsv -svg figure.svg\n" 63 | " newick -drawfs trees.newick -features features.tsv -colors colors.tsv -svg figure.svg\n" 64 | "\n" 65 | " -features is tsv file with #1 leaf_label #2 feature_name (e.g. phylum).\n" 66 | " -colors is tsv file with #1 feature_name #2 color, where color is any valid svg color,\n" 67 | " can be rgb, hex or name e.g. red.\n" 68 | "\n" 69 | " -default_color color\n" 70 | " Color for unlabeled edges (default gray).\n" 71 | " -title text\n" 72 | " Title text.\n" 73 | " -title_font_size n\n" 74 | " Title font size (default 10).\n" 75 | " -unitlengths \n" 76 | " Treat all edge lengths as 1 (phylogram).\n" 77 | " -strokewidth n\n" 78 | " Line width for edges (default 1).\n" 79 | " -tree_width n\n" 80 | " Width of tree (default 1000).\n" 81 | " -tree_height n\n" 82 | " Height of tree (default 1000).\n" 83 | " -tree_spacing n\n" 84 | " Space between trees (default 300).\n" 85 | " -trees_per_row n\n" 86 | " Number of trees per row in figure (default 4).\n" 87 | " -legend legend.svg\n" 88 | " Legend showing features (e.g. 
phylum names) and colors.\n" 89 | -------------------------------------------------------------------------------- /src/stats.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "quarts.h" 3 | #include "treen.h" 4 | 5 | double CalcDiameter(const TreeN &T, string &Label1, string &Label2); 6 | 7 | static void Stats(FILE *f, const TreeN &T) 8 | { 9 | if (f == 0) 10 | return; 11 | 12 | uint N = T.GetNodeCount(); 13 | bool Rooted; 14 | bool Binary = T.IsBinary(Rooted); 15 | uint L = T.GetLeafCount(); 16 | 17 | uint MissingLengthCount = 0; 18 | uint InternalNodeLabelCount = 0; 19 | uint LeafLabelCount = 0; 20 | uint IntLabelCount = 0; 21 | uint FloatLabelCount = 0; 22 | uint LengthCount = 0; 23 | uint EdgeCount = 0; 24 | 25 | vector DegreeToCount; 26 | 27 | for (uint Node = 0; Node < N; ++Node) 28 | { 29 | uint Degree = T.GetChildCount(Node); 30 | if (Degree >= SIZE(DegreeToCount)) 31 | DegreeToCount.resize(Degree+1, 0); 32 | ++DegreeToCount[Degree]; 33 | EdgeCount += Degree; 34 | double Length = T.GetLength(Node); 35 | if (Length != MISSING_LENGTH) 36 | ++LengthCount; 37 | const string &Label = T.GetLabel(Node); 38 | if (!Label.empty()) 39 | { 40 | if (T.IsLeaf(Node)) 41 | ++LeafLabelCount; 42 | else 43 | { 44 | ++InternalNodeLabelCount; 45 | if (IsValidIntStr(Label)) 46 | ++IntLabelCount; 47 | else if (IsValidFloatStr(Label)) 48 | ++FloatLabelCount; 49 | } 50 | } 51 | } 52 | 53 | fprintf(f, "Binary %s (%s)\n", 54 | Binary ? "Yes" : "No", 55 | Rooted ? "rooted" : "unrooted"); 56 | 57 | fprintf(f, "Leaves %u\n", L); 58 | fprintf(f, "Nodes %u\n", N); 59 | fprintf(f, "Lengths %u\n", LengthCount); 60 | fprintf(f, "Leaf labels %u\n", LeafLabelCount); 61 | fprintf(f, "Int. 
node labels %u (%u ints %u floats)\n", 62 | InternalNodeLabelCount, IntLabelCount, FloatLabelCount); 63 | 64 | vector RootDists; 65 | vector LeafRootDists; 66 | T.GetRootDists(RootDists); 67 | T.GetLeafRootDists(LeafRootDists); 68 | 69 | QuartsDouble QD; 70 | GetQuarts(LeafRootDists, QD); 71 | fprintf(f, "Min height %.3g\n", QD.Min); 72 | fprintf(f, "Avg height %.3g\n", QD.Avg); 73 | fprintf(f, "Max height %.3g\n", QD.Max); 74 | 75 | if (Binary) 76 | { 77 | string Label1, Label2; 78 | double Diameter = CalcDiameter(T, Label1, Label2); 79 | if (Label1 != "" && Label2 != "") 80 | { 81 | uint Node1 = T.GetNodeByLabel(Label1, true); 82 | uint Node2 = T.GetNodeByLabel(Label2, true); 83 | double RootDist1 = RootDists[Node1]; 84 | double RootDist2 = RootDists[Node2]; 85 | 86 | fprintf(f, "Diameter %.3g %s (%.3g) ---> %s (%.3g)\n", 87 | Diameter, Label1.c_str(), RootDist1, Label2.c_str(), RootDist2); 88 | } 89 | } 90 | 91 | fprintf(f, "\n"); 92 | fprintf(f, "Degree Nodes\n"); 93 | for (uint Degree = 0; Degree < SIZE(DegreeToCount); ++Degree) 94 | { 95 | uint n = DegreeToCount[Degree]; 96 | if (n > 0) 97 | fprintf(f, "%6u %5u\n", Degree, n); 98 | } 99 | } 100 | 101 | void cmd_stats() 102 | { 103 | const string &FileName = opt(stats); 104 | vector Trees; 105 | TreesFromFile(FileName, Trees); 106 | const uint TreeCount = SIZE(Trees); 107 | 108 | FILE *f = CreateStdioFile(opt(output)); 109 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 110 | { 111 | const TreeN &T = *Trees[TreeIndex]; 112 | Stats(stdout, T); 113 | Stats(f, T); 114 | if (opt(log_tree)) 115 | { 116 | Log("\n"); 117 | if (TreeCount > 0) 118 | Log("Tree[%u]\n", TreeIndex); 119 | T.LogMe(); 120 | } 121 | } 122 | CloseStdioFile(f); 123 | } 124 | 125 | void cmd_rootdists() 126 | { 127 | const string &FileName = opt(rootdists); 128 | vector Trees; 129 | TreesFromFile(FileName, Trees); 130 | const uint TreeCount = SIZE(Trees); 131 | 132 | FILE *f = CreateStdioFile(opt(output)); 133 | for (uint TreeIndex 
= 0; TreeIndex < TreeCount; ++TreeIndex) 134 | { 135 | const TreeN &T = *Trees[TreeIndex]; 136 | const uint Root = T.m_Root; 137 | asserta(Root != UINT_MAX); 138 | 139 | vector Nodes; 140 | T.GetNodes(Nodes); 141 | const uint NodeCount = SIZE(Nodes); 142 | for (uint i = 0; i < NodeCount; ++i) 143 | { 144 | uint NodeIndex = Nodes[i]; 145 | if (T.IsLeaf(NodeIndex)) 146 | { 147 | double Dist = T.GetRootDist(NodeIndex); 148 | const char *Label = T.GetLabel(NodeIndex).c_str(); 149 | Pf(f, "%.4g\t%s\n", Dist, Label); 150 | } 151 | } 152 | } 153 | CloseStdioFile(f); 154 | } 155 | -------------------------------------------------------------------------------- /src/colors.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "svg.h" 3 | #include "treen.h" 4 | 5 | void TreesFromFile(const string &FileName, vector &Trees); 6 | void GetValuesFromTrees(vector &Trees, char Sep, uint FieldIndex, 7 | const string &MissingValue, vector &Values); 8 | 9 | // https://martin.ankerl.com/2009/12/09/how-to-create-random-colors-programmatically/ 10 | 11 | void ColorsFromFile(const string &FileName, 12 | vector &Values, vector &Colors) 13 | { 14 | Values.clear(); 15 | Colors.clear(); 16 | FILE *f = OpenStdioFile(FileName); 17 | string Line; 18 | vector Fields; 19 | while (ReadLineStdioFile(f, Line)) 20 | { 21 | Split(Line, Fields, '\t'); 22 | asserta(SIZE(Fields) == 2); 23 | const string &Value = Fields[0]; 24 | const string &Color = Fields[1]; 25 | Values.push_back(Value); 26 | Colors.push_back(Color); 27 | } 28 | CloseStdioFile(f); 29 | } 30 | 31 | static uint hsv_to_rgb(double h, double s, double v) 32 | { 33 | int h_i = int(h*6); 34 | double f = h*6 - h_i; 35 | double p = v * (1 - s); 36 | double q = v * (1 - f*s); 37 | double t = v * (1 - (1 - f) * s); 38 | 39 | double r, g, b; 40 | #define Assign(x, y, z) { r = x; b = y; g = z; } 41 | if (h_i == 0) 42 | Assign(v, t, p) 43 | else if (h_i == 1) 44 | Assign(q, v, p) 
45 | else if (h_i == 2) 46 | Assign(p, v, t) 47 | else if (h_i == 3) 48 | Assign(p, q, v) 49 | else if (h_i == 4) 50 | Assign(t, p, v) 51 | else if (h_i == 5) 52 | Assign(v, p, q) 53 | else 54 | asserta(false); 55 | #undef Assign 56 | 57 | uint R = uint(r*256); 58 | uint G = uint(g*256); 59 | uint B = uint(b*256); 60 | asserta(R >= 0 && R < 256); 61 | asserta(G >= 0 && G < 256); 62 | asserta(B >= 0 && B < 256); 63 | 64 | uint RGB = (R << 16) + (G << 8) + B; 65 | return RGB; 66 | } 67 | 68 | static double g_h = 3.1415; 69 | uint GetRandomColor() 70 | { 71 | g_h += 0.618033988749895; 72 | g_h = fmod(g_h, 1.0); 73 | uint RGB = hsv_to_rgb(g_h, 0.5, 0.95); 74 | return RGB; 75 | } 76 | 77 | void GetRandomColors(uint N, vector &Colors) 78 | { 79 | Colors.clear(); 80 | for (uint i = 0; i < N; ++i) 81 | { 82 | uint RGB = GetRandomColor(); 83 | string Color; 84 | Ps(Color, "#%06x", RGB); 85 | Colors.push_back(Color); 86 | } 87 | } 88 | 89 | void InitRandomColor(uint Seed) 90 | { 91 | g_h = double(Seed)*1023.456; 92 | g_h = fmod(g_h, 1.0); 93 | } 94 | 95 | void cmd_palette() 96 | { 97 | const string &OutputFileName = opt(palette); 98 | 99 | uint Seed = 1; 100 | if (optset_randseed) 101 | Seed = opt(randseed); 102 | 103 | const uint MAXN = 32; 104 | 105 | const double SZ = 10; 106 | const double MARGIN = 3; 107 | 108 | double FigWidth = (SZ + MARGIN)*MAXN + 3*MARGIN + 20; 109 | double FigHeight = (SZ + 2*MARGIN)*MAXN + 3*MARGIN + 20; 110 | 111 | Svg S; 112 | S.Open(OutputFileName, FigWidth, FigHeight); 113 | 114 | double Y = 3*MARGIN; 115 | for (uint N = 2; N <= MAXN; ++N) 116 | { 117 | InitRandomColor(Seed); 118 | vector Colors; 119 | GetRandomColors(N, Colors); 120 | double X = 3*MARGIN; 121 | for (uint i = 0; i < N; ++i) 122 | { 123 | const string Color = Colors[i]; 124 | S.Rect(X, Y, SZ, SZ, 1, Color, Color); 125 | X += SZ + MARGIN; 126 | } 127 | Y += SZ + 2*MARGIN; 128 | } 129 | 130 | S.Close(); 131 | } 132 | 133 | void cmd_tree2palette() 134 | { 135 | const string 
&TreeFileName = opt(tree2palette); 136 | asserta(optset_ff); 137 | 138 | const string &FF = opt(ff); 139 | if (SIZE(FF) != 2 || !isdigit(FF[1])) 140 | Die("Invalid ff"); 141 | 142 | char Sep = FF[0]; 143 | char Digit = FF[1]; 144 | if (Digit == '0') 145 | Die("Invalid ff (field must be >0)"); 146 | uint FieldIndex = uint(Digit - '0') - 1; 147 | 148 | vector Trees; 149 | TreesFromFile(TreeFileName, Trees); 150 | 151 | vector Values; 152 | GetValuesFromTrees(Trees, Sep, FieldIndex, ".", Values); 153 | 154 | 155 | uint Seed = 1; 156 | if (optset_randseed) 157 | Seed = opt(randseed); 158 | InitRandomColor(Seed); 159 | 160 | const uint ValueCount = SIZE(Values); 161 | vector Colors; 162 | GetRandomColors(ValueCount, Colors); 163 | asserta(SIZE(Colors) == ValueCount); 164 | 165 | FILE *fOut = CreateStdioFile(opt(output)); 166 | for (uint ValueIndex = 0; ValueIndex < ValueCount; ++ValueIndex) 167 | { 168 | const string &Value = Values[ValueIndex]; 169 | const string &Color = Colors[ValueIndex]; 170 | ProgressLog(" %s %s\n", Color.c_str(), Value.c_str()); 171 | Pf(fOut, "%s %s\n", Value.c_str(), Color.c_str()); 172 | } 173 | CloseStdioFile(fOut); 174 | } 175 | -------------------------------------------------------------------------------- /src/shrink.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "treen.h" 3 | #include "sort.h" 4 | 5 | #if 0 6 | 7 | /*** 8 | "Shrink" a tree by collapsing leaf nodes into subtrees 9 | with maximum LCA-leaf distance approximately d, where d 10 | is a parameter, ignoring outlier long branches which may 11 | be artifacts. 
12 | ***/ 13 | 14 | static double g_MaxUpperDist = 1.5; 15 | static uint g_MinGroupSize = 5; 16 | static uint g_MaxIters = 10; 17 | static const vector *g_RootDists; 18 | 19 | static double GetUpperLeafDist(TreeN &T, uint Node) 20 | { 21 | const vector &RootDists = *g_RootDists; 22 | vector LeafNodes; 23 | T.GetSubtreeLeafNodes(Node, LeafNodes); 24 | asserta(Node < SIZE(RootDists)); 25 | double ThisRootDist = RootDists[Node]; 26 | vector LeafDists; 27 | const uint N = SIZE(LeafNodes); 28 | for (uint i = 0; i < N; ++i) 29 | { 30 | uint Leaf = LeafNodes[i]; 31 | asserta(Leaf < SIZE(RootDists)); 32 | double d = RootDists[Leaf]; 33 | LeafDists.push_back(d - ThisRootDist); 34 | } 35 | vector Order; 36 | QuickSortInPlace(LeafDists.data(), N); 37 | double UpperDist = LeafDists[3*N/4]; 38 | return UpperDist; 39 | } 40 | 41 | static void DoTree(TreeN &T) 42 | { 43 | // const uint NodeCount = T.GetNodeCount(); 44 | vector RootDists; 45 | T.GetRootDists(RootDists); 46 | g_RootDists = &RootDists; 47 | vector SelectedNodes; 48 | set SelectedLeafNodes; 49 | vector LeafNodes; 50 | vector Nodes; 51 | for (uint Iter = 0; Iter < g_MaxIters; ++Iter) 52 | { 53 | uint BestNode = UINT_MAX; 54 | uint BestSize = UINT_MAX; 55 | T.GetNodes(Nodes); 56 | for (uint i = 0; i < SIZE(Nodes); ++i) 57 | { 58 | uint Node = Nodes[i]; 59 | if (T.IsLeaf(Node)) 60 | continue; 61 | if (T.IsRoot(Node)) 62 | continue; 63 | uint Parent = T.GetParent(Node); 64 | if (T.IsRoot(Parent)) 65 | continue; 66 | uint SubtreeLeafNodeCount = T.GetSubtreeLeafCount(Node); 67 | if (SubtreeLeafNodeCount < g_MinGroupSize) 68 | continue; 69 | double ud = GetUpperLeafDist(T, Node); 70 | T.GetSubtreeLeafNodes(Node, LeafNodes); 71 | uint Size = SIZE(LeafNodes); 72 | if (Size > g_MinGroupSize && 73 | ud <= g_MaxUpperDist) 74 | { 75 | bool Found = false; 76 | for (uint i = 0; i < Size; ++i) 77 | { 78 | uint LeafNode = LeafNodes[i]; 79 | if (SelectedLeafNodes.find(LeafNode) != 80 | SelectedLeafNodes.end()) 81 | { 82 | Found = 
true; 83 | break; 84 | } 85 | } 86 | if (Found) 87 | continue; 88 | 89 | if (BestNode == UINT_MAX || Size > BestSize) 90 | { 91 | BestNode = Node; 92 | BestSize = Size; 93 | } 94 | } 95 | } 96 | if (BestNode == UINT_MAX) 97 | break; 98 | SelectedNodes.push_back(BestNode); 99 | T.GetSubtreeLeafNodes(BestNode, LeafNodes); 100 | asserta(SIZE(LeafNodes) == BestSize); 101 | for (uint i = 0; i < BestSize; ++i) 102 | SelectedLeafNodes.insert(LeafNodes[i]); 103 | } 104 | for (uint i = 0; i < SIZE(SelectedNodes); ++i) 105 | { 106 | uint SelectedNode = SelectedNodes[i]; 107 | string NewLabel; 108 | Ps(NewLabel, "SELECTED_%u", i); 109 | T.UpdateLabel(SelectedNode, NewLabel); 110 | 111 | const vector &ChildNodes = T.GetChildren(SelectedNode); 112 | for (uint i = 0; i < SIZE(ChildNodes); ++i) 113 | T.DeleteNode(ChildNodes[i], false); 114 | } 115 | T.SetDerived(); 116 | T.CollapseUnary(); 117 | 118 | T.GetNodes(Nodes); 119 | for (uint i = 0; i < SIZE(Nodes); ++i) 120 | { 121 | uint Node = Nodes[i]; 122 | if (T.IsLeaf(Node)) 123 | { 124 | string Label; 125 | T.GetLabel(Node, Label); 126 | if (Label == "") 127 | { 128 | Warning("Empty leaf label"); 129 | Ps(Label, "_leaf_%u", Node); 130 | T.UpdateLabel(Node, Label); 131 | } 132 | continue; 133 | } 134 | 135 | string Label; 136 | T.GetLabel(Node, Label); 137 | if (Label != "") 138 | continue; 139 | string NewLabel; 140 | Ps(NewLabel, "_node_%u", Node); 141 | T.UpdateLabel(Node, NewLabel); 142 | } 143 | bool Rooted; 144 | asserta(T.IsBinary(Rooted)); 145 | asserta(Rooted); 146 | } 147 | 148 | void cmd_shrink() 149 | { 150 | const string &InputFileName = opt(shrink); 151 | const string &OutputFileName = opt(output); 152 | FILE *fOut = CreateStdioFile(OutputFileName); 153 | 154 | vector Trees; 155 | TreesFromFile(InputFileName, Trees); 156 | const uint TreeCount = SIZE(Trees); 157 | for (uint TreeIndex = 0; TreeIndex < TreeCount; ++TreeIndex) 158 | { 159 | TreeN &T = *Trees[TreeIndex]; 160 | DoTree(T); 161 | T.ToNewickFile(fOut); 162 
| } 163 | CloseStdioFile(fOut); 164 | } 165 | #else 166 | // Kind of a dead end 167 | void cmd_shrink() {} 168 | #endif 169 | -------------------------------------------------------------------------------- /src/conf2.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "biparter.h" 3 | 4 | void GetFractConfs(const TreeN &T, vector &Confs); 5 | 6 | static FILE *g_fTab; 7 | 8 | static bool IsSubtreeConflict(const set &Subtree1, const set &Subtree2) 9 | { 10 | const uint N1 = SIZE(Subtree1); 11 | uint FoundIn2Count = 0; 12 | for (set::const_iterator p = Subtree1.begin(); 13 | p != Subtree1.end(); ++p) 14 | { 15 | uint Node = *p; 16 | if (Subtree2.find(Node) != Subtree2.end()) 17 | ++FoundIn2Count; 18 | } 19 | bool Conflict = (FoundIn2Count > 0 && FoundIn2Count < N1); 20 | return Conflict; 21 | } 22 | 23 | void GetSubtreeVec(const TreeN &T, const vector &FractConfs, 24 | const map &LabelToIndex, double MinConf, 25 | vector &HiConfNodes, vector > &SubtreeVec) 26 | { 27 | asserta(T.IsNormalized()); 28 | 29 | SubtreeVec.clear(); 30 | HiConfNodes.clear(); 31 | 32 | const uint NodeCount = T.GetNodeCount(); 33 | asserta(SIZE(FractConfs) == NodeCount); 34 | 35 | for (uint Node = 0; Node < NodeCount; ++Node) 36 | { 37 | double Conf = FractConfs[Node]; 38 | if (Conf < MinConf) 39 | continue; 40 | if (T.IsLeaf(Node) || T.IsRoot(Node)) 41 | continue; 42 | 43 | set LeafNodes; 44 | T.AppendSubtreeLeafNodes(Node, LeafNodes); 45 | 46 | set LabelIndexes; 47 | for (set::const_iterator p = LeafNodes.begin(); 48 | p != LeafNodes.end(); ++p) 49 | { 50 | uint LeafNodeIndex = *p; 51 | asserta(T.IsLeaf(LeafNodeIndex)); 52 | const string &Label = T.GetLabel(LeafNodeIndex); 53 | if (!Label.empty()) 54 | { 55 | map::const_iterator q = 56 | LabelToIndex.find(Label); 57 | asserta(q != LabelToIndex.end()); 58 | uint LabelIndex = q->second; 59 | LabelIndexes.insert(LabelIndex); 60 | } 61 | } 62 | 63 | 
HiConfNodes.push_back(Node); 64 | SubtreeVec.push_back(LabelIndexes); 65 | } 66 | } 67 | 68 | // Search for conflicting edges with conf above threshold 69 | void cmd_conf2() 70 | { 71 | const string &FileName1 = opt(conf2); 72 | const string &FileName2 = opt(tree2); 73 | const string &OutputFileName = opt(output); 74 | g_fTab = CreateStdioFile(OutputFileName); 75 | double MinConf = 0.9; 76 | if (optset_minconf) 77 | MinConf = opt(minconf); 78 | asserta(MinConf >= 0 && MinConf <= 1); 79 | 80 | TreeN T1; 81 | TreeN T2; 82 | T1.FromNewickFile(FileName1); 83 | T2.FromNewickFile(FileName2); 84 | 85 | asserta(T1.IsNormalized()); 86 | asserta(T2.IsNormalized()); 87 | 88 | map LabelToIndex; 89 | T1.GetSortedLeafLabelToIndex(LabelToIndex); 90 | 91 | for (map::const_iterator p = LabelToIndex.begin(); 92 | p != LabelToIndex.end(); ++p) 93 | { 94 | const string &Label = p->first; 95 | uint Node = T2.GetNodeByLabel(Label, false); 96 | if (Node == UINT_MAX) 97 | Die("Not found in tree2 >%s", Label.c_str()); 98 | } 99 | 100 | vector FractConfs1; 101 | vector FractConfs2; 102 | GetFractConfs(T1, FractConfs1); 103 | GetFractConfs(T2, FractConfs2); 104 | 105 | vector HiConfNodes1; 106 | vector HiConfNodes2; 107 | 108 | vector > SubtreeVec1; 109 | vector > SubtreeVec2; 110 | 111 | GetSubtreeVec(T1, FractConfs1, LabelToIndex, MinConf, 112 | HiConfNodes1, SubtreeVec1); 113 | GetSubtreeVec(T2, FractConfs2, LabelToIndex, MinConf, 114 | HiConfNodes2, SubtreeVec2); 115 | 116 | const uint N1 = SIZE(HiConfNodes1); 117 | const uint N2 = SIZE(HiConfNodes2); 118 | asserta(SIZE(SubtreeVec1) == N1); 119 | asserta(SIZE(SubtreeVec2) == N2); 120 | 121 | for (uint i1 = 0; i1 < N1; ++i1) 122 | { 123 | uint Node1 = HiConfNodes1[i1]; 124 | const set &Subtree1 = SubtreeVec1[i1]; // bind by reference: the set is only read 125 | for (uint i2 = 0; i2 < N2; ++i2) 126 | { 127 | uint Node2 = HiConfNodes2[i2]; 128 | const set &Subtree2 = SubtreeVec2[i2]; // by reference: avoids copying a set on every inner iteration 129 | 130 | bool Conflict12 = IsSubtreeConflict(Subtree1, Subtree2); 131 | bool Conflict21 = 
IsSubtreeConflict(Subtree2, Subtree1); 132 | 133 | if (Conflict12 || Conflict21) 134 | { 135 | uint Size1 = SIZE(Subtree1); 136 | uint Size2 = SIZE(Subtree2); 137 | 138 | double Conf1 = FractConfs1[Node1]; 139 | double Conf2 = FractConfs2[Node2]; 140 | 141 | Pf(g_fTab, "conflict"); 142 | Pf(g_fTab, "\tnode1=%u", Node1); 143 | Pf(g_fTab, "\tsize1=%u", Size1); 144 | Pf(g_fTab, "\tconf1=%.3f", Conf1); 145 | Pf(g_fTab, "\tnode2=%u", Node2); 146 | Pf(g_fTab, "\tsize2=%u", Size2); 147 | Pf(g_fTab, "\tconf2=%.3f", Conf2); 148 | Pf(g_fTab, "\tconflict12=%c", yon(Conflict12)); 149 | Pf(g_fTab, "\tconflict21=%c", yon(Conflict21)); 150 | Pf(g_fTab, "\n"); 151 | } 152 | } 153 | } 154 | 155 | CloseStdioFile(g_fTab); 156 | } 157 | -------------------------------------------------------------------------------- /src/split.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "splitter.h" 3 | #include "sort.h" 4 | 5 | /*** 6 | Attempts to split tree into SplitCount subtrees of 7 | approximately equal size. 8 | Greedily selects the largest remaining subtree. 
9 | ***/ 10 | 11 | void Splitter::Run(const Tree2 &T, uint SplitCount) 12 | { 13 | m_T = &T; 14 | const uint NodeCount = m_T->GetNodeCount(); 15 | const uint Root = m_T->GetRoot(); 16 | m_TargetSize = NodeCount/SplitCount; 17 | m_SplitCount = SplitCount; 18 | asserta(m_TargetSize > 1); 19 | m_SubtreeNodes.push_back(m_T->m_Root); 20 | for (m_SplitIndex = 1; m_SplitIndex < m_SplitCount; ++m_SplitIndex) 21 | { 22 | asserta(SIZE(m_SubtreeNodes) == m_SplitIndex); 23 | uint BiggestNode = GetBiggestNode(); 24 | uint Left = m_T->GetLeft(BiggestNode); 25 | uint Right = m_T->GetRight(BiggestNode); 26 | vector NewSubtreeNodes; 27 | for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) 28 | { 29 | uint Node = m_SubtreeNodes[i]; 30 | if (Node == BiggestNode) 31 | { 32 | uint Left = m_T->GetLeft(BiggestNode); 33 | NewSubtreeNodes.push_back(Left); 34 | NewSubtreeNodes.push_back(Right); 35 | } 36 | else 37 | NewSubtreeNodes.push_back(Node); 38 | } 39 | m_SubtreeNodes = NewSubtreeNodes; 40 | LogState(); 41 | } 42 | asserta(SIZE(m_SubtreeNodes) == m_SplitCount); 43 | } 44 | 45 | void Splitter::GetSizeOrder(vector &Order) const 46 | { 47 | vector Sizes; 48 | for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) 49 | { 50 | uint Node = m_SubtreeNodes[i]; 51 | uint Size = m_T->GetSubtreeLeafCount(Node); 52 | Sizes.push_back(Size); 53 | } 54 | uint N = SIZE(Sizes); 55 | Order.resize(N); 56 | QuickSortOrderDesc(Sizes.data(), N, Order.data()); 57 | } 58 | 59 | void Splitter::GetSubtree(Tree2 &Subtree) const 60 | { 61 | asserta(m_T != 0); 62 | const uint Size = SIZE(m_SubtreeNodes); 63 | const char *Prefix = ""; 64 | if (optset_prefix) 65 | Prefix = opt(prefix).c_str(); 66 | vector SubsetLabels; 67 | for (uint i = 0; i < Size; ++i) 68 | { 69 | string Label; 70 | Ps(Label, "%s%u", Prefix, i+1); 71 | SubsetLabels.push_back(Label); 72 | } 73 | 74 | MakeSubsetNodes(*m_T, m_SubtreeNodes, SubsetLabels, 75 | Subtree); 76 | } 77 | 78 | void Splitter::LogState() const 79 | { 80 | Log("\n"); 81 | 
Log("_______________ Split %u ______________\n", m_SplitIndex); 82 | Log(" Node Size LSize RSize\n"); 83 | // 12345 12345 12345 12345\n"); 84 | vector Order; 85 | GetSizeOrder(Order); 86 | uint SumSize = 0; 87 | for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) 88 | { 89 | uint k = Order[i]; 90 | uint Node = m_SubtreeNodes[k]; 91 | uint Size = m_T->GetSubtreeLeafCount(Node); 92 | SumSize += Size; 93 | Log("%5u", Node); 94 | Log(" %5u", Size); 95 | if (!m_T->IsLeaf(Node)) 96 | { 97 | uint Left = m_T->GetLeft(Node); 98 | uint Right = m_T->GetRight(Node); // right child, so RSize reports the right subtree's leaf count 99 | uint LSize = m_T->GetSubtreeLeafCount(Left); 100 | uint RSize = m_T->GetSubtreeLeafCount(Right); 101 | Log(" %5u %5u", LSize, RSize); 102 | } 103 | Log("\n"); 104 | } 105 | Log("Total %u\n", SumSize); 106 | } 107 | 108 | uint Splitter::GetBiggestNode() const 109 | { 110 | uint MaxSize = 0; 111 | uint MaxNode = UINT_MAX; 112 | for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) 113 | { 114 | uint Node = m_SubtreeNodes[i]; 115 | uint Size = m_T->GetSubtreeLeafCount(Node); 116 | if (Size > MaxSize) 117 | { 118 | MaxNode = Node; 119 | MaxSize = Size; 120 | } 121 | } 122 | asserta(MaxNode != UINT_MAX); 123 | return MaxNode; 124 | } 125 | 126 | void Splitter::WriteLabels(const string &FileNamePrefix) const 127 | { 128 | if (FileNamePrefix.empty()) 129 | return; 130 | 131 | vector Order; 132 | GetSizeOrder(Order); 133 | 134 | string LabelsFileName; 135 | for (uint i = 0; i < SIZE(m_SubtreeNodes); ++i) 136 | { 137 | uint k = Order[i]; 138 | uint Node = m_SubtreeNodes[k]; 139 | vector Labels; 140 | m_T->GetSubtreeLeafLabels(Node, Labels); 141 | LabelsFileName = FileNamePrefix; // use the argument rather than re-reading the global option 142 | Psa(LabelsFileName, "%u", i+1); 143 | FILE *f = CreateStdioFile(LabelsFileName); 144 | for (uint j = 0; j < SIZE(Labels); ++j) 145 | fprintf(f, "%s\n", Labels[j].c_str()); 146 | CloseStdioFile(f); 147 | } 148 | } 149 | 150 | void cmd_split() 151 | { 152 | const string &TreeFileName = opt(split); 153 | 154 | uint n = 16; 155 | if (optset_n) 156 | n = 
opt(n); 157 | asserta(n > 1); 158 | 159 | Tree2 T; 160 | T.FromFile(TreeFileName); 161 | asserta(T.IsRooted()); 162 | 163 | Splitter S; 164 | S.Run(T, n); 165 | S.WriteLabels(opt(prefix)); 166 | 167 | if (optset_output) 168 | { 169 | Tree2 Subtree; 170 | S.GetSubtree(Subtree); 171 | Subtree.ToNewickFile(opt(output)); 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/usage.txt: -------------------------------------------------------------------------------- 1 | # newick 2 | Manipulate and draw trees in Newick format. 3 | 4 | ## Downloads 5 | 6 | Binary files are self-contained, no dependencies. 7 | 8 | Linux [newick](https://github.com/rcedgar/newick/raw/main/binaries/newick) 9 | Windows [newick.exe](https://github.com/rcedgar/newick/raw/main/binaries/newick.exe) 10 | 11 | ## Usage 12 | 13 | Make a subset tree given file with leaf labels, one per line (labels 14 | do not need to be a subtree, the tree is collapsed as needed): 15 | newick -subset tree.newick -labels labels.txt -output subset.newick 16 | 17 | Get leaf labels: 18 | newick -getlabels tree.newick -output labels.txt 19 | 20 | Report miscellaneous information about a Newick file: 21 | newick -stats trees.newick 22 | 23 | Calculate Robinson-Foulds (R-F) distance between two trees: 24 | newick -rofo tree1.newick -tree2 tree2.newick -log rofo.log 25 | 26 | Calculate all-vs-all R-F distances between trees in Newick file: 27 | newick -rofos trees.newick -log rofos.log 28 | 29 | Re-label trees, labels.tsv tab-separated with #1=old_label #2=new_label: 30 | newick -relabel trees.newick -labels labels.tsv -output relabeled_trees.newick 31 | 32 | Add integer node number labels to internal nodes: 33 | newick -intlabel tree.newick -output intlabel.newick 34 | 35 | Root by outgroup, specify labels.txt with leaf labels of outgroup or GroupName which 36 | is a substring of the outgroup labels, e.g. 
phylum name if format is A1234.Phylum: 37 | newick trees.newick [-labels labels.txt | -outgroup GroupName] -output rooted.newick 38 | 39 | Convert tab-separated to Newick: 40 | newick -tsv2newick tree.tsv -output tree.newick 41 | 42 | Convert Newick to tab-separated: 43 | newick -newick2tsv tree.newick -output tree.tsv 44 | 45 | Ladderize trees by rotating internal nodes so that larger subtree is always the 46 | left (default) or right subtree: 47 | newick -ladderize trees.newick -output ladderized.newick [-right] 48 | 49 | Split tree into N roughly equal-sized subtrees (clusters), output is N files 50 | named prefixi, i=1..N containing labels for each subtree: 51 | newick -split tree.newick -n N -prefix prefix 52 | 53 | Convert trees to cladograms (leaves equidistant from root): 54 | newick -clado trees.newick -output clado.newick 55 | 56 | Calculate edge confidence values from set of bootstrapped trees: 57 | newick -conf tree.newick -trees replicates.newick -output conftree.newick 58 | 59 | Condense a tree by identifying best-fit nodes for each feature group and making 60 | a tree of just those nodes; unary edges are collapsed by summing lengths and 61 | taking max confidence, leaves are labeled with features (e.g. phylum names): 62 | newick -condense trees.newick -features features.tsv -output condensed.newick 63 | 64 | Extract just the branching order by collapsing unary nodes, deleting all edge lengths 65 | and deleting all confidence values (all internal node labels removed): 66 | newick -topo trees.newick -output topos.newick 67 | 68 | Delete one or more leaves and collapse any resulting unary nodes, useful e.g. 
for 69 | deleting outgroup to simplify figure: 70 | newick -deleteleaves trees.newick [-label OutgroupName | -labels labels.txt] -output .newick 71 | 72 | Draw one tree or several trees with optional coloring of edges: 73 | newick -draw tree.newick -svg figure.svg 74 | newick -drawf tree.newick -features features.tsv -colors colors.tsv -svg figure.svg 75 | newick -drawfs trees.newick -features features.tsv -colors colors.tsv -svg figure.svg 76 | 77 | -features is tsv file with #1 leaf_label #2 feature_name (e.g. phylum). 78 | -colors is tsv file with #1 feature_name #2 color, where color is any valid svg color, 79 | can be rgb, hex or name e.g. red. 80 | 81 | -default_color color 82 | Color for unlabeled edges (default gray). 83 | -title text 84 | Title text. 85 | -title_font_size n 86 | Title font size (default 10). 87 | -unitlengths 88 | Treat all edge lengths as 1 (phylogram). 89 | -strokewidth n 90 | Line width for edges (default 1). 91 | -tree_width n 92 | Width of tree (default 1000). 93 | -tree_height n 94 | Height of tree (default 1000). 95 | -tree_spacing n 96 | Space between trees (default 300). 97 | -trees_per_row n 98 | Number of trees per row in figure (default 4). 99 | -legend legend.svg 100 | Legend showing features (e.g. phylum names) and colors. 
101 | -------------------------------------------------------------------------------- /src/getcc.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | 3 | #define TRACE 0 4 | 5 | #if TRACE 6 | static vector g_Labels; 7 | #endif 8 | 9 | void GetConnComps(const vector &FromIndexes, const vector &ToIndexes, 10 | vector > &CCs, bool ShowProgress) 11 | { 12 | CCs.clear(); 13 | 14 | const uint EdgeCount = SIZE(FromIndexes); 15 | asserta(SIZE(ToIndexes) == EdgeCount); 16 | if (EdgeCount == 0) 17 | return; 18 | 19 | uint MaxIndex = 0; 20 | for (uint EdgeIndex = 0; EdgeIndex < EdgeCount; ++EdgeIndex) 21 | { 22 | uint From = FromIndexes[EdgeIndex]; 23 | uint To = ToIndexes[EdgeIndex]; 24 | 25 | if (From > MaxIndex) 26 | MaxIndex = From; 27 | if (To > MaxIndex) 28 | MaxIndex = To; 29 | } 30 | uint SeqCount = MaxIndex + 1; 31 | 32 | vector > AdjMx(SeqCount); 33 | for (uint NodeIndex = 0; NodeIndex < SeqCount; ++NodeIndex) 34 | AdjMx[NodeIndex].push_back(NodeIndex); 35 | 36 | for (uint EdgeIndex = 0; EdgeIndex < EdgeCount; ++EdgeIndex) 37 | { 38 | if (ShowProgress) 39 | ProgressStep(EdgeIndex, EdgeCount, "CCs adj mx"); 40 | 41 | uint From = FromIndexes[EdgeIndex]; 42 | asserta(From < SeqCount); 43 | 44 | uint To = ToIndexes[EdgeIndex]; 45 | asserta(To < SeqCount); 46 | #if TRACE 47 | Log("BuildMx %s(%u) -> %s(%u)\n", 48 | g_Labels[From].c_str(), From, g_Labels[To].c_str(), To); 49 | #endif 50 | if (From != To) 51 | { 52 | AdjMx[From].push_back(To); 53 | AdjMx[To].push_back(From); 54 | } 55 | } 56 | 57 | #if DEBUG 58 | { 59 | for (uint NodeIndex1 = 0; NodeIndex1 < SeqCount; ++NodeIndex1) 60 | { 61 | const vector &v1 = AdjMx[NodeIndex1]; 62 | for (uint i = 0; i < SIZE(v1); ++i) 63 | { 64 | uint NodeIndex2 = v1[i]; 65 | const vector &v2 = AdjMx[NodeIndex2]; 66 | bool Found = false; 67 | for (uint j = 0; j < SIZE(v2); ++j) 68 | { 69 | if (v2[j] == NodeIndex1) 70 | { 71 | Found = true; 72 | break; 73 | } 74 | } 75 | 
asserta(Found); 76 | } 77 | } 78 | } 79 | #endif 80 | 81 | #if TRACE 82 | { 83 | Log("AdjMx\n"); 84 | for (uint NodeIndex1 = 0; NodeIndex1 < SeqCount; ++NodeIndex1) 85 | { 86 | const vector &v = AdjMx[NodeIndex1]; 87 | Log("%s(%u): ", g_Labels[NodeIndex1].c_str(), NodeIndex1); 88 | for (uint i = 0; i < SIZE(v); ++i) 89 | { 90 | uint NodeIndex2 = v[i]; 91 | Log(" %s(%u)", g_Labels[NodeIndex2].c_str(), NodeIndex2); 92 | } 93 | Log("\n"); 94 | } 95 | } 96 | #endif 97 | vector Assigned(SeqCount, false); 98 | vector Pended(SeqCount, false); 99 | 100 | uint CCCount = 0; 101 | uint DoneCount = 0; 102 | for (uint NodeIndex = 0; NodeIndex < SeqCount; ++NodeIndex) 103 | { 104 | if (ShowProgress) 105 | ProgressStep(NodeIndex, SeqCount, "CCs clustering"); 106 | 107 | if (Assigned[NodeIndex]) 108 | { 109 | asserta(Pended[NodeIndex]); 110 | continue; 111 | } 112 | #if TRACE 113 | Log("\n"); 114 | Log("CC%u New CC, seed %u (%s)\n", CCCount, NodeIndex, g_Labels[NodeIndex].c_str()); 115 | #endif 116 | 117 | vector Empty; 118 | CCs.push_back(Empty); 119 | 120 | vector Pending; 121 | asserta(!Pended[NodeIndex]); 122 | Pending.push_back(NodeIndex); 123 | Pended[NodeIndex] = true; 124 | ++DoneCount; 125 | 126 | while (!Pending.empty()) 127 | { 128 | uint n = SIZE(Pending); 129 | asserta(n > 0); 130 | 131 | uint NodeIndex2 = Pending.back(); 132 | Pending.pop_back(); 133 | #if TRACE 134 | Log("CC%u pop %u(%s)\n", CCCount, NodeIndex2, g_Labels[NodeIndex2].c_str()); 135 | #endif 136 | 137 | asserta(NodeIndex2 < SeqCount); 138 | asserta(Pended[NodeIndex2]); 139 | asserta(!Assigned[NodeIndex2]); 140 | CCs[CCCount].push_back(NodeIndex2); 141 | #if TRACE 142 | Log("CC%u assign %u(%s)\n", CCCount, NodeIndex2, g_Labels[NodeIndex2].c_str()); 143 | #endif 144 | Assigned[NodeIndex2] = true; 145 | const vector &Neighbors = AdjMx[NodeIndex2]; 146 | uint NeighborCount = SIZE(Neighbors); 147 | for (uint i = 0; i < NeighborCount; ++i) 148 | { 149 | uint NeighborNodeIndex = Neighbors[i]; 150 | 
asserta(NeighborNodeIndex < SeqCount); 151 | #if TRACE 152 | Log("CC%u neighbor %u(%s) -> %u(%s), pended %c\n", 153 | CCCount, 154 | NodeIndex2, g_Labels[NodeIndex2].c_str(), 155 | NeighborNodeIndex, g_Labels[NeighborNodeIndex].c_str(), 156 | tof(Pended[NeighborNodeIndex])); 157 | #endif 158 | if (!Pended[NeighborNodeIndex]) 159 | { 160 | asserta(!Assigned[NeighborNodeIndex]); 161 | Pending.push_back(NeighborNodeIndex); 162 | Pended[NeighborNodeIndex] = true; 163 | ++DoneCount; 164 | #if TRACE 165 | Log("CC%u pend %u(%s)\n", CCCount, NeighborNodeIndex, g_Labels[NeighborNodeIndex].c_str()); 166 | #endif 167 | } 168 | } 169 | } 170 | ++CCCount; 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Newick](http://drive5.com/images/newick_header3.png) 2 | 3 | # newick 4 | Manipulate and draw trees in [Newick format](https://en.wikipedia.org/wiki/Newick_format). 5 | 6 | ## Downloads 7 | 8 | Binary files are self-contained, no dependencies. 
9 | 10 | Linux [newick](https://github.com/rcedgar/newick/raw/main/binaries/newick) 11 | Windows [newick.exe](https://github.com/rcedgar/newick/raw/main/binaries/newick.exe) 12 | 13 | ## Usage 14 | 15 | Make a subset tree given file with leaf labels, one per line (labels 16 | do not need to be a subtree, the tree is collapsed as needed): 17 | newick -subset tree.newick -labels labels.txt -output subset.newick 18 | 19 | Get leaf labels: 20 | newick -getlabels tree.newick -output labels.txt 21 | 22 | Report miscellaneous information about a Newick file: 23 | newick -stats trees.newick 24 | 25 | Calculate Robinson-Foulds (R-F) distance between two trees: 26 | newick -rofo tree1.newick -tree2 tree2.newick -log rofo.log 27 | 28 | Calculate all-vs-all R-F distances between trees in Newick file: 29 | newick -rofos trees.newick -log rofos.log 30 | 31 | Re-label trees, labels.tsv tab-separated with #1=old_label #2=new_label: 32 | newick -relabel trees.newick -labels labels.tsv -output relabeled_trees.newick 33 | 34 | Add integer node number labels to internal nodes: 35 | newick -intlabel tree.newick -output intlabel.newick 36 | 37 | Root by outgroup, specify labels.txt with leaf labels of outgroup or GroupName which 38 | is a substring of the outgroup labels, e.g. 
phylum name if format is A1234.Phylum: 39 | newick trees.newick [-labels labels.txt | -outgroup GroupName] -output rooted.newick 40 | 41 | Convert tab-separated to Newick: 42 | newick -tsv2newick tree.tsv -output tree.newick 43 | 44 | Convert Newick to tab-separated: 45 | newick -newick2tsv tree.newick -output tree.tsv 46 | 47 | Ladderize trees by rotating internal nodes so that larger subtree is always the 48 | left (default) or right subtree: 49 | newick -ladderize trees.newick -output ladderized.newick [-right] 50 | 51 | Split tree into N roughly equal-sized subtrees (clusters), output is N files 52 | named prefixi, i=1..N containing labels for each subtree: 53 | newick -split tree.newick -n N -prefix prefix 54 | 55 | Convert trees to cladograms (leaves equidistant from root): 56 | newick -clado trees.newick -output clado.newick 57 | 58 | Calculate edge confidence values from set of bootstrapped trees: 59 | newick -conf tree.newick -trees replicates.newick -output conftree.newick 60 | 61 | Condense a tree by identifying best-fit nodes for each feature group and making 62 | a tree of just those nodes; unary edges are collapsed by summing lengths and 63 | taking max confidence, leaves are labeled with features (e.g. phylum names): 64 | newick -condense trees.newick -features features.tsv -output condensed.newick 65 | 66 | Extract just the branching order by collapsing unary nodes, deleting all edge lengths 67 | and deleting all confidence values (all internal node labels removed): 68 | newick -topo trees.newick -output topos.newick 69 | 70 | Delete one or more leaves and collapse any resulting unary nodes, useful e.g. 
for 71 | deleting an outgroup to simplify a figure: 72 | newick -deleteleaves trees.newick [-label OutgroupName | -labels labels.txt] -output pruned.newick 73 | 74 | Draw one tree or several trees with optional coloring of edges: 75 | newick -draw tree.newick -svg figure.svg 76 | newick -drawf tree.newick -features features.tsv -colors colors.tsv -svg figure.svg 77 | newick -drawfs trees.newick -features features.tsv -colors colors.tsv -svg figure.svg 78 | 79 | -features is a tsv file with #1 leaf_label #2 feature_name (e.g. phylum). 80 | -colors is a tsv file with #1 feature_name #2 color, where color is any valid svg color, 81 | can be rgb, hex or name e.g. red. 82 | 83 | -default_color color 84 | Color for unlabeled edges (default gray). 85 | -title text 86 | Title text. 87 | -title_font_size n 88 | Title font size (default 10). 89 | -unitlengths 90 | Treat all edge lengths as 1 (phylogram). 91 | -strokewidth n 92 | Line width for edges (default 1). 93 | -tree_width n 94 | Width of tree (default 1000). 95 | -tree_height n 96 | Height of tree (default 1000). 97 | -tree_spacing n 98 | Space between trees (default 300). 99 | -trees_per_row n 100 | Number of trees per row in figure (default 4). 101 | -triangles w,h 102 | Draw triangles at leaves with width w and height h. 103 | -legend legend.svg 104 | Legend showing features (e.g. phylum names) and colors. 105 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore 5 | 6 | test_output/ 7 | test_logs/ 8 | github_releases/ 9 | 10 | # User-specific files 11 | *.rsuser 12 | *.suo 13 | *.user 14 | *.userosscache 15 | *.sln.docstates 16 | 17 | # User-specific files (MonoDevelop/Xamarin Studio) 18 | *.userprefs 19 | 20 | # Mono auto generated files 21 | mono_crash.* 22 | 23 | # Build results 24 | [Dd]ebug/ 25 | [Dd]ebugPublic/ 26 | [Rr]elease/ 27 | [Rr]eleases/ 28 | x64/ 29 | x86/ 30 | [Ww][Ii][Nn]32/ 31 | [Aa][Rr][Mm]/ 32 | [Aa][Rr][Mm]64/ 33 | bld/ 34 | [Bb]in/ 35 | [Oo]bj/ 36 | [Ll]og/ 37 | [Ll]ogs/ 38 | 39 | # Visual Studio 2015/2017 cache/options directory 40 | .vs/ 41 | __pycache__/ 42 | *.o 43 | *.pyc 44 | 45 | # Cake - Uncomment if you are using it 46 | # tools/** 47 | # !tools/packages.config 48 | 49 | # Tabs Studio 50 | *.tss 51 | 52 | # Telerik's JustMock configuration file 53 | *.jmconfig 54 | 55 | # BizTalk build output 56 | *.btp.cs 57 | *.btm.cs 58 | *.odx.cs 59 | *.xsd.cs 60 | 61 | # OpenCover UI analysis results 62 | OpenCover/ 63 | 64 | # Azure Stream Analytics local run output 65 | ASALocalRun/ 66 | 67 | # MSBuild Binary and Structured Log 68 | *.binlog 69 | 70 | # NVidia Nsight GPU debugger configuration file 71 | *.nvuser 72 | 73 | # MFractors (Xamarin productivity tool) working folder 74 | .mfractor/ 75 | 76 | # Local History for Visual Studio 77 | .localhistory/ 78 | 79 | # Visual Studio History (VSHistory) files 80 | .vshistory/ 81 | 82 | # BeatPulse healthcheck temp database 83 | healthchecksdb 84 | 85 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 86 | MigrationBackufiles 87 | _Chutzpah* 88 | 89 | # Visual C++ cache files 90 | ipch/ 91 | *.aps 92 | *.ncb 93 | *.opendb 94 | *.opensdf 95 | *.sdf 96 | *.cachefile 97 | *.VC.db 98 | *.VC.VC.opendb 99 | 100 | # Visual Studio profiler 101 | *.psess 102 | *.vsp 103 | *.vspx 104 | *.sap 105 | 106 | # Visual Studio Trace Files 107 | *.e2e 108 
| 109 | # TFS 2012 Local Workspace 110 | $tf/ 111 | 112 | # Guidance Automation Toolkit 113 | *.gpState 114 | 115 | # ReSharper is a .NET coding add-in 116 | _ReSharper*/ 117 | *.[Rr]e[Ss]harper 118 | *.DotSettings.user 119 | 120 | # TeamCity is a build add-in 121 | _TeamCity* 122 | 123 | # DotCover 124 | # files ending in .cache can be ignored 125 | *.[Cc]ache 126 | # but keep track of directories ending in .cache 127 | !?*.[Cc]ache/ 128 | 129 | # Others 130 | ClientBin/ 131 | ~$* 132 | *~ 133 | *.dbmdl 134 | *.dbproj.schemaview 135 | *.jfm 136 | *.pfx 137 | *.publishsettings 138 | orleans.codegen.cs 139 | 140 | # Including strong name files can present a security risk 141 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 142 | #*.snk 143 | 144 | # Since there are multiple workflows, uncomment next line to ignore bower_components 145 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 146 | #bower_components/ 147 | 148 | # RIA/Silverlight projects 149 | Generated_Code/ 150 | 151 | # Backup & report files from converting an old project file 152 | # to a newer Visual Studio version. 
Backup files are not needed, 153 | # because we have git ;-) 154 | _UpgradeReport_Files/ 155 | Backup*/ 156 | UpgradeLog*.XML 157 | UpgradeLog*.htm 158 | ServiceFabricBackup/ 159 | *.rptproj.bak 160 | 161 | # SQL Server files 162 | *.mdf 163 | *.ldf 164 | *.ndf 165 | 166 | # Business Intelligence projects 167 | *.rdl.data 168 | *.bim.layout 169 | *.bim_*.sep/ 170 | 171 | # Ionide (cross platform F# VS Code tools) working folder 172 | .ionide/ 173 | 174 | # Fody - auto-generated XML schema 175 | FodyWeavers.xsd 176 | 177 | # VS Code files for those working on multiple tools 178 | .vscode/* 179 | !.vscode/settings.json 180 | !.vscode/tasks.json 181 | !.vscode/launch.json 182 | !.vscode/extensions.json 183 | *.code-workspace 184 | 185 | # Local History for Visual Studio Code 186 | .history/ 187 | 188 | # Windows Installer files from build outputs 189 | *.cab 190 | *.msi 191 | *.msix 192 | *.msm 193 | *.msp 194 | 195 | # JetBrains Rider 196 | *.sln.iml 197 | asePS/ 198 | dlldata.c 199 | 200 | # Benchmark Results 201 | BenchmarkDotNet.Artifacts/ 202 | 203 | # .NET Core 204 | project.lock.json 205 | project.fragment.lock.json 206 | artifacts/ 207 | 208 | # ASP.NET Scaffolding 209 | ScaffoldingReadMe.txt 210 | 211 | # StyleCop 212 | StyleCopReport.xml 213 | 214 | # Files built by Visual Studio 215 | *_i.c 216 | *_p.c 217 | *_h.h 218 | *.ilk 219 | *.meta 220 | *.obj 221 | *.iobj 222 | *.pch 223 | *.pdb 224 | *.ipdb 225 | *.pgc 226 | *.pgd 227 | *.rsp 228 | *.sbr 229 | *.tlb 230 | *.tli 231 | *.tlh 232 | *.tmp 233 | *.tmp_proj 234 | *_wpftmp.csproj 235 | *.log 236 | *.tlog 237 | *.vspscc 238 | *.vssscc 239 | .builds 240 | *.pidb 241 | *.svclog 242 | *.scc 243 | make.stderr 244 | test.stderr 245 | make.stdout 246 | 247 | # Chutzpah Test 248 | 249 | CYGWIN*/ 250 | [lL]inux/ 251 | Darwin/ 252 | 253 | gitver.txt 254 | -------------------------------------------------------------------------------- /src/newicktree.cpp: 
-------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "newicktree.h" 3 | 4 | void NewickTree::LogMe() const 5 | { 6 | const uint NodeCount = GetNodeCount(); 7 | asserta(SIZE(m_Parents) == NodeCount); 8 | asserta(SIZE(m_Labels) == NodeCount); 9 | asserta(SIZE(m_Lengths) == NodeCount); 10 | 11 | Log("\n"); 12 | Log("%u nodes\n", NodeCount); 13 | Log(" Node Parent Length Type Children Label\n"); 14 | // 1234567890 1234567890 1234567890 1234 12345678 15 | vector > Edges; 16 | GetNonParentEdges(Edges); 17 | for (uint Node = 0; Node < NodeCount; ++Node) 18 | { 19 | uint Parent = m_Parents[Node]; 20 | const string &Label = m_Labels[Node]; 21 | double Length = m_Lengths[Node]; 22 | bool IsLeaf = m_IsLeafs[Node]; 23 | const vector &NodeEdges = Edges[Node]; 24 | uint ChildCount = SIZE(NodeEdges); 25 | 26 | Log("%10u", Node); 27 | if (Parent == UINT_MAX) 28 | Log(" %10.10s", "*"); 29 | else 30 | Log(" %10u", Parent); 31 | if (Length == MISSING_LENGTH) 32 | Log(" %10.10s", "*"); 33 | else 34 | Log(" %10.4g", Length); 35 | if (Node == m_Root) 36 | Log(" %4.4s", "Root"); 37 | else 38 | Log(" %4.4s", IsLeaf ? 
"Leaf" : "Int"); 39 | if (!IsLeaf) 40 | Log(" %8u", ChildCount); 41 | else 42 | Log(" %8.8s", ""); 43 | Log(" %s", Label.c_str()); 44 | Log("\n"); 45 | } 46 | } 47 | 48 | uint NewickTree::GetLeafCount() const 49 | { 50 | const uint NodeCount = GetNodeCount(); 51 | uint LeafCount = 0; 52 | for (uint Node = 0; Node < NodeCount; ++Node) 53 | { 54 | bool IsLeaf = m_IsLeafs[Node]; 55 | if (IsLeaf) 56 | ++LeafCount; 57 | } 58 | return LeafCount; 59 | } 60 | 61 | void NewickTree::Validate() const 62 | { 63 | const uint NodeCount = GetNodeCount(); 64 | asserta(SIZE(m_Parents) == NodeCount); 65 | asserta(SIZE(m_Labels) == NodeCount); 66 | asserta(SIZE(m_Lengths) == NodeCount); 67 | asserta(SIZE(m_Labels) == NodeCount); 68 | asserta(SIZE(m_IsLeafs) == NodeCount); 69 | asserta(m_Root < NodeCount); 70 | 71 | // Exactly one root node 72 | // (If actually unrooted, arbitrary choice) 73 | bool RootFound = false; 74 | 75 | // Every node except leaves must have at least one child 76 | vector HasChild(NodeCount, false); 77 | for (uint Node = 0; Node < NodeCount; ++Node) 78 | { 79 | uint Parent = m_Parents[Node]; 80 | if (Parent == UINT_MAX) 81 | { 82 | asserta(!RootFound); 83 | asserta(Node == m_Root); 84 | RootFound = true; 85 | } 86 | else 87 | { 88 | asserta(Parent < NodeCount); 89 | HasChild[Parent] = true; 90 | } 91 | } 92 | 93 | for (uint Node = 0; Node < NodeCount; ++Node) 94 | { 95 | bool IsLeaf = m_IsLeafs[Node]; 96 | bool IsParent = HasChild[Node]; 97 | asserta(IsLeaf || IsParent); 98 | asserta(!(IsLeaf && IsParent)); 99 | } 100 | } 101 | 102 | double NewickTree::GetLength(uint Node) const 103 | { 104 | asserta(Node < SIZE(m_Lengths)); 105 | double Length = m_Lengths[Node]; 106 | return Length; 107 | } 108 | 109 | uint NewickTree::GetParent(uint Node) const 110 | { 111 | asserta(Node < SIZE(m_Parents)); 112 | uint Parent = m_Parents[Node]; 113 | return Parent; 114 | } 115 | 116 | const string &NewickTree::GetLabel(uint Node) const 117 | { 118 | asserta(Node < 
SIZE(m_Labels)); 119 | const string &Label = m_Labels[Node]; 120 | return Label; 121 | } 122 | 123 | bool NewickTree::HasBinaryRoot() const 124 | { 125 | vector > Edges; 126 | GetNonParentEdges(Edges); 127 | const uint NodeCount = GetNodeCount(); 128 | asserta(SIZE(Edges) == NodeCount); 129 | asserta(m_Root < NodeCount); 130 | const vector &RootNodeEdges = Edges[m_Root]; 131 | uint RootChildCount = SIZE(RootNodeEdges); 132 | if (RootChildCount == 2) 133 | return true; 134 | return false; 135 | } 136 | 137 | bool NewickTree::IsBinary() const 138 | { 139 | vector > Edges; 140 | GetNonParentEdges(Edges); 141 | const uint NodeCount = GetNodeCount(); 142 | asserta(SIZE(Edges) == NodeCount); 143 | asserta(m_Root < NodeCount); 144 | for (uint NodeIndex = 0; NodeIndex < NodeCount; ++NodeIndex) 145 | { 146 | const vector &NodeEdges = Edges[NodeIndex]; 147 | uint ChildCount = SIZE(NodeEdges); 148 | if (ChildCount == 0) 149 | continue; 150 | if (NodeIndex == m_Root) 151 | { 152 | if (ChildCount != 2 && ChildCount != 3) 153 | return false; 154 | } 155 | else 156 | { 157 | if (ChildCount != 2) 158 | return false; 159 | } 160 | } 161 | return true; 162 | } 163 | 164 | void NewickTree::GetNonParentEdges(vector > &Edges) const 165 | { 166 | Edges.clear(); 167 | const uint NodeCount = GetNodeCount(); 168 | Edges.resize(NodeCount); 169 | 170 | bool RootFound = false; 171 | for (uint Node = 0; Node < NodeCount; ++Node) 172 | { 173 | uint Parent = m_Parents[Node]; 174 | if (Parent == UINT_MAX) 175 | { 176 | asserta(!RootFound); 177 | RootFound = true; 178 | continue; 179 | } 180 | asserta(Parent < NodeCount); 181 | Edges[Parent].push_back(Node); 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/tree2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "newicktree.h" 5 | 6 | class Tree2; 7 | class TreeN; 8 | typedef void fn_OnNode(const Tree2 &T, uint 
Node); 9 | 10 | class Tree2 11 | { 12 | public: 13 | uint m_Root; // UINT_MAX if not rooted 14 | vector m_Nbrs1; 15 | vector m_Nbrs2; 16 | vector m_Nbrs3; 17 | vector m_Labels; 18 | map, double> m_EdgeToLength; 19 | 20 | public: 21 | Tree2() { Clear(); } 22 | void Clear() 23 | { 24 | m_Root = UINT_MAX; 25 | m_Nbrs1.clear(); 26 | m_Nbrs2.clear(); 27 | m_Nbrs3.clear(); 28 | m_Labels.clear(); 29 | m_EdgeToLength.clear(); 30 | } 31 | 32 | public: 33 | void FromData(const char *Data, uint Bytes); 34 | void FromFile(const string &FileName); 35 | void FromNewickTree(const NewickTree &T); 36 | void FromNewickFile(const string &FileName); 37 | void FromCStr(const char *CStr); 38 | void FromStr(const string &Str) { FromCStr(Str.c_str()); } 39 | void FromVectors(const vector &Labels, 40 | const vector &Parents, const vector &Lengths); 41 | void FromTree(const Tree2 &T); 42 | void FromTreeN(const TreeN &T); 43 | 44 | void ToNewickFile(const string &FileName) const; 45 | void ToNewickFile(FILE *f) const; 46 | void ToNewickStr(string &Str, bool WithLineBreaks = false) const; 47 | 48 | void GetLabelToLeafNodeIndexMap(map &LabelToLeafNodeIndex) const; 49 | 50 | void LogMe(FILE *f = g_fLog) const; 51 | void ToTSV(FILE *f) const; 52 | void ToJust(FILE *f) const; 53 | void ToTSV(const string &FileName) const; 54 | void ToTSVStrings(vector &Lines) const; 55 | void ToJustifiedStrings(vector &Lines) const; 56 | void FromTSVStrings(const vector &Lines); 57 | void FromTSVFile(const string &FileName); 58 | 59 | void Unroot(); 60 | void SetRoot(uint Node1, uint Node2); 61 | void Rotate(uint Node); 62 | uint Ladderize(bool MoreRight); 63 | 64 | uint GetRoot() const { return m_Root; } 65 | bool IsRooted() const { return m_Root != UINT_MAX; } 66 | uint GetNodeCount() const { return SIZE(m_Nbrs1); } 67 | uint GetEdgeCount() const; 68 | uint GetLeafCount() const; 69 | void Validate() const; 70 | void ValidateEdge(uint Node, uint Edge) const; 71 | bool IsLeaf(uint Node) const; 72 | bool 
IsRoot(uint Node) const; 73 | uint GetNodeByLabel(const string &Label, bool ErrorIfNotFound) const; 74 | uint GetNodeByAcc(const string &Acc, bool ErrorIfNotFound) const; 75 | const string &GetLabel(uint Node) const; 76 | void GetLabel(uint Node, string &Label) const; 77 | void SetEdgeLength(uint Node, double Length); 78 | double GetEdgeLength(uint Node1, uint Node2, bool FailOnError = true) const; 79 | double GetEdgeLengthToParent(uint Node, bool FailOnError = true) const; 80 | double GetEdgeLengthToLeftChild(uint Node, bool FailOnError = true) const; 81 | double GetEdgeLengthToRightChild(uint Node, bool FailOnError = true) const; 82 | void AppendLeaves(uint Node, vector &Leaves) const; 83 | uint GetSubtreeLeafCount(uint Node) const; 84 | void GetSubtreeLeafLabels(uint Node, vector &Labels) const; 85 | void GetSubtreeLeafNodes(uint Node, vector &LeafNodes) const; 86 | void GetSubtrees(vector > &LeafNodesVec) const; 87 | void GetLeafLabels(vector &Labels, bool ErrorIfEmpty) const; 88 | void GetPathToRoot(uint Node, vector &Path) const; 89 | double GetDistance(uint Node, uint AncNode) const; 90 | void FindNCA(const vector &Nodes, uint &Node1, uint &Node2) const; 91 | 92 | uint GetLeft(uint Node) const; 93 | uint GetRight(uint Node) const; 94 | uint GetParent(uint Node) const; 95 | 96 | uint GetEdge1(uint Node) const; 97 | uint GetEdge2(uint Node) const; 98 | uint GetEdge3(uint Node) const; 99 | uint GetEdge(uint Node, uint i) const; 100 | bool IsEdge(uint Node1, uint Ndoe2) const; 101 | 102 | double GetRootDist(uint Node) const; 103 | void GetRootDists(vector &Dists) const; 104 | void GetLeafRootDists(vector &Dists) const; 105 | 106 | double GetMaxLeafDist(uint Node) const; 107 | uint GetNodeCountToRoot(uint Node) const; 108 | uint GetMaxNodeCountToRoot() const; 109 | uint GetNodeCountToFurthestLeaf(uint Node) const; 110 | 111 | void Inorder(uint Node, fn_OnNode OnNode) const; 112 | void Preorder(uint Node, fn_OnNode OnNode) const; 113 | void Postorder(uint Node, 
fn_OnNode OnNode) const; 114 | 115 | void SetLength(uint Node1, uint Node2, double Length); 116 | bool IsEdgeInMap(uint Node1, uint Node2) const; 117 | 118 | private: 119 | void SetEdge(uint FromNode, uint i, uint ToNode); 120 | void OrientNode(uint Node, uint Parent); 121 | void GetEdgePair(uint Node1, uint Node2, 122 | pair &EdgePair, bool FailOnError = true) const; 123 | 124 | private: 125 | void AppendNodeToNewickStr(string &Str, uint Node, bool WithLineBreaks) const; 126 | }; 127 | 128 | void GenerateRandomTree(uint LeafCount, bool Rooted, 129 | double MinLength, double MaxLength, Tree2 &T); 130 | 131 | void MakeSubsetNodes(const Tree2 &InputTree, 132 | const vector &SubsetNodes, 133 | const vector &SubsetLabels, 134 | Tree2 &SubsetTree); 135 | -------------------------------------------------------------------------------- /src/fixft.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "newickparser2.h" 3 | #include "tree2.h" 4 | 5 | uint NewickParser2::FixFT() 6 | { 7 | if (IsBinary()) 8 | return 0; 9 | 10 | vector > Edges; 11 | GetNonParentEdges(Edges); 12 | const uint NodeCount = GetNodeCount(); 13 | asserta(SIZE(Edges) == NodeCount); 14 | asserta(m_Root < NodeCount); 15 | uint DupeCount = 0; 16 | 17 | vector NewParents = m_Parents; 18 | vector NewLabels = m_Labels; 19 | vector NewIsLeafs = m_IsLeafs; 20 | vector NewLengths = m_Lengths; 21 | 22 | for (uint Node = 0; Node < NodeCount; ++Node) 23 | { 24 | const vector &NodeEdges = Edges[Node]; 25 | uint ChildCount = SIZE(NodeEdges); 26 | if (ChildCount == 0) 27 | continue; 28 | if (ChildCount == 1) 29 | Die("Unary node"); 30 | if (Node == m_Root) 31 | { 32 | if (ChildCount != 2 && ChildCount != 3) 33 | Die("Root is non-binary"); 34 | } 35 | else 36 | { 37 | if (ChildCount > 2) 38 | { 39 | DupeCount += ChildCount - 1; 40 | for (uint i = 0; i < ChildCount; ++i) 41 | { 42 | uint Edge = NodeEdges[i]; 43 | bool IsLeaf = m_IsLeafs[Edge]; 44 | 
double Length = m_Lengths[Edge]; 45 | if (!IsLeaf) 46 | Die("%u-ary node has non-leaf child", ChildCount); 47 | if (Length != 0) 48 | Die("%u-ary node length %.3g", ChildCount, Length); 49 | } 50 | 51 | uint FirstNewNode = SIZE(NewLabels); 52 | for (uint i = 0; i + 2 < ChildCount; ++i) 53 | { 54 | uint ChildNode = NodeEdges[i]; 55 | uint NewNode = FirstNewNode + i; 56 | 57 | uint NewLeftChild = (i == 0 ? NodeEdges[0] : NewNode - 1); 58 | uint NewRightChild = NodeEdges[i+1]; 59 | uint NewParent = (i + 3 == ChildCount ? Node : NewNode + 1); 60 | 61 | if (i == 0) 62 | assert(m_Parents[NewLeftChild] == Node); 63 | assert(m_Parents[NewRightChild] == Node); 64 | 65 | NewParents[NewLeftChild] = NewNode; 66 | NewParents[NewRightChild] = NewNode; 67 | 68 | NewLabels.push_back(""); 69 | NewParents.push_back(NewParent); 70 | NewLengths.push_back(0); 71 | NewIsLeafs.push_back(false); 72 | 73 | m_Labels = NewLabels; 74 | m_IsLeafs = NewIsLeafs; 75 | m_Parents = NewParents; 76 | m_Lengths = NewLengths; 77 | } 78 | // Validate(); 79 | } 80 | } 81 | } 82 | Validate(); 83 | return DupeCount; 84 | } 85 | 86 | void cmd_fixft() 87 | { 88 | const string &InputFileName = opt(fixft); 89 | const string &OutputFileName = opt(output); 90 | 91 | NewickParser2 NP; 92 | NP.FromFile(InputFileName); 93 | #if 0 94 | vector > Edges; 95 | NP.GetNonParentEdges(Edges); 96 | const uint NodeCount = NP.GetNodeCount(); 97 | asserta(SIZE(Edges) == NodeCount); 98 | asserta(NP.m_Root < NodeCount); 99 | uint DupeCount = 0; 100 | 101 | vector NewParents = NP.m_Parents; 102 | vector NewLabels = NP.m_Labels; 103 | vector NewIsLeafs = NP.m_IsLeafs; 104 | vector NewLengths = NP.m_Lengths; 105 | 106 | for (uint Node = 0; Node < NodeCount; ++Node) 107 | { 108 | const vector &NodeEdges = Edges[Node]; 109 | uint ChildCount = SIZE(NodeEdges); 110 | if (ChildCount == 0) 111 | continue; 112 | if (ChildCount == 1) 113 | Die("Unary node"); 114 | if (Node == NP.m_Root) 115 | { 116 | if (ChildCount != 2 && ChildCount != 3) 
117 | Die("Root is non-binary"); 118 | } 119 | else 120 | { 121 | if (ChildCount > 2) 122 | { 123 | DupeCount += ChildCount - 1; 124 | for (uint i = 0; i < ChildCount; ++i) 125 | { 126 | uint Edge = NodeEdges[i]; 127 | bool IsLeaf = NP.m_IsLeafs[Edge]; 128 | double Length = NP.m_Lengths[Edge]; 129 | if (!IsLeaf) 130 | Die("%u-ary node has non-leaf child", ChildCount); 131 | if (Length != 0) 132 | Die("%u-ary node length %.3g", ChildCount, Length); 133 | } 134 | 135 | uint FirstNewNode = SIZE(NewLabels); 136 | for (uint i = 0; i + 2 < ChildCount; ++i) 137 | { 138 | uint ChildNode = NodeEdges[i]; 139 | uint NewNode = FirstNewNode + i; 140 | 141 | uint NewLeftChild = (i == 0 ? NodeEdges[0] : NewNode - 1); 142 | uint NewRightChild = NodeEdges[i+1]; 143 | uint NewParent = (i + 3 == ChildCount ? Node : NewNode + 1); 144 | 145 | if (i == 0) 146 | assert(NP.m_Parents[NewLeftChild] == Node); 147 | assert(NP.m_Parents[NewRightChild] == Node); 148 | 149 | NewParents[NewLeftChild] = NewNode; 150 | NewParents[NewRightChild] = NewNode; 151 | 152 | NewLabels.push_back(""); 153 | NewParents.push_back(NewParent); 154 | NewLengths.push_back(0); 155 | NewIsLeafs.push_back(false); 156 | 157 | NP.m_Labels = NewLabels; 158 | NP.m_IsLeafs = NewIsLeafs; 159 | NP.m_Parents = NewParents; 160 | NP.m_Lengths = NewLengths; 161 | } 162 | NP.Validate(); 163 | } 164 | } 165 | } 166 | NP.Validate(); 167 | #endif // 0 168 | 169 | uint DupeCount = NP.FixFT(); 170 | ProgressLog("%u dupes resolved\n", DupeCount); 171 | 172 | Tree2 T; 173 | T.FromNewickTree(NP); 174 | T.ToNewickFile(OutputFileName); 175 | } 176 | -------------------------------------------------------------------------------- /src/randtree.cpp: -------------------------------------------------------------------------------- 1 | #include "myutils.h" 2 | #include "tree2.h" 3 | #include "treen.h" 4 | #include "treex.h" 5 | #include 6 | #include 7 | 8 | void StringsFromFile(const string &FileName, vector &Strings); 9 | 10 | /*** 11 | 
Fisher-Yates shuffle: 12 | To shuffle an array a of n elements (indices 0 .. n-1): 13 | for i from n - 1 downto 1 do 14 | j := random integer with 0 <= j <= i 15 | exchange a[j] and a[i] 16 | ***/ 17 | void Shuffle(vector &v) 18 | { 19 | const unsigned N = SIZE(v); 20 | for (unsigned i = N - 1; i >= 1; --i) 21 | { 22 | unsigned j = randu32()%(i + 1); 23 | 24 | unsigned vi = v[i]; 25 | unsigned vj = v[j]; 26 | 27 | v[i] = vj; 28 | v[j] = vi; 29 | } 30 | } 31 | 32 | /*** 33 | Unrooted: 34 | N is even 35 | E = N - 1 36 | L = N/2 + 1 37 | N = 2*(L - 1) 38 | ***/ 39 | 40 | static double GetRandomLength(double MinLength, double MaxLength) 41 | { 42 | asserta(MinLength <= MaxLength); 43 | uint r = randu32()%100; 44 | double d = MinLength + ((MaxLength - MinLength)*r)/100.0; 45 | asserta(d <= MaxLength); 46 | return d; 47 | } 48 | 49 | static double GetLength(double Min, double Max) 50 | { 51 | const uint M = 1000000; 52 | uint r = randu32()%M; 53 | double f = double(r)/M; 54 | double Length = Min + f*f*(Max - Min); 55 | return Length; 56 | } 57 | 58 | static uint GetRandomPending(vector &Pending) 59 | { 60 | uint N = SIZE(Pending); 61 | asserta(N > 0); 62 | uint r = randu32()%N; 63 | uint NextNode = Pending[r]; 64 | vector NewPending; 65 | for (uint i = 0; i < N; ++i) 66 | { 67 | uint PendingNode = Pending[i]; 68 | if (PendingNode != NextNode) 69 | NewPending.push_back(PendingNode); 70 | } 71 | Pending = NewPending; 72 | return NextNode; 73 | } 74 | 75 | void GenerateRandomTree(const vector &LeafLabels, bool Rooted, 76 | double MinLength, double MaxLength, Tree2 &T) 77 | { 78 | const uint LeafCount = SIZE(LeafLabels); 79 | 80 | asserta(LeafCount > 1); 81 | asserta(MinLength <= MaxLength); 82 | T.Clear(); 83 | 84 | uint NodeCount = 2*LeafCount - 1; 85 | vector Lengths; 86 | for (uint i = 0; i < NodeCount; ++i) 87 | Lengths.push_back(GetLength(MinLength, MaxLength)); 88 | 89 | vector Parents(NodeCount, UINT_MAX); 90 | 91 | vector Labels; 92 | vector Pending; 93 | for (uint 
i = 0; i < LeafCount; ++i) 94 | { 95 | Pending.push_back(i); 96 | Labels.push_back(LeafLabels[i]); 97 | } 98 | 99 | uint InternalNodeCount = LeafCount - 1; 100 | for (uint i = 0; i < InternalNodeCount; ++i) 101 | { 102 | uint Parent = LeafCount + i; 103 | uint Left = GetRandomPending(Pending); 104 | uint Right = GetRandomPending(Pending); 105 | 106 | double Length = GetLength(MinLength, MaxLength); 107 | Labels.push_back(""); 108 | 109 | asserta(Parents[Left] == UINT_MAX); 110 | asserta(Parents[Right] == UINT_MAX); 111 | 112 | Parents[Left] = Parent; 113 | Parents[Right] = Parent; 114 | 115 | Pending.push_back(Parent); 116 | } 117 | 118 | T.FromVectors(Labels, Parents, Lengths); 119 | if (!Rooted) 120 | { 121 | // T.LogMe(); 122 | T.Unroot(); 123 | } 124 | // T.LogMe(); 125 | T.Validate(); 126 | } 127 | 128 | void GenerateRandomTree(const vector &LeafLabels, bool Rooted, 129 | double MinLength, double MaxLength, TreeN &T) 130 | { 131 | Tree2 T2; 132 | GenerateRandomTree(LeafLabels, Rooted, MinLength, MaxLength, T2); 133 | T.FromTree2(T2); 134 | } 135 | 136 | void GenerateRandomTree(const vector &LeafLabels, bool Rooted, 137 | double MinLength, double MaxLength, TreeX &T) 138 | { 139 | TreeN TN; 140 | GenerateRandomTree(LeafLabels, Rooted, MinLength, MaxLength, TN); 141 | string Str; 142 | TN.ToNewickStr(Str, true); 143 | T.FromNewickStr(Str); 144 | } 145 | 146 | void GenerateRandomTree(uint LeafCount, bool Rooted, 147 | double MinLength, double MaxLength, Tree2 &T) 148 | { 149 | asserta(LeafCount > 1); 150 | asserta(MinLength <= MaxLength); 151 | T.Clear(); 152 | 153 | vector Labels; 154 | for (uint i = 0; i < LeafCount; ++i) 155 | { 156 | string Label; 157 | Ps(Label, "Leaf%u", i+1); 158 | Labels.push_back(Label); 159 | } 160 | 161 | GenerateRandomTree(Labels, Rooted, MinLength, MaxLength, T); 162 | } 163 | 164 | static void _cmd_test() 165 | { 166 | opt(test); 167 | ResetRand(1); 168 | Tree2 T; 169 | bool Rooted = true; 170 | for (uint i = 0; i < 100; ++i) 171 | 
{ 172 | ProgressStep(i, 100, "Rand trees"); 173 | uint LeafCount = 3 + randu32()%10; 174 | Rooted = !Rooted; 175 | GenerateRandomTree(LeafCount, Rooted, 0.1, 0.2, T); 176 | T.LogMe(); 177 | } 178 | } 179 | 180 | void cmd_randtree() 181 | { 182 | const string &LabelsFileName = opt(randtree); 183 | double MinLength = 1.0; 184 | double MaxLength = 1.0; 185 | if (optset_minlength) 186 | MinLength = opt(minlength); 187 | if (optset_maxlength) 188 | MaxLength = opt(maxlength); 189 | asserta(MinLength <= MaxLength); 190 | const bool IsRooted = opt(rooted); 191 | 192 | vector Labels; 193 | StringsFromFile(LabelsFileName, Labels); 194 | 195 | Tree2 T; 196 | GenerateRandomTree(Labels, IsRooted, MinLength, MaxLength, T); 197 | T.ToNewickFile(opt(output)); 198 | } 199 | --------------------------------------------------------------------------------