├── phylopandas ├── treeio │ ├── tests │ │ ├── __init__.py │ │ └── test_read.py │ ├── __init__.py │ ├── read.py │ └── write.py ├── __version__.py ├── utils.py ├── seqio │ ├── __init__.py │ ├── read.py │ └── write.py ├── tests │ ├── test_series.py │ ├── __init__.py │ ├── dat │ │ ├── PF08793_seed.phylip │ │ ├── PF08793_seed.fasta │ │ └── PF08793_seed.clustal │ └── test_frame.py ├── __init__.py └── core.py ├── docs ├── _pages │ ├── add-format.rst │ ├── formats.rst │ ├── cookbook.rst │ └── dataframe.rst ├── _images │ ├── jlab.png │ └── tree.png ├── _logo │ ├── banner.png │ ├── logo-2.png │ ├── logo-02.svg │ ├── banner.svg │ ├── logo-2.svg │ └── logo.svg ├── Makefile ├── make.bat ├── index.rst └── conf.py ├── requirements.txt ├── .travis.yml ├── setup.cfg ├── examples ├── PF08793_seed.newick ├── PF08793_seed.phylip ├── PF08793_seed.fasta ├── PF08793_seed.clustal └── intro-notebook.ipynb ├── LICENSE ├── .gitignore ├── README.md └── setup.py /phylopandas/treeio/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import pytest -------------------------------------------------------------------------------- /docs/_pages/add-format.rst: -------------------------------------------------------------------------------- 1 | Add a format 2 | ============ 3 | 4 | To add a format... 5 | -------------------------------------------------------------------------------- /docs/_images/jlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zsailer/phylopandas/HEAD/docs/_images/jlab.png -------------------------------------------------------------------------------- /docs/_images/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zsailer/phylopandas/HEAD/docs/_images/tree.png -------------------------------------------------------------------------------- /docs/_logo/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zsailer/phylopandas/HEAD/docs/_logo/banner.png -------------------------------------------------------------------------------- /docs/_logo/logo-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zsailer/phylopandas/HEAD/docs/_logo/logo-2.png -------------------------------------------------------------------------------- /phylopandas/__version__.py: -------------------------------------------------------------------------------- 1 | 2 | VERSION = (0, 8, 0) 3 | 4 | __version__ = '.'.join(map(str, VERSION)) 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=0.22.0 2 | pandas_flavor>=0.1.0 3 | biopython 4 | dendropy 5 | vega 6 | phylovega -------------------------------------------------------------------------------- /phylopandas/treeio/__init__.py: -------------------------------------------------------------------------------- 1 | from .read import (read_nexml, 2 | read_nexus_tree, 3 | read_newick, 4 | read_dendropy) 5 | 6 | from . 
import write 7 | -------------------------------------------------------------------------------- /docs/_pages/formats.rst: -------------------------------------------------------------------------------- 1 | Supported Formats 2 | ================= 3 | 4 | Sequence Formats 5 | ---------------- 6 | 7 | - Fasta 8 | - 9 | 10 | 11 | Tree formats 12 | ------------ 13 | 14 | - Newick 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # https://travis-ci.org/Zsailer/phylopandas 2 | language: python 3 | python: 4 | - 3.6 5 | - 3.5 6 | install: 7 | - pip install -r requirements.txt 8 | - python setup.py install 9 | script: pytest 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 5 | universal=1 6 | -------------------------------------------------------------------------------- /phylopandas/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | def get_random_id(length): 5 | """Generate a random, alpha-numerical id.""" 6 | alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits 7 | return ''.join(random.choice(alphabet) for _ in range(length)) 8 | -------------------------------------------------------------------------------- /phylopandas/seqio/__init__.py: -------------------------------------------------------------------------------- 1 | from .read import (read_embl, 2 | read_fasta, 3 | read_fastq, 4 | read_nexus_seq, 5 | read_swiss, 6 | read_phylip, 7 | read_clustal, 8 | read_blast_xml, 9 | read_phylip_relaxed, 10 | read_phylip_sequential) 11 | 12 | from . import write 13 | -------------------------------------------------------------------------------- /phylopandas/tests/test_series.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | import phylopandas as ph 5 | from . import path_to_dat, clean_dat # noqa 6 | 7 | 8 | @pytest.mark.usefixtures("clean_dat") 9 | def test_to_fasta(path_to_dat): 10 | path = os.path.join(path_to_dat, 'PF08793_seed.fasta') 11 | df = ph.read_fasta(path) 12 | 13 | # Extract a single row 14 | row = df.iloc[0] 15 | 16 | # Write row to fasta 17 | fasta_path = os.path.join(path_to_dat, 'test.fasta') 18 | row.phylo.to_fasta(fasta_path) 19 | assert os.path.exists(fasta_path) 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = phylopandas 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. 
$(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /phylopandas/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os, glob 3 | 4 | @pytest.fixture 5 | def path_to_dat(): 6 | """Get path to dat folder with test data.""" 7 | # Get path to test directory. 8 | path_to_test = os.path.dirname(os.path.realpath(__file__)) 9 | 10 | # Build path to dat 11 | path_to_dat = os.path.join(path_to_test, 'dat') 12 | return path_to_dat 13 | 14 | @pytest.fixture() 15 | def clean_dat(path_to_dat): 16 | yield clean_dat 17 | 18 | # Get files in dat_files 19 | dat_files = glob.glob(os.path.join(path_to_dat,"*")) 20 | 21 | # Remove files from dat folder that begin with 'test'. 22 | for datf in dat_files: 23 | path, f = os.path.split(datf) 24 | if f[:4] == 'test': 25 | os.remove(datf) 26 | -------------------------------------------------------------------------------- /examples/PF08793_seed.newick: -------------------------------------------------------------------------------- 1 | (Q8QUQ5_ISKNN/45-79:0.38376442,Q8QUQ6_ISKNN/37-75:0.93473288,(Q8QUQ5_ISKNN/123-157:1.14582942,(Q0E553_SFAVA/142-176:0.94308689,(Q0E553_SFAVA/184-218:0.98977147,(Q0E553_SFAVA/60-94:0.95706148,(((019R_FRG3G/5-39:0.06723315,(019R_FRG3G/139-172:0.05690376,(019R_FRG3G/249-283:0.95772959,019R_FRG3G/302-336:0.58361302)2.745285:0.61968795)1.680162:0.12814819)8.545520:0.30724093,((VF232_IIV6/64-98:0.77338949,((VF380_IIV6/7-45:0.56133629,VF380_IIV3/8-47:0.64307079)7.484104:0.37367018,(VF378_IIV6/4-38:0.31530205,O41158_PBCV1/63-96:0.46076842)1.909391:0.20522645)0.218717:0.09388521)2.531435:0.20551347,Q0E553_SFAVA/14-48:1.58834786)0.265099:0.00027193)6.209727:0.37908212,(Q8QUQ5_ISKNN/164-198:0.63907222,Q8QUQ5_ISKNN/7-42:0.96743219)2.806276:0.362965)0.677978:0.20054193)0.718698:0.20642561)2.503850:0.27168922)1.162623:0.15868612)6.040602:0.48939921); 2 | -------------------------------------------------------------------------------- /docs/_pages/cookbook.rst: -------------------------------------------------------------------------------- 1 | Cookbook 2 | ======== 3 | 4 | Merge two sequence files 5 | ------------------------ 6 | 7 | Use ``pandas.concat`` to merge two sequence files. 8 | 9 | .. code-block:: python 10 | 11 | import phylopandas as ph 12 | 13 | seq1 = ph.read_fasta('seq1.fasta') 14 | seq2 = ph.read_fasta('seq2.fasta') 15 | 16 | # Merge two files 17 | seqs = seq1.concat(seq2, ignore_index=False) 18 | 19 | # Write to file. 20 | seqs.to_fasta('seqs.fasta') 21 | 22 | 23 | 24 | Merge alignment and sequence data 25 | --------------------------------- 26 | 27 | Add alignment column to sequence DataFrame. 28 | 29 | .. code-block:: python 30 | 31 | import phylopandas as ph 32 | 33 | # Read sequences and alignments. 34 | seq = ph.read_fasta('sequences.fasta') 35 | ali = ph.read_fasta('alignment.fasta') 36 | 37 | # Merge data. 38 | seq.merge(ali, on='id') 39 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 12 | set SPHINXPROJ=phylopandas 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /phylopandas/treeio/tests/test_read.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ..read import read_newick 4 | 5 | # Newick strings pulled from https://en.wikipedia.org/wiki/Newick_format 6 | NEWICKS = [ 7 | "(,,(,));", # no nodes are named 8 | "(A,B,(C,D));", # leaf nodes are named 9 | "(A,B,(C,D)E)F;", # all nodes are named 10 | "(:0.1,:0.2,(:0.3,:0.4):0.5);", # all but root node have a distance to parent 11 | "(:0.1,:0.2,(:0.3,:0.4):0.5):0.0;", # all have a distance to parent 12 | "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);", # distances and leaf names (popular) 13 | "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;", # distances and all names 14 | "((B:0.2,(C:0.3,D:0.4)E:0.5)A:0.1)F;" # a tree rooted on a leaf node (rare) 15 | ] 16 | 17 | 18 | def test_read_newick(): 19 | for n in NEWICKS: 20 | print("Testing: {}".format(n)) 21 | n = read_newick(data=n) 22 | 23 | assert True -------------------------------------------------------------------------------- /examples/PF08793_seed.phylip: -------------------------------------------------------------------------------- 1 | 18 40 2 | seq-0 KCIAFDK--- -ND-KINPFT GRPINENNDT YRMIYSMCHG 3 | seq-1 ACALYYD--- -DP-TVNPFT DEPLRRYSPI DDLLYRNCES 4 | seq-2 YCTNFHR--- -DE-SRNPLT GKKLVPTSPI RKAWHKMCSG 5 | seq-3 LCAEYKR--- -SP-RYNPWT DRTLAPGSPK HNLISGMCGG 6 | seq-4 VCNDLALCSQ HTD-TYNPWT DRALLPDSPV HDMIDYVCNT 7 | seq-5 VCERFAA--- -DP-TRNPVT GSPLSRNDPL YTDLMEICKG 8 | seq-6 TCEAFCR--- -DP-TRNPVT GQKMRRNGIE YQMFAEECDC 9 | seq-7 KCDEWEKIRL NSS-PKNPFT KRNVKKDGPT YKKIDLICKH 10 | seq-8 KCYEWDIAKK KSPLPKSPLT GRKLKQHGPT WKKITAECAT 11 | seq-9 KCSKWHE--- -QP-LINPLT NRKIKKNGPT YKELERECGP 12 | seq-10 LCSKWKA--- -NP-LVNPAT GRKIKKDGPV YEKIQKKCS- 13 | seq-11 YCDEFER--- -NP-TRNPRT GRTIKRGGPV FRALERECSD 14 | seq-12 -CPEFAR--- -DP-TRNPRT GRTIKRGGPT YRALEAECAD 15 | seq-13 ECEQWLA--- -NK-GINPRT GKAIKIGGPT YKKLEMECKE 16 | seq-14 VCKKFLA--- -NK-TVSPYS GRPIKPGKKL YNDLEKHCSG 17 | seq-15 QCRAFEE--- -NP-DVNPNT GRRISPTGPI ASSMRRRCMN 18 | seq-16 KCNQLRN--- -NRYTVNPVS NRAIAPRGDT ANTLRRICEQ 19 | seq-17 QCETFKR--- -NKQAVSPLT NCPIDKFGRT AARFRKECD- 20 | -------------------------------------------------------------------------------- /phylopandas/tests/dat/PF08793_seed.phylip: -------------------------------------------------------------------------------- 1 | 18 40 2 | seq-0 KCIAFDK--- -ND-KINPFT GRPINENNDT YRMIYSMCHG 3 | seq-1 ACALYYD--- -DP-TVNPFT DEPLRRYSPI DDLLYRNCES 4 | seq-2 YCTNFHR--- -DE-SRNPLT GKKLVPTSPI RKAWHKMCSG 5 | seq-3 LCAEYKR--- -SP-RYNPWT DRTLAPGSPK HNLISGMCGG 6 | seq-4 VCNDLALCSQ HTD-TYNPWT DRALLPDSPV HDMIDYVCNT 7 | seq-5 VCERFAA--- -DP-TRNPVT GSPLSRNDPL YTDLMEICKG 8 | seq-6 TCEAFCR--- -DP-TRNPVT 
GQKMRRNGIE YQMFAEECDC 9 | seq-7 KCDEWEKIRL NSS-PKNPFT KRNVKKDGPT YKKIDLICKH 10 | seq-8 KCYEWDIAKK KSPLPKSPLT GRKLKQHGPT WKKITAECAT 11 | seq-9 KCSKWHE--- -QP-LINPLT NRKIKKNGPT YKELERECGP 12 | seq-10 LCSKWKA--- -NP-LVNPAT GRKIKKDGPV YEKIQKKCS- 13 | seq-11 YCDEFER--- -NP-TRNPRT GRTIKRGGPV FRALERECSD 14 | seq-12 -CPEFAR--- -DP-TRNPRT GRTIKRGGPT YRALEAECAD 15 | seq-13 ECEQWLA--- -NK-GINPRT GKAIKIGGPT YKKLEMECKE 16 | seq-14 VCKKFLA--- -NK-TVSPYS GRPIKPGKKL YNDLEKHCSG 17 | seq-15 QCRAFEE--- -NP-DVNPNT GRRISPTGPI ASSMRRRCMN 18 | seq-16 KCNQLRN--- -NRYTVNPVS NRAIAPRGDT ANTLRRICEQ 19 | seq-17 QCETFKR--- -NKQAVSPLT NCPIDKFGRT AARFRKECD- 20 | -------------------------------------------------------------------------------- /phylopandas/__init__.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | PhyloPandas 3 | =========== 4 | 5 | *Pandas DataFrames for phylogenetics.* 6 | 7 | PhyloPandas provides a Pandas-like interface for reading various sequence 8 | formats into DataFrames. This enables easy manipulation of phylogenetic data 9 | using familiar Python/Pandas functions. Finally, phylogenetics for humans! 10 | 11 | 12 | How does it work? 13 | ----------------- 14 | 15 | Don't worry, we didn't reinvent the wheel. PhyloPandas is simply a DataFrame 16 | (great for human-accessible data storage) interface on top of Biopython 17 | (great for parsing/writing sequence data). 18 | 19 | When you import PhyloPandas, you import Pandas with a PhyloPandas flavor. 20 | That means, the usual read_ functions are available ('read_csv', 21 | 'read_excel', etc.), but the returned DataFrame includes extra to_ methods 22 | (to_fasta, to_phylip, etc.) 23 | """ 24 | # Import new read functions 25 | from pandas import DataFrame 26 | 27 | # Register PhyloPandas Methods 28 | from .core import PhyloPandasDataFrameMethods as _PhyloPandasDataFrameMethods 29 | from .core import PhyloPandasSeriesMethods as _PhyloPandasSeriesMethods 30 | from .seqio import * 31 | from .treeio import * 32 | -------------------------------------------------------------------------------- /examples/PF08793_seed.fasta: -------------------------------------------------------------------------------- 1 | >Q0E553_SFAVA/184-218 2 | KCIAFDK----ND-KINPFTGRPINENNDTYRMIYSMCHG 3 | >Q8QUQ5_ISKNN/123-157 4 | ACALYYD----DP-TVNPFTDEPLRRYSPIDDLLYRNCES 5 | >Q0E553_SFAVA/142-176 6 | YCTNFHR----DE-SRNPLTGKKLVPTSPIRKAWHKMCSG 7 | >Q8QUQ5_ISKNN/45-79 8 | LCAEYKR----SP-RYNPWTDRTLAPGSPKHNLISGMCGG 9 | >Q8QUQ6_ISKNN/37-75 10 | VCNDLALCSQHTD-TYNPWTDRALLPDSPVHDMIDYVCNT 11 | >019R_FRG3G/249-283 12 | VCERFAA----DP-TRNPVTGSPLSRNDPLYTDLMEICKG 13 | >019R_FRG3G/302-336 14 | TCEAFCR----DP-TRNPVTGQKMRRNGIEYQMFAEECDC 15 | >VF380_IIV6/7-45 16 | KCDEWEKIRLNSS-PKNPFTKRNVKKDGPTYKKIDLICKH 17 | >VF380_IIV3/8-47 18 | KCYEWDIAKKKSPLPKSPLTGRKLKQHGPTWKKITAECAT 19 | >VF378_IIV6/4-38 20 | KCSKWHE----QP-LINPLTNRKIKKNGPTYKELERECGP 21 | >O41158_PBCV1/63-96 22 | LCSKWKA----NP-LVNPATGRKIKKDGPVYEKIQKKCS- 23 | >019R_FRG3G/5-39 24 | YCDEFER----NP-TRNPRTGRTIKRGGPVFRALERECSD 25 | >019R_FRG3G/139-172 26 | -CPEFAR----DP-TRNPRTGRTIKRGGPTYRALEAECAD 27 | >VF232_IIV6/64-98 28 | ECEQWLA----NK-GINPRTGKAIKIGGPTYKKLEMECKE 29 | >Q0E553_SFAVA/60-94 30 | VCKKFLA----NK-TVSPYSGRPIKPGKKLYNDLEKHCSG 31 | >Q8QUQ5_ISKNN/164-198 32 | QCRAFEE----NP-DVNPNTGRRISPTGPIASSMRRRCMN 33 | >Q8QUQ5_ISKNN/7-42 34 | KCNQLRN----NRYTVNPVSNRAIAPRGDTANTLRRICEQ 35 | >Q0E553_SFAVA/14-48 36 | QCETFKR----NKQAVSPLTNCPIDKFGRTAARFRKECD- 37 | 
-------------------------------------------------------------------------------- /phylopandas/tests/dat/PF08793_seed.fasta: -------------------------------------------------------------------------------- 1 | >seq-0 Q0E553_SFAVA/184-218 2 | KCIAFDK----ND-KINPFTGRPINENNDTYRMIYSMCHG 3 | >seq-1 Q8QUQ5_ISKNN/123-157 4 | ACALYYD----DP-TVNPFTDEPLRRYSPIDDLLYRNCES 5 | >seq-2 Q0E553_SFAVA/142-176 6 | YCTNFHR----DE-SRNPLTGKKLVPTSPIRKAWHKMCSG 7 | >seq-3 Q8QUQ5_ISKNN/45-79 8 | LCAEYKR----SP-RYNPWTDRTLAPGSPKHNLISGMCGG 9 | >seq-4 Q8QUQ6_ISKNN/37-75 10 | VCNDLALCSQHTD-TYNPWTDRALLPDSPVHDMIDYVCNT 11 | >seq-5 019R_FRG3G/249-283 12 | VCERFAA----DP-TRNPVTGSPLSRNDPLYTDLMEICKG 13 | >seq-6 019R_FRG3G/302-336 14 | TCEAFCR----DP-TRNPVTGQKMRRNGIEYQMFAEECDC 15 | >seq-7 VF380_IIV6/7-45 16 | KCDEWEKIRLNSS-PKNPFTKRNVKKDGPTYKKIDLICKH 17 | >seq-8 VF380_IIV3/8-47 18 | KCYEWDIAKKKSPLPKSPLTGRKLKQHGPTWKKITAECAT 19 | >seq-9 VF378_IIV6/4-38 20 | KCSKWHE----QP-LINPLTNRKIKKNGPTYKELERECGP 21 | >seq-10 O41158_PBCV1/63-96 22 | LCSKWKA----NP-LVNPATGRKIKKDGPVYEKIQKKCS- 23 | >seq-11 019R_FRG3G/5-39 24 | YCDEFER----NP-TRNPRTGRTIKRGGPVFRALERECSD 25 | >seq-12 019R_FRG3G/139-172 26 | -CPEFAR----DP-TRNPRTGRTIKRGGPTYRALEAECAD 27 | >seq-13 VF232_IIV6/64-98 28 | ECEQWLA----NK-GINPRTGKAIKIGGPTYKKLEMECKE 29 | >seq-14 Q0E553_SFAVA/60-94 30 | VCKKFLA----NK-TVSPYSGRPIKPGKKLYNDLEKHCSG 31 | >seq-15 Q8QUQ5_ISKNN/164-198 32 | QCRAFEE----NP-DVNPNTGRRISPTGPIASSMRRRCMN 33 | >seq-16 Q8QUQ5_ISKNN/7-42 34 | KCNQLRN----NRYTVNPVSNRAIAPRGDTANTLRRICEQ 35 | >seq-17 Q0E553_SFAVA/14-48 36 | QCETFKR----NKQAVSPLTNCPIDKFGRTAARFRKECD- 37 | -------------------------------------------------------------------------------- /docs/_pages/dataframe.rst: -------------------------------------------------------------------------------- 1 | The PhyloPandas DataFrame 2 | ========================= 3 | 4 | The phylopandas dataframe is the core datastructure in this package. It defines 5 | a set of columns (or grammar) for phylogenetic data. A few advantages of 6 | defining such a grammar is: 1) we can leverage powerful+interactive 7 | visualization tools like Vega and 2) we standardize phylogenetic data in a 8 | familiar format. 9 | 10 | Columns of a Phylopandas DataFrame 11 | ---------------------------------- 12 | 13 | When reading sequence data, the following information will be stored on the dataframe. 14 | 15 | 1. ``sequence`` : DNA or protein sequence. 16 | 2. ``id``: user defined label or identifier. 17 | 3. ``description``: user defined description. 18 | 19 | When reading tree data, the following information will be stored on the dataframe. 20 | 21 | 1. ``type`` : label describing the type of node; either "leaf" or "node". 22 | 2. ``parent`` : label of parent node. 23 | 3. ``branch_length`` : distance from parent node. 24 | 25 | PhyloPandas indexes each sequence using a randomly generated 10 character key. 26 | 27 | If reading tree data from a PhyloPandas DataFrame containing sequence data, the 28 | two dataframes will be merged on the randomly generated index (unless otherwise specified). 29 | 30 | If reading sequence data from a PhyloPandas DataFrae containing tree data, the two dataframes will be merged on the randomly generated index (unless otherwise specified). 
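A minimal sketch of what these columns look like in practice, using the example
alignment shipped in this repository's ``examples`` folder (adjust the path to
point at your own data):

.. code-block:: python

    import phylopandas as ph

    # Each sequence record becomes one row.
    df = ph.read_fasta('examples/PF08793_seed.fasta')

    # Sequence columns described above.
    print(df[['id', 'sequence', 'description']].head())

    # The randomly generated 10-character key used when merging
    # sequence and tree DataFrames.
    print(df['uid'].head())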
31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Zachary Sailer 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | -------------------------------------------------------------------------------- /examples/PF08793_seed.clustal: -------------------------------------------------------------------------------- 1 | CLUSTAL X (1.81) multiple sequence alignment 2 | 3 | 4 | seq-0 KCIAFDK----ND-KINPFTGRPINENNDTYRMIYSMCHG 5 | seq-1 ACALYYD----DP-TVNPFTDEPLRRYSPIDDLLYRNCES 6 | seq-2 YCTNFHR----DE-SRNPLTGKKLVPTSPIRKAWHKMCSG 7 | seq-3 LCAEYKR----SP-RYNPWTDRTLAPGSPKHNLISGMCGG 8 | seq-4 VCNDLALCSQHTD-TYNPWTDRALLPDSPVHDMIDYVCNT 9 | seq-5 VCERFAA----DP-TRNPVTGSPLSRNDPLYTDLMEICKG 10 | seq-6 TCEAFCR----DP-TRNPVTGQKMRRNGIEYQMFAEECDC 11 | seq-7 KCDEWEKIRLNSS-PKNPFTKRNVKKDGPTYKKIDLICKH 12 | seq-8 KCYEWDIAKKKSPLPKSPLTGRKLKQHGPTWKKITAECAT 13 | seq-9 KCSKWHE----QP-LINPLTNRKIKKNGPTYKELERECGP 14 | seq-10 LCSKWKA----NP-LVNPATGRKIKKDGPVYEKIQKKCS- 15 | seq-11 YCDEFER----NP-TRNPRTGRTIKRGGPVFRALERECSD 16 | seq-12 -CPEFAR----DP-TRNPRTGRTIKRGGPTYRALEAECAD 17 | seq-13 ECEQWLA----NK-GINPRTGKAIKIGGPTYKKLEMECKE 18 | seq-14 VCKKFLA----NK-TVSPYSGRPIKPGKKLYNDLEKHCSG 19 | seq-15 QCRAFEE----NP-DVNPNTGRRISPTGPIASSMRRRCMN 20 | seq-16 KCNQLRN----NRYTVNPVSNRAIAPRGDTANTLRRICEQ 21 | seq-17 QCETFKR----NKQAVSPLTNCPIDKFGRTAARFRKECD- 22 | 23 | 24 | -------------------------------------------------------------------------------- /phylopandas/tests/dat/PF08793_seed.clustal: -------------------------------------------------------------------------------- 1 | CLUSTAL X (1.81) multiple sequence alignment 2 | 3 | 4 | seq-0 KCIAFDK----ND-KINPFTGRPINENNDTYRMIYSMCHG 5 | seq-1 ACALYYD----DP-TVNPFTDEPLRRYSPIDDLLYRNCES 6 | seq-2 YCTNFHR----DE-SRNPLTGKKLVPTSPIRKAWHKMCSG 7 | seq-3 LCAEYKR----SP-RYNPWTDRTLAPGSPKHNLISGMCGG 8 | seq-4 VCNDLALCSQHTD-TYNPWTDRALLPDSPVHDMIDYVCNT 9 | seq-5 VCERFAA----DP-TRNPVTGSPLSRNDPLYTDLMEICKG 10 | seq-6 TCEAFCR----DP-TRNPVTGQKMRRNGIEYQMFAEECDC 11 | seq-7 
KCDEWEKIRLNSS-PKNPFTKRNVKKDGPTYKKIDLICKH 12 | seq-8 KCYEWDIAKKKSPLPKSPLTGRKLKQHGPTWKKITAECAT 13 | seq-9 KCSKWHE----QP-LINPLTNRKIKKNGPTYKELERECGP 14 | seq-10 LCSKWKA----NP-LVNPATGRKIKKDGPVYEKIQKKCS- 15 | seq-11 YCDEFER----NP-TRNPRTGRTIKRGGPVFRALERECSD 16 | seq-12 -CPEFAR----DP-TRNPRTGRTIKRGGPTYRALEAECAD 17 | seq-13 ECEQWLA----NK-GINPRTGKAIKIGGPTYKKLEMECKE 18 | seq-14 VCKKFLA----NK-TVSPYSGRPIKPGKKLYNDLEKHCSG 19 | seq-15 QCRAFEE----NP-DVNPNTGRRISPTGPIASSMRRRCMN 20 | seq-16 KCNQLRN----NRYTVNPVSNRAIAPRGDTANTLRRICEQ 21 | seq-17 QCETFKR----NKQAVSPLTNCPIDKFGRTAARFRKECD- 22 | 23 | 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. phylopandas documentation master file, created by 2 | sphinx-quickstart on Mon Oct 30 16:22:28 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | .. image:: _logo/banner.png 8 | 9 | 10 | *Bringing the Pandas DataFrame to phylogenetics.* 11 | 12 | PhyloPandas provides a Pandas-like interface for reading various sequence formats into DataFrames. This enables easy manipulation of phylogenetic data using familiar Python/Pandas functions. Finally, phylogenetics for humans! 13 | 14 | .. image:: _images/jlab.png 15 | :align: center 16 | 17 | | 18 | 19 | How does it work? 20 | ----------------- 21 | 22 | Don't worry, we didn't reinvent the wheel. **PhyloPandas** is simply a DataFrame_ 23 | (great for human-accessible data storage) interface on top of Biopython_ (great for parsing/writing sequence data). 24 | 25 | .. 
_DataFrame: https://github.com/pandas-dev/pandas 26 | .. _Biopython: https://github.com/biopython/biopython 27 | 28 | Basic Usage 29 | ~~~~~~~~~~~ 30 | 31 | Read sequence file into DataFrame. 32 | 33 | .. code-block:: python 34 | 35 | import phylopandas as ph 36 | 37 | df1 = ph.read_fasta('sequences.fasta') 38 | 39 | Write ``phylopandas.DataFrame`` data to sequence file. 40 | 41 | .. code-block:: python 42 | 43 | df1.to_clustal('sequences.clustal') 44 | 45 | Convert between two sequence formats. 46 | 47 | .. code-block:: python 48 | 49 | # Read from fasta. 50 | df = phypd.read_fasta('sequences.fasta') 51 | 52 | # Write to phylip. 53 | df.to_phylip('sequences.phy') 54 | 55 | See the Cookbook_ page for more things you can do. 56 | 57 | .. _Cookbook: _pages/cookbook.html 58 | 59 | Contributing 60 | ~~~~~~~~~~~~ 61 | 62 | It's *easy* to create new read/write functions and methods for PhyloPandas. If you 63 | have a format you'd like to add, please submit PRs! There are many more formats 64 | in Biopython that I haven't had the time to add myself, so please don't be afraid 65 | to add then yourself! I thank you ahead of time! 66 | 67 | 68 | Table of Contents 69 | ~~~~~~~~~~~~~~~~~ 70 | 71 | .. toctree:: 72 | :maxdepth: 1 73 | 74 | _pages/dataframe 75 | _pages/cookbook 76 | 77 | 78 | Indices and tables 79 | ~~~~~~~~~~~~~~~~~~ 80 | 81 | * :ref:`genindex` 82 | * :ref:`modindex` 83 | * :ref:`search` 84 | -------------------------------------------------------------------------------- /phylopandas/tests/test_frame.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import phylopandas as ph 4 | from . import path_to_dat, clean_dat 5 | 6 | 7 | def test_read_fasta(path_to_dat): 8 | # Get path 9 | path = os.path.join(path_to_dat, 'PF08793_seed.fasta') 10 | df = ph.read_fasta(path) 11 | 12 | # Tests 13 | keys = df.keys() 14 | assert type(df) == ph.DataFrame 15 | assert 'id' in keys 16 | assert 'sequence' in keys 17 | assert 'description' in keys 18 | 19 | 20 | def test_read_clustal(path_to_dat): 21 | # Get path 22 | path = os.path.join(path_to_dat, 'PF08793_seed.clustal') 23 | df = ph.read_clustal(path) 24 | 25 | # Tests 26 | keys = df.keys() 27 | assert type(df) == ph.DataFrame 28 | assert 'id' in keys 29 | assert 'sequence' in keys 30 | assert 'description' in keys 31 | 32 | 33 | def test_read_phylip(path_to_dat): 34 | # Get path 35 | path = os.path.join(path_to_dat, 'PF08793_seed.phylip') 36 | df = ph.read_phylip(path) 37 | 38 | # Tests 39 | keys = df.keys() 40 | assert type(df) == ph.DataFrame 41 | assert 'id' in keys 42 | assert 'sequence' in keys 43 | assert 'description' in keys 44 | 45 | 46 | def test_read_phylip_relaxed(path_to_dat): 47 | # Get path 48 | path = os.path.join(path_to_dat, 'PF08793_seed.phylip') 49 | df = ph.read_phylip_relaxed(path) 50 | 51 | # Tests 52 | keys = df.keys() 53 | assert type(df) == ph.DataFrame 54 | assert 'id' in keys 55 | assert 'sequence' in keys 56 | assert 'description' in keys 57 | 58 | 59 | def test_read_phylip_sequential(path_to_dat): 60 | # Get path 61 | path = os.path.join(path_to_dat, 'PF08793_seed.phylip') 62 | df = ph.read_phylip_sequential(path) 63 | 64 | # Tests 65 | keys = df.keys() 66 | assert type(df) == ph.DataFrame 67 | assert 'id' in keys 68 | assert 'sequence' in keys 69 | assert 'description' in keys 70 | 71 | 72 | class Testframe(object): 73 | 74 | @pytest.mark.usefixtures("clean_dat") 75 | def test_to_fasta(self, path_to_dat): 76 | path = os.path.join(path_to_dat, 
'PF08793_seed.fasta') 77 | df = ph.read_fasta(path) 78 | print(df.phylo) 79 | # Write to fasta 80 | fasta_path = os.path.join(path_to_dat, 'test.fasta') 81 | df.phylo.to_fasta(fasta_path) 82 | assert os.path.exists(fasta_path) 83 | 84 | @pytest.mark.usefixtures("clean_dat") 85 | def test_to_phylip(self, path_to_dat): 86 | path = os.path.join(path_to_dat, 'PF08793_seed.fasta') 87 | df = ph.read_fasta(path) 88 | 89 | # Write to csv 90 | phylip_path = os.path.join(path_to_dat, 'test.phylip') 91 | df.phylo.to_phylip(phylip_path) 92 | assert os.path.exists(phylip_path) 93 | 94 | @pytest.mark.usefixtures("clean_dat") 95 | def test_to_embl(self, path_to_dat): 96 | path = os.path.join(path_to_dat, 'PF08793_seed.fasta') 97 | df = ph.read_fasta(path) 98 | 99 | # Write to csv 100 | embl_path = os.path.join(path_to_dat, 'test.embl') 101 | df.phylo.to_embl(alphabet='protein', filename=embl_path) 102 | assert os.path.exists(embl_path) 103 | 104 | @pytest.mark.usefixtures("clean_dat") 105 | def test_to_nexus_seq(self, path_to_dat): 106 | path = os.path.join(path_to_dat, 'PF08793_seed.fasta') 107 | df = ph.read_fasta(path) 108 | 109 | # Write to csv 110 | nexus_path = os.path.join(path_to_dat, 'test.nexus') 111 | df.phylo.to_nexus_seq(alphabet='protein', filename=nexus_path) 112 | assert os.path.exists(nexus_path) 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/phylopandas/Lobby) 4 | [![Documentation Status](http://readthedocs.org/projects/phylopandas/badge/?version=latest)](http://phylopandas.readthedocs.io/en/latest/?badge=latest) 5 | [![Build Status](https://travis-ci.org/Zsailer/phylopandas.svg?branch=master)](https://travis-ci.org/Zsailer/phylopandas) 6 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Zsailer/phylopandas/master?filepath=examples%2Fintro-notebook.ipynb) 7 | 8 | **Bringing the [Pandas](https://github.com/pandas-dev/pandas) `DataFrame` to phylogenetics.** 9 | 10 | 11 | PhyloPandas provides a Pandas-like interface for reading sequence and phylogenetic tree data into pandas DataFrames. This enables easy manipulation of phylogenetic data using familiar Python/Pandas functions. Finally, phylogenetics for humans! 12 | 13 | 14 | 15 | ## How does it work? 16 | 17 | Don't worry, we didn't reinvent the wheel. **PhyloPandas** is simply a [DataFrame](https://github.com/pandas-dev/pandas) 18 | (great for human-accessible data storage) interface on top of [Biopython](https://github.com/biopython/biopython) (great for parsing/writing sequence data) and [DendroPy](https://github.com/jeetsukumaran/DendroPy) (great for reading tree data). 19 | 20 | PhyloPandas does two things: 21 | 1. It offers new `read` functions to read sequence/tree data directly into a DataFrame. 22 | 2. It attaches a new `phylo` **accessor** to the Pandas DataFrame. This accessor provides writing methods for sequencing/tree data (powered by Biopython and dendropy). 23 | 24 | ## Basic Usage 25 | 26 | **Sequence data:** 27 | 28 | Read in a sequence file. 29 | ```python 30 | import phylopandas as ph 31 | 32 | df1 = ph.read_fasta('sequences.fasta') 33 | df2 = ph.read_phylip('sequences.phy') 34 | ``` 35 | 36 | Write to various sequence file formats. 37 | 38 | ```python 39 | df1.phylo.to_clustal('sequences.clustal') 40 | ``` 41 | 42 | Convert between formats. 
43 | 44 | ```python 45 | # Read a format. 46 | df = ph.read_fasta('sequences.fasta') 47 | 48 | # Write to a different format. 49 | df.phylo.to_phylip('sequences.phy') 50 | ``` 51 | 52 | **Tree data:** 53 | 54 | Read newick tree data 55 | ```python 56 | df = ph.read_newick('tree.newick') 57 | ``` 58 | 59 | Visualize the phylogenetic data (powered by [phylovega](https://github.com/Zsailer/phylovega)). 60 | ```python 61 | df.phylo.display( 62 | height=500, 63 | ) 64 | ``` 65 | 66 | 67 | 68 | ## Contributing 69 | 70 | If you have ideas for the project, please share them on the project's [Gitter chat](https://gitter.im/phylopandas/Lobby). 71 | 72 | It's *easy* to create new read/write functions and methods for PhyloPandas. If you 73 | have a format you'd like to add, please submit PRs! There are many more formats 74 | in Biopython that I haven't had the time to add myself, so please don't be afraid 75 | to add them! I thank you ahead of time! 76 | 77 | ## Testing 78 | 79 | PhyloPandas includes a small [pytest](https://docs.pytest.org/en/latest/) suite. Run these tests from base directory. 80 | ``` 81 | $ cd phylopandas 82 | $ pytest 83 | ``` 84 | 85 | ## Install 86 | 87 | Install from PyPI: 88 | ``` 89 | pip install phylopandas 90 | ``` 91 | 92 | Install from source: 93 | ``` 94 | git clone https://github.com/Zsailer/phylopandas 95 | cd phylopandas 96 | pip install -e . 97 | ``` 98 | 99 | ## Dependencies 100 | 101 | - [BioPython](https://github.com/biopython/biopython): Library for managing and manipulating biological data. 102 | - [DendroPy](https://github.com/jeetsukumaran/DendroPy): Library for phylogenetic scripting, simulation, data processing and manipulation 103 | - [Pandas](https://github.com/pandas-dev/pandas): Flexible and powerful data analysis / manipulation library for Python 104 | - [pandas_flavor](https://github.com/Zsailer/pandas_flavor): Flavor pandas objects with new accessors using pandas' new register API (with backwards compatibility). 105 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Note: To use the 'upload' functionality of this file, you must: 5 | # $ pip install twine 6 | 7 | import io 8 | import os 9 | import sys 10 | from shutil import rmtree 11 | 12 | from setuptools import find_packages, setup, Command 13 | 14 | # Package meta-data. 15 | NAME = 'phylopandas' 16 | DESCRIPTION = 'Pandas for phylogenetics' 17 | URL = 'https://github.com/Zsailer/phylopandas' 18 | EMAIL = 'zachsailer@gmail.com' 19 | AUTHOR = 'Zachary Sailer' 20 | REQUIRES_PYTHON = '>=3.0' 21 | VERSION = None 22 | 23 | # What packages are required for this module to be executed? 24 | REQUIRED = ["pandas>=0.22.0", 25 | "pandas_flavor>=0.1.0", 26 | "biopython", 27 | "dendropy"] 28 | 29 | # What packages are optional? 30 | EXTRAS = {} 31 | 32 | # The rest you shouldn't have to touch too much :) 33 | # ------------------------------------------------ 34 | # Except, perhaps the License and Trove Classifiers! 35 | # If you do change the License, remember to change the Trove Classifier for that! 36 | 37 | here = os.path.abspath(os.path.dirname(__file__)) 38 | 39 | # Import the README and use it as the long-description. 40 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 
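# If the README cannot be found (for example, in a source distribution that
# omits it), the except clause below falls back to the short DESCRIPTION.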
41 | try: 42 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 43 | long_description = '\n' + f.read() 44 | except FileNotFoundError: 45 | long_description = DESCRIPTION 46 | 47 | # Load the package's __version__.py module as a dictionary. 48 | about = {} 49 | if not VERSION: 50 | with open(os.path.join(here, NAME, '__version__.py')) as f: 51 | exec(f.read(), about) 52 | else: 53 | about['__version__'] = VERSION 54 | 55 | 56 | class UploadCommand(Command): 57 | """Support setup.py upload.""" 58 | 59 | description = 'Build and publish the package.' 60 | user_options = [] 61 | 62 | @staticmethod 63 | def status(s): 64 | """Prints things in bold.""" 65 | print('\033[1m{0}\033[0m'.format(s)) 66 | 67 | def initialize_options(self): 68 | pass 69 | 70 | def finalize_options(self): 71 | pass 72 | 73 | def run(self): 74 | try: 75 | self.status('Removing previous builds…') 76 | rmtree(os.path.join(here, 'dist')) 77 | except OSError: 78 | pass 79 | 80 | self.status('Building Source and Wheel (universal) distribution…') 81 | os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) 82 | 83 | self.status('Uploading the package to PyPI via Twine…') 84 | os.system('twine upload dist/*') 85 | 86 | self.status('Pushing git tags…') 87 | os.system('git tag v{0}'.format(about['__version__'])) 88 | os.system('git push --tags') 89 | 90 | sys.exit() 91 | 92 | # Where the magic happens: 93 | setup( 94 | name=NAME, 95 | version=about['__version__'], 96 | description=DESCRIPTION, 97 | long_description=long_description, 98 | long_description_content_type='text/markdown', 99 | author=AUTHOR, 100 | author_email=EMAIL, 101 | python_requires=REQUIRES_PYTHON, 102 | url=URL, 103 | packages=find_packages(exclude=('tests',)), 104 | # If your package is a single module, use this instead of 'packages': 105 | # py_modules=['mypackage'], 106 | 107 | # entry_points={ 108 | # 'console_scripts': ['mycli=mymodule:cli'], 109 | # }, 110 | install_requires=REQUIRED, 111 | extras_require=EXTRAS, 112 | include_package_data=True, 113 | license='MIT', 114 | classifiers=[ 115 | # Trove classifiers 116 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 117 | 'License :: OSI Approved :: MIT License', 118 | 'Programming Language :: Python', 119 | 'Programming Language :: Python :: 3', 120 | 'Programming Language :: Python :: 3.6', 121 | 'Programming Language :: Python :: Implementation :: CPython', 122 | 'Programming Language :: Python :: Implementation :: PyPy' 123 | ], 124 | # $ setup.py publish support. 125 | cmdclass={ 126 | 'upload': UploadCommand, 127 | }, 128 | ) 129 | -------------------------------------------------------------------------------- /phylopandas/seqio/read.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | Functions for reading sequence files into pandas DataFrame. 3 | """ 4 | 5 | # Imports 6 | from Bio import SeqIO 7 | from Bio.Seq import Seq 8 | from Bio.SeqRecord import SeqRecord 9 | from Bio.Blast import NCBIXML 10 | import Bio.Alphabet 11 | 12 | # Import Phylopandas DataFrame 13 | import pandas as pd 14 | from ..utils import get_random_id 15 | 16 | 17 | def _read_doc_template(schema): 18 | s = """Read a {} file. 19 | 20 | Construct a PhyloPandas DataFrame with columns: 21 | - name 22 | - id 23 | - description 24 | - sequence 25 | 26 | Parameters 27 | ---------- 28 | filename : str 29 | File name of {} file. 30 | 31 | seq_label : str (default='sequence') 32 | Sequence column name in DataFrame. 
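    alphabet : str (default=None)
        Optional sequence alphabet passed through to Biopython; one of
        'dna', 'rna', 'nucleotide', or 'protein'.

    use_uids : bool (default=True)
        If True, a random 10-character 'uid' is generated for each record.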
33 | """.format(schema, schema, schema) 34 | return s 35 | 36 | 37 | def _read( 38 | filename, 39 | schema, 40 | seq_label='sequence', 41 | alphabet=None, 42 | use_uids=True, 43 | **kwargs): 44 | """Use BioPython's sequence parsing module to convert any file format to 45 | a Pandas DataFrame. 46 | 47 | The resulting DataFrame has the following columns: 48 | - name 49 | - id 50 | - description 51 | - sequence 52 | """ 53 | # Check Alphabet if given 54 | if alphabet is None: 55 | alphabet = Bio.Alphabet.Alphabet() 56 | 57 | elif alphabet in ['dna', 'rna', 'protein', 'nucleotide']: 58 | alphabet = getattr(Bio.Alphabet, 'generic_{}'.format(alphabet)) 59 | 60 | else: 61 | raise Exception( 62 | "The alphabet is not recognized. Must be 'dna', 'rna', " 63 | "'nucleotide', or 'protein'.") 64 | 65 | kwargs.update(alphabet=alphabet) 66 | 67 | # Prepare DataFrame fields. 68 | data = { 69 | 'id': [], 70 | seq_label: [], 71 | 'description': [], 72 | 'label': [] 73 | } 74 | if use_uids: 75 | data['uid'] = [] 76 | 77 | # Parse Fasta file. 78 | for i, s in enumerate(SeqIO.parse(filename, format=schema, **kwargs)): 79 | data['id'].append(s.id) 80 | data[seq_label].append(str(s.seq)) 81 | data['description'].append(s.description) 82 | data['label'].append(s.name) 83 | 84 | if use_uids: 85 | data['uid'].append(get_random_id(10)) 86 | 87 | # Port to DataFrame. 88 | return pd.DataFrame(data) 89 | 90 | 91 | def _read_method(schema): 92 | """Add a write method for named schema to a class. 93 | """ 94 | def func( 95 | self, 96 | filename, 97 | seq_label='sequence', 98 | alphabet=None, 99 | combine_on='uid', 100 | use_uids=True, 101 | **kwargs): 102 | # Use generic write class to write data. 103 | df0 = self._data 104 | df1 = _read( 105 | filename=filename, 106 | schema=schema, 107 | seq_label=seq_label, 108 | alphabet=alphabet, 109 | use_uids=use_uids, 110 | **kwargs 111 | ) 112 | return df0.phylo.combine(df1, on=combine_on) 113 | 114 | # Update docs 115 | func.__doc__ = _read_doc_template(schema) 116 | return func 117 | 118 | 119 | def _read_function(schema): 120 | """Add a write method for named schema to a class. 121 | """ 122 | def func( 123 | filename, 124 | seq_label='sequence', 125 | alphabet=None, 126 | use_uids=True, 127 | **kwargs): 128 | # Use generic write class to write data. 129 | return _read( 130 | filename=filename, 131 | schema=schema, 132 | seq_label=seq_label, 133 | alphabet=alphabet, 134 | use_uids=use_uids, 135 | **kwargs 136 | ) 137 | # Update docs 138 | func.__doc__ = _read_doc_template(schema) 139 | return func 140 | 141 | 142 | # Various read functions to various formats. 143 | read_fasta = _read_function('fasta') 144 | read_phylip = _read_function('phylip') 145 | read_clustal = _read_function('clustal') 146 | read_embl = _read_function('embl') 147 | read_nexus_seq = _read_function('nexus') 148 | read_swiss = _read_function('swiss') 149 | read_fastq = _read_function('fastq') 150 | read_phylip_sequential = _read_function('phylip-sequential') 151 | read_phylip_relaxed = _read_function('phylip-relaxed') 152 | 153 | 154 | def read_blast_xml(filename, **kwargs): 155 | """Read BLAST XML format.""" 156 | # Read file. 157 | with open(filename, 'r') as f: 158 | blast_record = NCBIXML.read(f) 159 | 160 | # Prepare DataFrame fields. 
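    # Each key below becomes a column in the resulting DataFrame; the values
    # are taken from the first high-scoring pair (hsps[0]) of each alignment.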
161 | data = {'accession': [], 162 | 'hit_def': [], 163 | 'hit_id': [], 164 | 'title': [], 165 | 'length': [], 166 | 'e_value': [], 167 | 'sequence': [], 168 | 'subject_start': [], 169 | 'subject_end':[], 170 | 'query_start':[], 171 | 'query_end':[], 172 | 'uid':[]} 173 | 174 | # Get alignments from blast result. 175 | for i, s in enumerate(blast_record.alignments): 176 | data['accession'].append(s.accession) 177 | data['hit_def'].append(s.hit_def) 178 | data['hit_id'].append(s.hit_id) 179 | data['title'].append(s.title) 180 | data['length'].append(s.length) 181 | data['e_value'].append(s.hsps[0].expect) 182 | data['sequence'].append(s.hsps[0].sbjct) 183 | data['subject_start'].append(s.hsps[0].sbjct_start) 184 | data['subject_end'].append(s.hsps[0].sbjct_end) 185 | data['query_start'].append(s.hsps[0].query_start) 186 | data['query_end'].append(s.hsps[0].query_end) 187 | data['uid'].append(get_random_id(10)) 188 | 189 | # Port to DataFrame. 190 | return pd.DataFrame(data) 191 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # phylopandas documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Oct 30 16:22:28 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.mathjax'] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 40 | # You can specify multiple suffix as a list of string: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'phylopandas' 50 | copyright = '2017, Zach Sailer' 51 | author = 'Zach Sailer' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '0.1.2' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '0.1.2' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 
66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'alabaster' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | # Custom sidebar templates, must be a dictionary that maps document names 100 | # to template names. 101 | # 102 | # This is required for the alabaster theme 103 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 104 | html_sidebars = { 105 | '**': [ 106 | 'relations.html', # needs 'show_related': True theme option to display 107 | 'searchbox.html', 108 | ] 109 | } 110 | 111 | 112 | # -- Options for HTMLHelp output ------------------------------------------ 113 | 114 | # Output file base name for HTML help builder. 115 | htmlhelp_basename = 'phylopandasdoc' 116 | 117 | 118 | # -- Options for LaTeX output --------------------------------------------- 119 | 120 | latex_elements = { 121 | # The paper size ('letterpaper' or 'a4paper'). 122 | # 123 | # 'papersize': 'letterpaper', 124 | 125 | # The font size ('10pt', '11pt' or '12pt'). 126 | # 127 | # 'pointsize': '10pt', 128 | 129 | # Additional stuff for the LaTeX preamble. 130 | # 131 | # 'preamble': '', 132 | 133 | # Latex figure (float) alignment 134 | # 135 | # 'figure_align': 'htbp', 136 | } 137 | 138 | # Grouping the document tree into LaTeX files. List of tuples 139 | # (source start file, target name, title, 140 | # author, documentclass [howto, manual, or own class]). 141 | latex_documents = [ 142 | (master_doc, 'phylopandas.tex', 'phylopandas Documentation', 143 | 'Zach Sailer', 'manual'), 144 | ] 145 | 146 | 147 | # -- Options for manual page output --------------------------------------- 148 | 149 | # One entry per manual page. List of tuples 150 | # (source start file, name, description, authors, manual section). 151 | man_pages = [ 152 | (master_doc, 'phylopandas', 'phylopandas Documentation', 153 | [author], 1) 154 | ] 155 | 156 | 157 | # -- Options for Texinfo output ------------------------------------------- 158 | 159 | # Grouping the document tree into Texinfo files. 
List of tuples 160 | # (source start file, target name, title, author, 161 | # dir menu entry, description, category) 162 | texinfo_documents = [ 163 | (master_doc, 'phylopandas', 'phylopandas Documentation', 164 | author, 'phylopandas', 'One line description of project.', 165 | 'Miscellaneous'), 166 | ] 167 | -------------------------------------------------------------------------------- /phylopandas/treeio/read.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import dendropy 3 | from ..utils import get_random_id 4 | 5 | def _read_doc_template(schema): 6 | doc = """ 7 | Read a {} tree into a phylopandas.DataFrame. 8 | 9 | The resulting DataFrame has the following columns: 10 | - name: label for each taxa or node. 11 | - id: unique id (created by phylopandas) given to each node. 12 | - type: type of node (leaf, internal, or root). 13 | - parent: parent id. necessary for constructing trees. 14 | - length: length of branch from parent to node. 15 | - distance: distance from root. 16 | 17 | Parameters 18 | ---------- 19 | filename: str (default is None) 20 | {} file to read into DataFrame. 21 | 22 | data: str (default is None) 23 | {} string to parse and read into DataFrame. 24 | 25 | add_node_labels: bool 26 | If true, labels the internal nodes with numbers. 27 | 28 | Returns 29 | ------- 30 | df: phylopandas.DataFrame 31 | """.format(schema, schema, schema) 32 | return doc 33 | 34 | 35 | def _dendropy_to_dataframe( 36 | tree, 37 | add_node_labels=True, 38 | use_uids=True): 39 | """Convert Dendropy tree to Pandas dataframe.""" 40 | # Initialize the data object. 41 | idx = [] 42 | data = { 43 | 'type': [], 44 | 'id': [], 45 | 'parent': [], 46 | 'length': [], 47 | 'label': [], 48 | 'distance': []} 49 | 50 | if use_uids: 51 | data['uid'] = [] 52 | 53 | # Add labels to internal nodes if set to true. 54 | if add_node_labels: 55 | for i, node in enumerate(tree.internal_nodes()): 56 | node.label = str(i) 57 | 58 | # Check is branch lengths were given. 59 | branch_lengths_given = tree.length() > 0 60 | 61 | for node in tree.nodes(): 62 | # Get node type 63 | if node.is_leaf(): 64 | type_ = 'leaf' 65 | # Check if node has taxon 66 | if hasattr(node.taxon, 'label'): 67 | label = str(node.taxon.label).replace(' ', '_') 68 | else: 69 | label = None 70 | elif node.is_internal(): 71 | type_ = 'node' 72 | label = str(node.label) 73 | 74 | # Set node label and parent. 75 | id_ = label 76 | parent_node = node.parent_node 77 | length = node.edge_length 78 | if length is None: 79 | distance = None 80 | else: 81 | distance = node.distance_from_root() 82 | 83 | # Is this node a root? 84 | if parent_node is None: 85 | parent_label = None 86 | parent_node = None 87 | if length is None and branch_lengths_given: 88 | length = 0 89 | distance = 0 90 | type_ = 'root' 91 | 92 | # Set parent node label 93 | elif parent_node.is_internal(): 94 | parent_label = str(parent_node.label) 95 | 96 | else: 97 | raise Exception("Subtree is not attached to tree?") 98 | 99 | # Add this node to the data. 100 | data['type'].append(type_) 101 | data['id'].append(id_) 102 | data['parent'].append(parent_label) 103 | data['length'].append(length) 104 | data['label'].append(label) 105 | data['distance'].append(distance) 106 | 107 | if use_uids: 108 | data['uid'].append(get_random_id(10)) 109 | 110 | # Construct dataframe. 
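    # One row per node; the 'id' and 'parent' columns preserve the topology
    # so the tree can be rebuilt from the DataFrame later.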
111 | df = pandas.DataFrame(data) 112 | return df 113 | 114 | 115 | def _read( 116 | filename=None, 117 | data=None, 118 | schema=None, 119 | add_node_labels=True, 120 | use_uids=True 121 | ): 122 | """Read a phylogenetic tree into a phylopandas.DataFrame. 123 | 124 | The resulting DataFrame has the following columns: 125 | - name: label for each taxa or node. 126 | - id: unique id (created by phylopandas) given to each node. 127 | - type: type of node (leaf, internal, or root). 128 | - parent: parent id. necessary for constructing trees. 129 | - length: length of branch from parent to node. 130 | - distance: distance from root. 131 | 132 | Parameters 133 | ---------- 134 | filename: str (default is None) 135 | newick file to read into DataFrame. 136 | 137 | data: str (default is None) 138 | newick string to parse and read into DataFrame. 139 | 140 | add_node_labels: bool 141 | If true, labels the internal nodes with numbers. 142 | 143 | Returns 144 | ------- 145 | df: phylopandas.DataFrame. 146 | """ 147 | if filename is not None: 148 | # Use Dendropy to parse tree. 149 | tree = dendropy.Tree.get( 150 | path=filename, 151 | schema=schema, 152 | preserve_underscores=True) 153 | elif data is not None: 154 | tree = dendropy.Tree.get( 155 | data=data, 156 | schema=schema, 157 | preserve_underscores=True) 158 | else: 159 | raise Exception('No tree given?') 160 | 161 | df = _dendropy_to_dataframe( 162 | tree, 163 | add_node_labels=add_node_labels, 164 | use_uids=use_uids 165 | ) 166 | return df 167 | 168 | 169 | def _read_method(schema): 170 | """Add a write method for named schema to a class. 171 | """ 172 | def func( 173 | self, 174 | filename=None, 175 | data=None, 176 | add_node_labels=True, 177 | combine_on='index', 178 | use_uids=True, 179 | **kwargs): 180 | # Use generic write class to write data. 181 | df0 = self._data 182 | df1 = _read( 183 | filename=filename, 184 | data=data, 185 | schema=schema, 186 | add_node_labels=add_node_labels, 187 | use_uids=use_uids, 188 | **kwargs 189 | ) 190 | return df0.phylo.combine(df1, on=combine_on) 191 | 192 | # Update docs 193 | func.__doc__ = _read_doc_template(schema) 194 | return func 195 | 196 | 197 | def _read_function(schema): 198 | """Add a write method for named schema to a class. 199 | """ 200 | def func( 201 | filename=None, 202 | data=None, 203 | add_node_labels=True, 204 | use_uids=True, 205 | **kwargs): 206 | # Use generic write class to write data. 207 | return _read( 208 | filename=filename, 209 | data=data, 210 | schema=schema, 211 | add_node_labels=add_node_labels, 212 | use_uids=use_uids, 213 | **kwargs 214 | ) 215 | # Update docs 216 | func.__doc__ = _read_doc_template(schema) 217 | return func 218 | 219 | 220 | def read_dendropy( 221 | df, 222 | add_node_labels=True, 223 | use_uids=True): 224 | __doc__ = _read_doc_template('dendropy') 225 | 226 | df = _dendropy_to_dataframe( 227 | tree, 228 | add_node_labels=add_node_labels, 229 | use_uids=use_uids 230 | ) 231 | return df 232 | 233 | read_newick = _read_function('newick') 234 | read_nexml = _read_function('nexml') 235 | read_nexus_tree = _read_function('nexus') 236 | -------------------------------------------------------------------------------- /phylopandas/core.py: -------------------------------------------------------------------------------- 1 | # Import pandas 2 | import pandas as pd 3 | from pandas_flavor import register_dataframe_accessor, register_series_accessor 4 | 5 | from . import seqio 6 | from . 
import treeio 7 | 8 | 9 | try: 10 | from phylovega import TreeChart 11 | except ImportError: 12 | TreeChart = None 13 | 14 | 15 | @register_series_accessor('phylo') 16 | class PhyloPandasSeriesMethods(object): 17 | """ 18 | """ 19 | def __init__(self, data): 20 | self._data = data 21 | 22 | # ----------------------------------------------------------- 23 | # Extra write methods. 24 | # ----------------------------------------------------------- 25 | 26 | to_fasta = seqio.write._write_method('fasta') 27 | to_phylip = seqio.write._write_method('phylip') 28 | to_clustal = seqio.write._write_method('clustal') 29 | to_embl = seqio.write._write_method('embl') 30 | to_nexus = seqio.write._write_method('nexus') 31 | to_swiss = seqio.write._write_method('swiss') 32 | to_fastq = seqio.write._write_method('fastq') 33 | to_fasta_twoline = seqio.write._write_method('fasta-2line') 34 | to_phylip_sequential = seqio.write._write_method('phylip-sequential') 35 | to_phylip_relaxed = seqio.write._write_method('phylip-relaxed') 36 | 37 | 38 | @register_dataframe_accessor('phylo') 39 | class PhyloPandasDataFrameMethods(object): 40 | """PhyloPandas accessor to the Pandas DataFrame. 41 | 42 | This accessor adds reading/writing methods to the pandas DataFrame that 43 | are specific to phylogenetic data. 44 | """ 45 | def __init__(self, data): 46 | self._data = data 47 | 48 | # ----------------------------------------------------------- 49 | # Extra read methods. 50 | # ----------------------------------------------------------- 51 | 52 | # Sequence file reading methods 53 | read_fasta = seqio.read._read_method('fasta') 54 | read_phylip = seqio.read._read_method('phylip') 55 | read_clustal = seqio.read._read_method('clustal') 56 | read_embl = seqio.read._read_method('embl') 57 | read_nexus_seq = seqio.read._read_method('nexus') 58 | read_swiss = seqio.read._read_method('swiss') 59 | read_fastq = seqio.read._read_method('fastq') 60 | read_fasta_twoline = seqio.read._read_method('fasta-2line') 61 | read_phylip_sequential = seqio.read._read_method('phylip-sequential') 62 | read_phylip_relaxed = seqio.read._read_method('phylip-relaxed') 63 | 64 | # Tree file reading methods. 65 | read_newick = treeio.read._read_method('newick') 66 | read_nexus_tree = treeio.read._read_method('nexus') 67 | 68 | def read_dendropy( 69 | self, 70 | add_node_labels=True, 71 | combine_on='index', 72 | use_uids=True): 73 | df0 = self._data 74 | df1 = treeio.read.read_dendropy( 75 | self._data, 76 | add_node_labels=add_node_labels, 77 | use_uids=use_uids 78 | ) 79 | return df0.phylo.combine(df1, on=combine_on) 80 | 81 | 82 | # ----------------------------------------------------------- 83 | # Extra write methods. 84 | # ----------------------------------------------------------- 85 | 86 | to_fasta = seqio.write._write_method('fasta') 87 | to_phylip = seqio.write._write_method('phylip') 88 | to_clustal = seqio.write._write_method('clustal') 89 | to_embl = seqio.write._write_method('embl') 90 | to_nexus_seq = seqio.write._write_method('nexus') 91 | to_swiss = seqio.write._write_method('swiss') 92 | to_fastq = seqio.write._write_method('fastq') 93 | to_fasta_twoline = seqio.write._write_method('fasta-2line') 94 | to_phylip_sequential = seqio.write._write_method('phylip-sequential') 95 | to_phylip_relaxed = seqio.write._write_method('phylip-relaxed') 96 | 97 | # Tree file reading methods. 
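# ------------------------------------------------------------------
# [Editorial aside -- a hedged usage sketch, not part of core.py.]
# The read_* and to_* members collected in this class hang off every
# pandas DataFrame through the `phylo` accessor once phylopandas is
# imported. Paths below point at the example data shipped in this
# repository's examples/ directory and assume you run from the repo root:
#
#     import phylopandas as ph
#
#     df = ph.read_fasta("examples/PF08793_seed.fasta")
#     df = df.phylo.read_newick("examples/PF08793_seed.newick",
#                               combine_on="id")
#     fasta_str = df.phylo.to_fasta()   # no filename -> returns a string
#
# combine_on="id" lines each fasta record up with the tree leaf whose
# 'id' column matches it, giving one DataFrame with sequence and tree
# columns side by side.
# ------------------------------------------------------------------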
98 | to_newick = treeio.write._write_method('newick') 99 | to_nexus_tree = treeio.write._write_method('nexus') 100 | 101 | def to_dendropy( 102 | self, 103 | taxon_col='uid', 104 | taxon_annotations=[], 105 | node_col='uid', 106 | node_annotations=[], 107 | branch_lengths=True): 108 | return treeio.write.to_dendropy( 109 | self._data, 110 | taxon_col=taxon_col, 111 | taxon_annotations=taxon_annotations, 112 | node_col=node_col, 113 | node_annotations=node_annotations, 114 | branch_lengths=branch_lengths, 115 | ) 116 | 117 | # ----------------------------------------------------------- 118 | # Useful dataframe methods specific to sequencing data. 119 | # ----------------------------------------------------------- 120 | 121 | def match_value(self, column, value): 122 | """Return a subset dataframe that column values match the given value. 123 | 124 | Parameters 125 | ---------- 126 | column : string 127 | column to search for matches 128 | 129 | value : float, int, list, etc. 130 | values to match. 131 | """ 132 | # Get column 133 | col = self._data[column] 134 | 135 | # Get items in a list 136 | try: 137 | idx = col[col.isin(value)].index 138 | 139 | # Or value is a single item? 140 | except TypeError: 141 | idx = col[col == value].index 142 | 143 | return self._data.loc[idx] 144 | 145 | 146 | def combine(self, other, on='index'): 147 | """Combine two dataframes. Update the first dataframe with second. 148 | New columns are added to the right of the first dataframe. Overlapping 149 | columns update the values of the columns. 150 | 151 | Technical note: maintains order of columns, appending new dataframe to 152 | old. 153 | 154 | Parameters 155 | ---------- 156 | other : DataFrame 157 | Index+Columns that match self will be updated with new values. 158 | New rows will be added separately. 159 | 160 | on : str 161 | Column to update index. 162 | """ 163 | # Determine column labels for new dataframe (Maintain order of columns) 164 | column_idx = {k: None for k in self._data.columns} 165 | column_idx.update({k: None for k in other.columns}) 166 | column_idx = list(column_idx.keys()) 167 | 168 | df0 = self._data.copy() 169 | df1 = other.copy() 170 | 171 | # Set index to whatever column is given 172 | df0 = df0.set_index(on, inplace=False, drop=False) 173 | df1 = df1.set_index(on, inplace=False, drop=False) 174 | 175 | # Write out both dataframes to dictionaries 176 | data0 = df0.to_dict(orient="index") 177 | data1 = df1.to_dict(orient="index") 178 | 179 | # Update. 180 | for key in data1.keys(): 181 | try: 182 | data0[key].update(data1[key]) 183 | except KeyError: 184 | data0[key] = data1[key] 185 | 186 | # Build new dataframe 187 | df = pd.DataFrame(data0).T 188 | 189 | # Check for missing columns 190 | for key in column_idx: 191 | if key not in df.columns: 192 | df[key] = None 193 | 194 | # Reset the index. 195 | df.reset_index(inplace=True) 196 | 197 | # Return dataframe (maintaining original order) 198 | return df[column_idx] 199 | 200 | def display(self, **kwargs): 201 | __doc__ = TreeChart.__doc__ 202 | # Show the tree using phylovega. 203 | try: 204 | if TreeChart is None: 205 | raise NameError 206 | return TreeChart(self._data.to_dict(orient='records'), **kwargs) 207 | except NameError: 208 | raise NameError("Looks like phylovega couldn't be imported. 
Is phylovega installed?") 209 | 210 | -------------------------------------------------------------------------------- /phylopandas/treeio/write.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import dendropy 3 | 4 | def _write_doc_template(schema): 5 | s = """Write to {} format. 6 | 7 | Parameters 8 | ---------- 9 | filename : str 10 | File to write {} string to. If no filename is given, a {} string 11 | will be returned. 12 | 13 | taxon_col : str (default='sequence') 14 | Sequence column name in DataFrame. 15 | 16 | taxon_annotations : str 17 | List of columns to annotation in the tree taxon. 18 | 19 | node_col : str (default='id') 20 | ID column name in DataFrame 21 | 22 | node_annotations : str 23 | List of columns to annotation in the node taxon. 24 | 25 | branch_lengths : bool (default=False) 26 | If True, use only the ID column to label sequences in fasta. 27 | """.format(schema, schema, schema) 28 | return s 29 | 30 | 31 | def _pandas_df_to_dendropy_tree( 32 | df, 33 | taxon_col='uid', 34 | taxon_annotations=[], 35 | node_col='uid', 36 | node_annotations=[], 37 | branch_lengths=True, 38 | ): 39 | """Turn a phylopandas dataframe into a dendropy tree. 40 | 41 | Parameters 42 | ---------- 43 | df : DataFrame 44 | DataFrame containing tree data. 45 | 46 | taxon_col : str (optional) 47 | Column in dataframe to label the taxon. If None, the index will be used. 48 | 49 | taxon_annotations : str 50 | List of columns to annotation in the tree taxon. 51 | 52 | node_col : str (optional) 53 | Column in dataframe to label the nodes. If None, the index will be used. 54 | 55 | node_annotations : str 56 | List of columns to annotation in the node taxon. 57 | 58 | branch_lengths : bool 59 | If True, inclues branch lengths. 60 | """ 61 | if isinstance(taxon_col, str) is False: 62 | raise Exception("taxon_col must be a string.") 63 | 64 | if isinstance(node_col, str) is False: 65 | raise Exception("taxon_col must be a string.") 66 | 67 | # Construct a list of nodes from dataframe. 68 | taxon_namespace = dendropy.TaxonNamespace() 69 | nodes = {} 70 | for idx in df.index: 71 | # Get node data. 72 | data = df.loc[idx] 73 | 74 | # Get taxon for node (if leaf node). 75 | taxon = None 76 | if data['type'] == 'leaf': 77 | taxon = dendropy.Taxon(label=data[taxon_col]) 78 | # Add annotations data. 79 | for ann in taxon_annotations: 80 | taxon.annotations.add_new(ann, data[ann]) 81 | taxon_namespace.add_taxon(taxon) 82 | 83 | # Get label for node. 84 | label = data[node_col] 85 | 86 | # Get edge length. 87 | edge_length = None 88 | if branch_lengths is True: 89 | edge_length = data['length'] 90 | 91 | # Build a node 92 | n = dendropy.Node( 93 | taxon=taxon, 94 | label=label, 95 | edge_length=edge_length 96 | ) 97 | 98 | # Add node annotations 99 | for ann in node_annotations: 100 | n.annotations.add_new(ann, data[ann]) 101 | 102 | nodes[idx] = n 103 | 104 | # Build branching pattern for nodes. 105 | root = None 106 | for idx, node in nodes.items(): 107 | # Get node data. 108 | data = df.loc[idx] 109 | 110 | # Get children nodes 111 | children_idx = df[df['parent'] == data['id']].index 112 | children_nodes = [nodes[i] for i in children_idx] 113 | 114 | # Set child nodes 115 | nodes[idx].set_child_nodes(children_nodes) 116 | 117 | # Check if this is root. 118 | if data['parent'] is None: 119 | root = nodes[idx] 120 | 121 | # Build tree. 
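# ------------------------------------------------------------------
# [Editorial aside -- an illustrative round trip, not part of write.py.]
# At this point `root` holds the seed node and `nodes` the full branching
# pattern, so the Tree construction just below completes the conversion.
# A hedged end-to-end sketch using the example data in examples/:
#
#     import phylopandas as ph
#
#     df = ph.read_newick("examples/PF08793_seed.newick")
#     tree = df.phylo.to_dendropy()          # dendropy.Tree object
#     newick_str = df.phylo.to_newick()      # or write a file instead:
#     df.phylo.to_nexus_tree("tree.nexus")   # 'tree.nexus' is a placeholder path
#
# The conversion relies on the 'type', 'id', 'parent', 'length' and
# 'uid' columns that the treeio readers create.
# ------------------------------------------------------------------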
122 | tree = dendropy.Tree( 123 | seed_node=root, 124 | taxon_namespace=taxon_namespace 125 | ) 126 | return tree 127 | 128 | 129 | def _write( 130 | df, 131 | filename=None, 132 | schema='newick', 133 | taxon_col='uid', 134 | taxon_annotations=[], 135 | node_col='uid', 136 | node_annotations=[], 137 | branch_lengths=True, 138 | **kwargs 139 | ): 140 | """Write a phylopandas tree DataFrame to various formats. 141 | 142 | Parameters 143 | ---------- 144 | df : DataFrame 145 | DataFrame containing tree data. 146 | 147 | filename : str 148 | filepath to write out tree. If None, will return string. 149 | 150 | schema : str 151 | tree format to write out. 152 | 153 | taxon_col : str (optional) 154 | Column in dataframe to label the taxon. If None, the index will be used. 155 | 156 | taxon_annotations : str 157 | List of columns to annotation in the tree taxon. 158 | 159 | node_col : str (optional) 160 | Column in dataframe to label the nodes. If None, the index will be used. 161 | 162 | node_annotations : str 163 | List of columns to annotation in the node taxon. 164 | 165 | branch_lengths : bool 166 | If True, inclues branch lengths. 167 | """ 168 | tree = _pandas_df_to_dendropy_tree( 169 | df, 170 | taxon_col=taxon_col, 171 | taxon_annotations=taxon_annotations, 172 | node_col=node_col, 173 | node_annotations=node_annotations, 174 | branch_lengths=branch_lengths, 175 | ) 176 | 177 | # Write out format 178 | if filename is not None: 179 | tree.write(path=filename, schema=schema, suppress_annotations=False, **kwargs) 180 | else: 181 | return tree.as_string(schema=schema) 182 | 183 | 184 | def _write_method(schema): 185 | """Add a write method for named schema to a class. 186 | """ 187 | def method( 188 | self, 189 | filename=None, 190 | schema=schema, 191 | taxon_col='uid', 192 | taxon_annotations=[], 193 | node_col='uid', 194 | node_annotations=[], 195 | branch_lengths=True, 196 | **kwargs): 197 | # Use generic write class to write data. 198 | return _write( 199 | self._data, 200 | filename=filename, 201 | schema=schema, 202 | taxon_col=taxon_col, 203 | taxon_annotations=taxon_annotations, 204 | node_col=node_col, 205 | node_annotations=node_annotations, 206 | branch_lengths=branch_lengths, 207 | **kwargs 208 | ) 209 | # Update docs 210 | method.__doc__ = _write_doc_template(schema) 211 | return method 212 | 213 | 214 | def _write_function(schema): 215 | """Add a write method for named schema to a class. 216 | """ 217 | def func( 218 | data, 219 | filename=None, 220 | schema=schema, 221 | taxon_col='uid', 222 | taxon_annotations=[], 223 | node_col='uid', 224 | node_annotations=[], 225 | branch_lengths=True, 226 | **kwargs): 227 | # Use generic write class to write data. 
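# ------------------------------------------------------------------
# [Editorial aside -- a hedged sketch, not part of write.py.]
# The generated function simply forwards to _write above, just as the
# accessor methods built by _write_method do for core.py, so both
# spellings below are expected to produce the same output:
#
#     import phylopandas as ph
#     from phylopandas.treeio import write as tree_write
#
#     df = ph.read_newick("examples/PF08793_seed.newick")
#     s1 = tree_write.to_newick(df)    # module-level function
#     s2 = df.phylo.to_newick()        # DataFrame accessor
#
# Keyword arguments (taxon_col, node_col, branch_lengths, ...) behave
# identically through either entry point.
# ------------------------------------------------------------------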
228 | return _write( 229 | data, 230 | filename=filename, 231 | schema=schema, 232 | taxon_col=taxon_col, 233 | taxon_annotations=taxon_annotations, 234 | node_col=node_col, 235 | node_annotations=node_annotations, 236 | branch_lengths=branch_lengths, 237 | **kwargs 238 | ) 239 | # Update docs 240 | func.__doc__ = _write_doc_template(schema) 241 | return func 242 | 243 | def to_dendropy( 244 | data, 245 | taxon_col='uid', 246 | taxon_annotations=[], 247 | node_col='uid', 248 | node_annotations=[], 249 | branch_lengths=True): 250 | return _pandas_df_to_dendropy_tree( 251 | data, 252 | taxon_col=taxon_col, 253 | taxon_annotations=taxon_annotations, 254 | node_col=node_col, 255 | node_annotations=node_annotations, 256 | branch_lengths=branch_lengths, 257 | ) 258 | 259 | to_newick = _write_function('newick') 260 | to_nexml = _write_function('nexml') 261 | to_nexus_tree = _write_function('nexus') 262 | -------------------------------------------------------------------------------- /phylopandas/seqio/write.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | Functions for write sequence data to sequence files. 3 | """ 4 | import pandas as pd 5 | 6 | # Import Biopython 7 | from Bio import SeqIO 8 | from Bio.Seq import Seq 9 | from Bio.SeqRecord import SeqRecord 10 | import Bio.Alphabet 11 | 12 | 13 | def _write_doc_template(schema): 14 | s = """Write to {} format. 15 | 16 | Parameters 17 | ---------- 18 | filename : str 19 | File to write {} string to. If no filename is given, a {} string 20 | will be returned. 21 | 22 | sequence_col : str (default='sequence') 23 | Sequence column name in DataFrame. 24 | 25 | id_col : str (default='id') 26 | ID column name in DataFrame 27 | 28 | id_only : bool (default=False) 29 | If True, use only the ID column to label sequences in fasta. 30 | """.format(schema, schema, schema) 31 | return s 32 | 33 | 34 | def pandas_df_to_biopython_seqrecord( 35 | df, 36 | id_col='uid', 37 | sequence_col='sequence', 38 | extra_data=None, 39 | alphabet=None, 40 | ): 41 | """Convert pandas dataframe to biopython seqrecord for easy writing. 42 | 43 | Parameters 44 | ---------- 45 | df : Dataframe 46 | Pandas dataframe to convert 47 | 48 | id_col : str 49 | column in dataframe to use as sequence label 50 | 51 | sequence_col str: 52 | column in dataframe to use as sequence data 53 | 54 | extra_data : list 55 | extra columns to use in sequence description line 56 | 57 | alphabet : 58 | biopython Alphabet object 59 | 60 | Returns 61 | ------- 62 | seq_records : 63 | List of biopython seqrecords. 64 | """ 65 | seq_records = [] 66 | 67 | for i, row in df.iterrows(): 68 | # Tries getting sequence data. If a TypeError at the seqrecord 69 | # creation is thrown, it is assumed that this row does not contain 70 | # sequence data and therefore the row is ignored. 
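# ------------------------------------------------------------------
# [Editorial aside -- an illustrative sketch, not part of write.py.]
# This converter is what every to_* sequence writer ultimately calls.
# Used directly it looks like the following (column names are this
# package's defaults; generic_protein is only available on the older
# Biopython releases that still provide Bio.Alphabet, which this module
# imports):
#
#     import Bio.Alphabet
#     import phylopandas as ph
#
#     df = ph.read_fasta("examples/PF08793_seed.fasta")
#     records = pandas_df_to_biopython_seqrecord(
#         df,
#         id_col="uid",
#         sequence_col="sequence",
#         extra_data=["id"],                      # folded into the description
#         alphabet=Bio.Alphabet.generic_protein,
#     )
#     print(records[0].format("fasta"))
#
# Rows whose sequence cell is not a string (e.g. NaN from a merged tree
# row) raise TypeError inside the try block below and are skipped.
# ------------------------------------------------------------------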
71 | try: 72 | # Get sequence 73 | seq = Seq(row[sequence_col], alphabet=alphabet) 74 | 75 | # Get id 76 | id = row[id_col] 77 | 78 | # Build a description 79 | description = "" 80 | if extra_data is not None: 81 | description = " ".join([row[key] for key in extra_data]) 82 | 83 | # Build a record 84 | record = SeqRecord( 85 | seq=seq, 86 | id=id, 87 | description=description, 88 | ) 89 | seq_records.append(record) 90 | except TypeError: 91 | pass 92 | 93 | return seq_records 94 | 95 | def pandas_series_to_biopython_seqrecord( 96 | series, 97 | id_col='uid', 98 | sequence_col='sequence', 99 | extra_data=None, 100 | alphabet=None 101 | ): 102 | """Convert pandas series to biopython seqrecord for easy writing. 103 | 104 | Parameters 105 | ---------- 106 | series : Series 107 | Pandas series to convert 108 | 109 | id_col : str 110 | column in dataframe to use as sequence label 111 | 112 | sequence_col : str 113 | column in dataframe to use as sequence data 114 | 115 | extra_data : list 116 | extra columns to use in sequence description line 117 | 118 | Returns 119 | ------- 120 | seq_records : 121 | List of biopython seqrecords. 122 | """ 123 | # Get sequence 124 | seq = Seq(series[sequence_col], alphabet=alphabet) 125 | 126 | # Get id 127 | id = series[id_col] 128 | 129 | # Build a description 130 | description = "" 131 | if extra_data is not None: 132 | description = " ".join([series[key] for key in extra_data]) 133 | 134 | # Build a record 135 | record = SeqRecord( 136 | seq=seq, 137 | id=id, 138 | description=description, 139 | ) 140 | 141 | seq_records = [record] 142 | return seq_records 143 | 144 | def _write( 145 | data, 146 | filename=None, 147 | schema='fasta', 148 | id_col='uid', 149 | sequence_col='sequence', 150 | extra_data=None, 151 | alphabet=None, 152 | **kwargs): 153 | """General write function. Write phylopanda data to biopython format. 154 | 155 | Parameters 156 | ---------- 157 | filename : str 158 | File to write string to. If no filename is given, a string 159 | will be returned. 160 | 161 | sequence_col : str (default='sequence') 162 | Sequence column name in DataFrame. 163 | 164 | id_col : str (default='id') 165 | ID column name in DataFrame 166 | 167 | id_only : bool (default=False) 168 | If True, use only the ID column to label sequences in fasta. 169 | """ 170 | # Check Alphabet if given 171 | if alphabet is None: 172 | alphabet = Bio.Alphabet.Alphabet() 173 | 174 | elif alphabet in ['dna', 'rna', 'protein', 'nucleotide']: 175 | alphabet = getattr(Bio.Alphabet, 'generic_{}'.format(alphabet)) 176 | 177 | else: 178 | raise Exception( 179 | "The alphabet is not recognized. 
Must be 'dna', 'rna', " 180 | "'nucleotide', or 'protein'.") 181 | 182 | # Build a list of records from a pandas DataFrame 183 | if type(data) is pd.DataFrame: 184 | seq_records = pandas_df_to_biopython_seqrecord( 185 | data, 186 | id_col=id_col, 187 | sequence_col=sequence_col, 188 | extra_data=extra_data, 189 | alphabet=alphabet, 190 | ) 191 | 192 | # Build a record from a pandas Series 193 | elif type(data) is pd.Series: 194 | seq_records = pandas_series_to_biopython_seqrecord( 195 | data, 196 | id_col=id_col, 197 | sequence_col=sequence_col, 198 | extra_data=extra_data, 199 | alphabet=alphabet, 200 | ) 201 | 202 | # Write to disk or return string 203 | if filename is not None: 204 | SeqIO.write(seq_records, filename, format=schema, **kwargs) 205 | 206 | else: 207 | return "".join([s.format(schema) for s in seq_records]) 208 | 209 | def _write_method(schema): 210 | """Add a write method for named schema to a class. 211 | """ 212 | def method( 213 | self, 214 | filename=None, 215 | schema=schema, 216 | id_col='uid', 217 | sequence_col='sequence', 218 | extra_data=None, 219 | alphabet=None, 220 | **kwargs): 221 | # Use generic write class to write data. 222 | return _write( 223 | self._data, 224 | filename=filename, 225 | schema=schema, 226 | id_col=id_col, 227 | sequence_col=sequence_col, 228 | extra_data=extra_data, 229 | alphabet=alphabet, 230 | **kwargs 231 | ) 232 | # Update docs 233 | method.__doc__ = _write_doc_template(schema) 234 | return method 235 | 236 | 237 | def _write_function(schema): 238 | """Add a write method for named schema to a class. 239 | """ 240 | def func( 241 | data, 242 | filename=None, 243 | schema=schema, 244 | id_col='uid', 245 | sequence_col='sequence', 246 | extra_data=None, 247 | alphabet=None, 248 | **kwargs): 249 | # Use generic write class to write data. 250 | return _write( 251 | data, 252 | filename=filename, 253 | schema=schema, 254 | id_col=id_col, 255 | sequence_col=sequence_col, 256 | extra_data=extra_data, 257 | alphabet=alphabet, 258 | **kwargs 259 | ) 260 | # Update docs 261 | func.__doc__ = _write_doc_template(schema) 262 | return func 263 | 264 | 265 | # Write functions to various formats. 
266 | to_fasta = _write_function('fasta') 267 | to_phylip = _write_function('phylip') 268 | to_clustal = _write_function('clustal') 269 | to_embl = _write_function('embl') 270 | to_nexus_seq = _write_function('nexus') 271 | to_swiss = _write_function('swiss') 272 | to_fastq = _write_function('fastq') 273 | -------------------------------------------------------------------------------- /docs/_logo/logo-02.svg: -------------------------------------------------------------------------------- 1 | 2 | 13 | 15 | 17 | 18 | 20 | image/svg+xml 21 | 23 | 24 | 25 | 26 | 27 | 31 | 38 | 39 | 43 |   53 | 57 | 60 | 64 | 68 | 72 | 76 | 80 | 84 | 88 | 92 | 96 | 97 | 104 | 105 | 115 | 116 | 120 | 124 | 128 | 132 | 136 | 140 | 144 | 148 | 152 | 156 | 160 | 161 | 162 | 166 | Pandas Phylo 195 | 196 | -------------------------------------------------------------------------------- /docs/_logo/banner.svg: -------------------------------------------------------------------------------- 1 | 2 | 13 | 15 | 17 | 18 | 20 | image/svg+xml 21 | 23 | 24 | 25 | 26 | 27 | 39 | 43 |   53 | 57 | 60 | 64 | 68 | 72 | 76 | 80 | 84 | 88 | 92 | 96 | 97 | 104 | 105 | 115 | 116 | 120 | 124 | 128 | 132 | 136 | 140 | 144 | 148 | 152 | 156 | 160 | 161 | 162 | 166 | 169 | Pandas Phylo 198 | 199 | 200 | -------------------------------------------------------------------------------- /docs/_logo/logo-2.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 39 | 41 | 42 | 44 | image/svg+xml 45 | 47 | 48 | 49 | 50 | 51 | 57 | 64 | 65 | 71 |   82 | 86 | 89 | 94 | 99 | 105 | 110 | 115 | 121 | 127 | 133 | 139 | 140 | 147 | 148 | 159 | 160 | 166 | 170 | 176 | 182 | 188 | 194 | 200 | 206 | 212 | 218 | 224 | 225 | 226 | 232 | Pandas Phylo 261 | 262 | -------------------------------------------------------------------------------- /docs/_logo/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 39 | 41 | 42 | 44 | image/svg+xml 45 | 47 | 48 | 49 | 50 | 51 | 57 | 64 | 65 | 71 |   82 | 86 | 89 | 94 | 99 | 105 | 110 | 115 | 121 | 127 | 133 | 139 | 140 | 147 | 148 | 159 | 160 | 166 | 170 | 176 | 182 | 188 | 194 | 200 | 206 | 212 | 218 | 224 | 225 | 226 | 232 | Pandas Phylo 261 | 262 | -------------------------------------------------------------------------------- /examples/intro-notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Phylopandas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Let me introduce you to PhyloPandas. A Pandas dataframe and interface for phylogenetics." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import phylopandas as ph" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Reading data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Phylopandas comes with various `read_` methods to load phylogenetic data into a Pandas DataFrame.\n", 47 | "\n", 48 | "Check out the various formats by hitting `tab` after `read` in the cell below." 
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "ename": "AttributeError", 58 | "evalue": "module 'phylopandas' has no attribute 'read_'", 59 | "output_type": "error", 60 | "traceback": [ 61 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 62 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 63 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mph\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 64 | "\u001b[0;31mAttributeError\u001b[0m: module 'phylopandas' has no attribute 'read_'" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "ph.read_" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Try reading some of the sequence files in the `data` folder." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "with open('PF08793_seed.fasta', 'r') as f:\n", 86 | " print(f.read())" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "ph.read_fasta('PF08793_seed.fasta')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "ph.read_phylip('PF08793_seed.phylip')" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "ph.read_clustal('PF08793_seed.clustal')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Writing data" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "PhyloPandas attaches a `phylo` accessor to the standard Pandas DataFrame. Inside this accessor are various writing methods, following Pandas syntax, allowing you to write to various sequence formats.\n", 128 | "\n", 129 | "To quickly see the writing functions, hit `tab` after `to_` in the cell below." 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "df = ph.read_fasta('PF08793_seed.fasta')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "df.phylo.to_" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Let's write the dataframe back out to fasta. If you don't give a filename, it will return a string." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "s = df.phylo.to_fasta()\n", 164 | "print(s)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Converting between formats" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "Of course, this means you can easily convert between sequence formats. 
" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 4, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | ">5PosaPz1Ba\n", 191 | "KCIAFDK----ND-KINPFTGRPINENNDTYRMIYSMCHG\n", 192 | ">nTtjWXLcTL\n", 193 | "ACALYYD----DP-TVNPFTDEPLRRYSPIDDLLYRNCES\n", 194 | ">Nk9EoqE14T\n", 195 | "YCTNFHR----DE-SRNPLTGKKLVPTSPIRKAWHKMCSG\n", 196 | ">KrryYldJzG\n", 197 | "LCAEYKR----SP-RYNPWTDRTLAPGSPKHNLISGMCGG\n", 198 | ">8sH15yS2LJ\n", 199 | "VCNDLALCSQHTD-TYNPWTDRALLPDSPVHDMIDYVCNT\n", 200 | ">38EkV6VtF1\n", 201 | "VCERFAA----DP-TRNPVTGSPLSRNDPLYTDLMEICKG\n", 202 | ">goe9RcxcQY\n", 203 | "TCEAFCR----DP-TRNPVTGQKMRRNGIEYQMFAEECDC\n", 204 | ">zBbStiY22V\n", 205 | "KCDEWEKIRLNSS-PKNPFTKRNVKKDGPTYKKIDLICKH\n", 206 | ">gUstHy3NWv\n", 207 | "KCYEWDIAKKKSPLPKSPLTGRKLKQHGPTWKKITAECAT\n", 208 | ">pJSzBTSdyJ\n", 209 | "KCSKWHE----QP-LINPLTNRKIKKNGPTYKELERECGP\n", 210 | ">hHqmLdOzYk\n", 211 | "LCSKWKA----NP-LVNPATGRKIKKDGPVYEKIQKKCS-\n", 212 | ">9PhikwhdAD\n", 213 | "YCDEFER----NP-TRNPRTGRTIKRGGPVFRALERECSD\n", 214 | ">YIM7zb5VSh\n", 215 | "-CPEFAR----DP-TRNPRTGRTIKRGGPTYRALEAECAD\n", 216 | ">hhFPHo9QRt\n", 217 | "ECEQWLA----NK-GINPRTGKAIKIGGPTYKKLEMECKE\n", 218 | ">1UAjmKxk2o\n", 219 | "VCKKFLA----NK-TVSPYSGRPIKPGKKLYNDLEKHCSG\n", 220 | ">AxcIhHg3sO\n", 221 | "QCRAFEE----NP-DVNPNTGRRISPTGPIASSMRRRCMN\n", 222 | ">yuLFxOOfPi\n", 223 | "KCNQLRN----NRYTVNPVSNRAIAPRGDTANTLRRICEQ\n", 224 | ">URSmxyxeaW\n", 225 | "QCETFKR----NKQAVSPLTNCPIDKFGRTAARFRKECD-\n", 226 | "\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "df = ph.read_phylip('PF08793_seed.phylip')\n", 232 | "\n", 233 | "fasta_str = df.phylo.to_fasta()\n", 234 | "\n", 235 | "print(fasta_str)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Reading Tree Data" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "Phylopandas can also read in phylogenetic tree data." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 5, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "(Q8QUQ5_ISKNN/45-79:0.38376442,Q8QUQ6_ISKNN/37-75:0.93473288,(Q8QUQ5_ISKNN/123-157:1.14582942,(Q0E553_SFAVA/142-176:0.94308689,(Q0E553_SFAVA/184-218:0.98977147,(Q0E553_SFAVA/60-94:0.95706148,(((019R_FRG3G/5-39:0.06723315,(019R_FRG3G/139-172:0.05690376,(019R_FRG3G/249-283:0.95772959,019R_FRG3G/302-336:0.58361302)2.745285:0.61968795)1.680162:0.12814819)8.545520:0.30724093,((VF232_IIV6/64-98:0.77338949,((VF380_IIV6/7-45:0.56133629,VF380_IIV3/8-47:0.64307079)7.484104:0.37367018,(VF378_IIV6/4-38:0.31530205,O41158_PBCV1/63-96:0.46076842)1.909391:0.20522645)0.218717:0.09388521)2.531435:0.20551347,Q0E553_SFAVA/14-48:1.58834786)0.265099:0.00027193)6.209727:0.37908212,(Q8QUQ5_ISKNN/164-198:0.63907222,Q8QUQ5_ISKNN/7-42:0.96743219)2.806276:0.362965)0.677978:0.20054193)0.718698:0.20642561)2.503850:0.27168922)1.162623:0.15868612)6.040602:0.48939921);\n", 262 | "\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "with open('PF08793_seed.newick', 'r') as f:\n", 268 | " print( f.read())" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 6, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/html": [ 279 | "
\n", 280 | "\n", 293 | "\n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 
565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | "
typeidparentlengthlabeldistanceuid
0root0None0.00000000.000000kCIjFBZKXZ
1leafQ8QUQ5_ISKNN/45-7900.383764Q8QUQ5_ISKNN/45-790.383764wKP5pcfIok
2leafQ8QUQ6_ISKNN/37-7500.934733Q8QUQ6_ISKNN/37-750.934733Wi6ARQAOcw
3node100.48939910.489399iKoRLPGtl6
4leafQ8QUQ5_ISKNN/123-15711.145829Q8QUQ5_ISKNN/123-1571.635229RbLr5Hi2L9
5node210.15868620.648085pR3f9C8Ort
6leafQ0E553_SFAVA/142-17620.943087Q0E553_SFAVA/142-1761.5911728wbvqaG3jg
7node320.27168930.919775sCUs3pJLK8
8leafQ0E553_SFAVA/184-21830.989771Q0E553_SFAVA/184-2181.909546Lov4UJif6D
9node430.20642641.1262005yDZXG1tyd
10leafQ0E553_SFAVA/60-9440.957061Q0E553_SFAVA/60-942.0832625vdNmIFPXc
11node540.20054251.326742WBQ9xPajWc
12node650.37908261.705824ghTcY2ffpP
13node760.30724172.013065Jppnt2XML0
14leaf019R_FRG3G/5-3970.067233019R_FRG3G/5-392.080298TDxeDIcFpO
15node870.12814882.1412136iGd5fwhap
16leaf019R_FRG3G/139-17280.056904019R_FRG3G/139-1722.198117hyhVxB9tHU
17node980.61968892.760901QnhYrilQej
18leaf019R_FRG3G/249-28390.957730019R_FRG3G/249-2833.718631V4FBuc6aRj
19leaf019R_FRG3G/302-33690.583613019R_FRG3G/302-3363.344514QKM2XtMt64
20node1060.000272101.706096NIJbhPLOuF
21node11100.205513111.911610lk3EvVgXDt
22leafVF232_IIV6/64-98110.773389VF232_IIV6/64-982.684999FAdWRQzPjL
23node12110.093885122.005495i7efTQ6vir
24node13120.373670132.3791651b9wK0j6kP
25leafVF380_IIV6/7-45130.561336VF380_IIV6/7-452.940501M4uL7HMUUX
26leafVF380_IIV3/8-47130.643071VF380_IIV3/8-473.0222363D2lkpDYdj
27node14120.205226142.210721MPQsyhpHRU
28leafVF378_IIV6/4-38140.315302VF378_IIV6/4-382.526023PpUlPwYDu2
29leafO41158_PBCV1/63-96140.460768O41158_PBCV1/63-962.671490TdWSC3FiL6
30leafQ0E553_SFAVA/14-48101.588348Q0E553_SFAVA/14-483.29444499kd8VpIQk
31node1550.362965151.689707y1TJLF3YJa
32leafQ8QUQ5_ISKNN/164-198150.639072Q8QUQ5_ISKNN/164-1982.3287796MpoaU0KeN
33leafQ8QUQ5_ISKNN/7-42150.967432Q8QUQ5_ISKNN/7-422.657139w84ibjT7xv
\n", 649 | "
" 650 | ], 651 | "text/plain": [ 652 | " type id parent length label \\\n", 653 | "0 root 0 None 0.000000 0 \n", 654 | "1 leaf Q8QUQ5_ISKNN/45-79 0 0.383764 Q8QUQ5_ISKNN/45-79 \n", 655 | "2 leaf Q8QUQ6_ISKNN/37-75 0 0.934733 Q8QUQ6_ISKNN/37-75 \n", 656 | "3 node 1 0 0.489399 1 \n", 657 | "4 leaf Q8QUQ5_ISKNN/123-157 1 1.145829 Q8QUQ5_ISKNN/123-157 \n", 658 | "5 node 2 1 0.158686 2 \n", 659 | "6 leaf Q0E553_SFAVA/142-176 2 0.943087 Q0E553_SFAVA/142-176 \n", 660 | "7 node 3 2 0.271689 3 \n", 661 | "8 leaf Q0E553_SFAVA/184-218 3 0.989771 Q0E553_SFAVA/184-218 \n", 662 | "9 node 4 3 0.206426 4 \n", 663 | "10 leaf Q0E553_SFAVA/60-94 4 0.957061 Q0E553_SFAVA/60-94 \n", 664 | "11 node 5 4 0.200542 5 \n", 665 | "12 node 6 5 0.379082 6 \n", 666 | "13 node 7 6 0.307241 7 \n", 667 | "14 leaf 019R_FRG3G/5-39 7 0.067233 019R_FRG3G/5-39 \n", 668 | "15 node 8 7 0.128148 8 \n", 669 | "16 leaf 019R_FRG3G/139-172 8 0.056904 019R_FRG3G/139-172 \n", 670 | "17 node 9 8 0.619688 9 \n", 671 | "18 leaf 019R_FRG3G/249-283 9 0.957730 019R_FRG3G/249-283 \n", 672 | "19 leaf 019R_FRG3G/302-336 9 0.583613 019R_FRG3G/302-336 \n", 673 | "20 node 10 6 0.000272 10 \n", 674 | "21 node 11 10 0.205513 11 \n", 675 | "22 leaf VF232_IIV6/64-98 11 0.773389 VF232_IIV6/64-98 \n", 676 | "23 node 12 11 0.093885 12 \n", 677 | "24 node 13 12 0.373670 13 \n", 678 | "25 leaf VF380_IIV6/7-45 13 0.561336 VF380_IIV6/7-45 \n", 679 | "26 leaf VF380_IIV3/8-47 13 0.643071 VF380_IIV3/8-47 \n", 680 | "27 node 14 12 0.205226 14 \n", 681 | "28 leaf VF378_IIV6/4-38 14 0.315302 VF378_IIV6/4-38 \n", 682 | "29 leaf O41158_PBCV1/63-96 14 0.460768 O41158_PBCV1/63-96 \n", 683 | "30 leaf Q0E553_SFAVA/14-48 10 1.588348 Q0E553_SFAVA/14-48 \n", 684 | "31 node 15 5 0.362965 15 \n", 685 | "32 leaf Q8QUQ5_ISKNN/164-198 15 0.639072 Q8QUQ5_ISKNN/164-198 \n", 686 | "33 leaf Q8QUQ5_ISKNN/7-42 15 0.967432 Q8QUQ5_ISKNN/7-42 \n", 687 | "\n", 688 | " distance uid \n", 689 | "0 0.000000 kCIjFBZKXZ \n", 690 | "1 0.383764 wKP5pcfIok \n", 691 | "2 0.934733 Wi6ARQAOcw \n", 692 | "3 0.489399 iKoRLPGtl6 \n", 693 | "4 1.635229 RbLr5Hi2L9 \n", 694 | "5 0.648085 pR3f9C8Ort \n", 695 | "6 1.591172 8wbvqaG3jg \n", 696 | "7 0.919775 sCUs3pJLK8 \n", 697 | "8 1.909546 Lov4UJif6D \n", 698 | "9 1.126200 5yDZXG1tyd \n", 699 | "10 2.083262 5vdNmIFPXc \n", 700 | "11 1.326742 WBQ9xPajWc \n", 701 | "12 1.705824 ghTcY2ffpP \n", 702 | "13 2.013065 Jppnt2XML0 \n", 703 | "14 2.080298 TDxeDIcFpO \n", 704 | "15 2.141213 6iGd5fwhap \n", 705 | "16 2.198117 hyhVxB9tHU \n", 706 | "17 2.760901 QnhYrilQej \n", 707 | "18 3.718631 V4FBuc6aRj \n", 708 | "19 3.344514 QKM2XtMt64 \n", 709 | "20 1.706096 NIJbhPLOuF \n", 710 | "21 1.911610 lk3EvVgXDt \n", 711 | "22 2.684999 FAdWRQzPjL \n", 712 | "23 2.005495 i7efTQ6vir \n", 713 | "24 2.379165 1b9wK0j6kP \n", 714 | "25 2.940501 M4uL7HMUUX \n", 715 | "26 3.022236 3D2lkpDYdj \n", 716 | "27 2.210721 MPQsyhpHRU \n", 717 | "28 2.526023 PpUlPwYDu2 \n", 718 | "29 2.671490 TdWSC3FiL6 \n", 719 | "30 3.294444 99kd8VpIQk \n", 720 | "31 1.689707 y1TJLF3YJa \n", 721 | "32 2.328779 6MpoaU0KeN \n", 722 | "33 2.657139 w84ibjT7xv " 723 | ] 724 | }, 725 | "execution_count": 6, 726 | "metadata": {}, 727 | "output_type": "execute_result" 728 | } 729 | ], 730 | "source": [ 731 | "ph.read_newick('PF08793_seed.newick')" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "metadata": {}, 737 | "source": [ 738 | "## Why is PhyloPandas useful? \n", 739 | "\n", 740 | "We already have BioPython, DendroPy, ete3, etc. right?" 
741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 7, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "df = ph.read_newick('PF08793_seed.newick')\n", 750 | "\n", 751 | "df2 = df.loc[df.type == \"leaf\"]" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 8, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "data": { 761 | "text/html": [ 762 | "
\n", 763 | "\n", 776 | "\n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " 
\n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | "
typeidparentlengthlabeldistanceuid
0root0None0.00000000.0000009x4F7nTLnY
1leafQ8QUQ5_ISKNN/45-7900.383764Q8QUQ5_ISKNN/45-790.383764bhUZpMzqaw
2leafQ8QUQ6_ISKNN/37-7500.934733Q8QUQ6_ISKNN/37-750.934733AGoLMJy4qb
3node100.48939910.489399PEr58Pk7IB
4leafQ8QUQ5_ISKNN/123-15711.145829Q8QUQ5_ISKNN/123-1571.635229CQmpogxXrH
5node210.15868620.6480854fGJ1yqAd6
6leafQ0E553_SFAVA/142-17620.943087Q0E553_SFAVA/142-1761.591172W89uwOl3sK
7node320.27168930.919775xCOwZZkfi5
8leafQ0E553_SFAVA/184-21830.989771Q0E553_SFAVA/184-2181.909546gDFNACm9Vx
9node430.20642641.126200BngfjtGSGI
10leafQ0E553_SFAVA/60-9440.957061Q0E553_SFAVA/60-942.083262fRZuaBG9S3
11node540.20054251.326742NmlJLQbDRv
12node650.37908261.705824lrdraHKZPu
13node760.30724172.013065PFW37AvcYM
14leaf019R_FRG3G/5-3970.067233019R_FRG3G/5-392.080298mclkZI6LJJ
15node870.12814882.141213L812ps7kEQ
16leaf019R_FRG3G/139-17280.056904019R_FRG3G/139-1722.1981176qtDyUu3Xx
17node980.61968892.760901jbaGKmQX58
18leaf019R_FRG3G/249-28390.957730019R_FRG3G/249-2833.718631ZM0EOpcIQT
19leaf019R_FRG3G/302-33690.583613019R_FRG3G/302-3363.344514WQi85K0XJ9
20node1060.000272101.706096m5gcndbJ8y
21node11100.205513111.911610HTHRlWWbVk
22leafVF232_IIV6/64-98110.773389VF232_IIV6/64-982.684999HKAm1CcD5f
23node12110.093885122.00549543NfhfKUkH
24node13120.373670132.379165pHJPwh7ew7
25leafVF380_IIV6/7-45130.561336VF380_IIV6/7-452.940501rrJHPnwZSf
26leafVF380_IIV3/8-47130.643071VF380_IIV3/8-473.022236ZvZl8mCP8M
27node14120.205226142.210721uBIkldlUE1
28leafVF378_IIV6/4-38140.315302VF378_IIV6/4-382.526023OBk4WSlGu7
29leafO41158_PBCV1/63-96140.460768O41158_PBCV1/63-962.671490PO4PsryR5V
30leafQ0E553_SFAVA/14-48101.588348Q0E553_SFAVA/14-483.294444P3GhB4vdqL
31node1550.362965151.689707HTXFsynsuZ
32leafQ8QUQ5_ISKNN/164-198150.639072Q8QUQ5_ISKNN/164-1982.32877956dIugXUfd
33leafQ8QUQ5_ISKNN/7-42150.967432Q8QUQ5_ISKNN/7-422.657139G193mLw0d7
\n", 1132 | "
" 1133 | ], 1134 | "text/plain": [ 1135 | " type id parent length label \\\n", 1136 | "0 root 0 None 0.000000 0 \n", 1137 | "1 leaf Q8QUQ5_ISKNN/45-79 0 0.383764 Q8QUQ5_ISKNN/45-79 \n", 1138 | "2 leaf Q8QUQ6_ISKNN/37-75 0 0.934733 Q8QUQ6_ISKNN/37-75 \n", 1139 | "3 node 1 0 0.489399 1 \n", 1140 | "4 leaf Q8QUQ5_ISKNN/123-157 1 1.145829 Q8QUQ5_ISKNN/123-157 \n", 1141 | "5 node 2 1 0.158686 2 \n", 1142 | "6 leaf Q0E553_SFAVA/142-176 2 0.943087 Q0E553_SFAVA/142-176 \n", 1143 | "7 node 3 2 0.271689 3 \n", 1144 | "8 leaf Q0E553_SFAVA/184-218 3 0.989771 Q0E553_SFAVA/184-218 \n", 1145 | "9 node 4 3 0.206426 4 \n", 1146 | "10 leaf Q0E553_SFAVA/60-94 4 0.957061 Q0E553_SFAVA/60-94 \n", 1147 | "11 node 5 4 0.200542 5 \n", 1148 | "12 node 6 5 0.379082 6 \n", 1149 | "13 node 7 6 0.307241 7 \n", 1150 | "14 leaf 019R_FRG3G/5-39 7 0.067233 019R_FRG3G/5-39 \n", 1151 | "15 node 8 7 0.128148 8 \n", 1152 | "16 leaf 019R_FRG3G/139-172 8 0.056904 019R_FRG3G/139-172 \n", 1153 | "17 node 9 8 0.619688 9 \n", 1154 | "18 leaf 019R_FRG3G/249-283 9 0.957730 019R_FRG3G/249-283 \n", 1155 | "19 leaf 019R_FRG3G/302-336 9 0.583613 019R_FRG3G/302-336 \n", 1156 | "20 node 10 6 0.000272 10 \n", 1157 | "21 node 11 10 0.205513 11 \n", 1158 | "22 leaf VF232_IIV6/64-98 11 0.773389 VF232_IIV6/64-98 \n", 1159 | "23 node 12 11 0.093885 12 \n", 1160 | "24 node 13 12 0.373670 13 \n", 1161 | "25 leaf VF380_IIV6/7-45 13 0.561336 VF380_IIV6/7-45 \n", 1162 | "26 leaf VF380_IIV3/8-47 13 0.643071 VF380_IIV3/8-47 \n", 1163 | "27 node 14 12 0.205226 14 \n", 1164 | "28 leaf VF378_IIV6/4-38 14 0.315302 VF378_IIV6/4-38 \n", 1165 | "29 leaf O41158_PBCV1/63-96 14 0.460768 O41158_PBCV1/63-96 \n", 1166 | "30 leaf Q0E553_SFAVA/14-48 10 1.588348 Q0E553_SFAVA/14-48 \n", 1167 | "31 node 15 5 0.362965 15 \n", 1168 | "32 leaf Q8QUQ5_ISKNN/164-198 15 0.639072 Q8QUQ5_ISKNN/164-198 \n", 1169 | "33 leaf Q8QUQ5_ISKNN/7-42 15 0.967432 Q8QUQ5_ISKNN/7-42 \n", 1170 | "\n", 1171 | " distance uid \n", 1172 | "0 0.000000 9x4F7nTLnY \n", 1173 | "1 0.383764 bhUZpMzqaw \n", 1174 | "2 0.934733 AGoLMJy4qb \n", 1175 | "3 0.489399 PEr58Pk7IB \n", 1176 | "4 1.635229 CQmpogxXrH \n", 1177 | "5 0.648085 4fGJ1yqAd6 \n", 1178 | "6 1.591172 W89uwOl3sK \n", 1179 | "7 0.919775 xCOwZZkfi5 \n", 1180 | "8 1.909546 gDFNACm9Vx \n", 1181 | "9 1.126200 BngfjtGSGI \n", 1182 | "10 2.083262 fRZuaBG9S3 \n", 1183 | "11 1.326742 NmlJLQbDRv \n", 1184 | "12 1.705824 lrdraHKZPu \n", 1185 | "13 2.013065 PFW37AvcYM \n", 1186 | "14 2.080298 mclkZI6LJJ \n", 1187 | "15 2.141213 L812ps7kEQ \n", 1188 | "16 2.198117 6qtDyUu3Xx \n", 1189 | "17 2.760901 jbaGKmQX58 \n", 1190 | "18 3.718631 ZM0EOpcIQT \n", 1191 | "19 3.344514 WQi85K0XJ9 \n", 1192 | "20 1.706096 m5gcndbJ8y \n", 1193 | "21 1.911610 HTHRlWWbVk \n", 1194 | "22 2.684999 HKAm1CcD5f \n", 1195 | "23 2.005495 43NfhfKUkH \n", 1196 | "24 2.379165 pHJPwh7ew7 \n", 1197 | "25 2.940501 rrJHPnwZSf \n", 1198 | "26 3.022236 ZvZl8mCP8M \n", 1199 | "27 2.210721 uBIkldlUE1 \n", 1200 | "28 2.526023 OBk4WSlGu7 \n", 1201 | "29 2.671490 PO4PsryR5V \n", 1202 | "30 3.294444 P3GhB4vdqL \n", 1203 | "31 1.689707 HTXFsynsuZ \n", 1204 | "32 2.328779 56dIugXUfd \n", 1205 | "33 2.657139 G193mLw0d7 " 1206 | ] 1207 | }, 1208 | "execution_count": 8, 1209 | "metadata": {}, 1210 | "output_type": "execute_result" 1211 | } 1212 | ], 1213 | "source": [ 1214 | "df" 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "code", 1219 | "execution_count": 9, 1220 | "metadata": {}, 1221 | "outputs": [ 1222 | { 1223 | "data": { 1224 | "text/html": [ 1225 | "
\n", 1226 | "\n", 1239 | "\n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | "
typeidparentlengthlabeldistanceuid
1leafQ8QUQ5_ISKNN/45-7900.383764Q8QUQ5_ISKNN/45-790.383764bhUZpMzqaw
2leafQ8QUQ6_ISKNN/37-7500.934733Q8QUQ6_ISKNN/37-750.934733AGoLMJy4qb
4leafQ8QUQ5_ISKNN/123-15711.145829Q8QUQ5_ISKNN/123-1571.635229CQmpogxXrH
6leafQ0E553_SFAVA/142-17620.943087Q0E553_SFAVA/142-1761.591172W89uwOl3sK
8leafQ0E553_SFAVA/184-21830.989771Q0E553_SFAVA/184-2181.909546gDFNACm9Vx
10leafQ0E553_SFAVA/60-9440.957061Q0E553_SFAVA/60-942.083262fRZuaBG9S3
14leaf019R_FRG3G/5-3970.067233019R_FRG3G/5-392.080298mclkZI6LJJ
16leaf019R_FRG3G/139-17280.056904019R_FRG3G/139-1722.1981176qtDyUu3Xx
18leaf019R_FRG3G/249-28390.957730019R_FRG3G/249-2833.718631ZM0EOpcIQT
19leaf019R_FRG3G/302-33690.583613019R_FRG3G/302-3363.344514WQi85K0XJ9
22leafVF232_IIV6/64-98110.773389VF232_IIV6/64-982.684999HKAm1CcD5f
25leafVF380_IIV6/7-45130.561336VF380_IIV6/7-452.940501rrJHPnwZSf
26leafVF380_IIV3/8-47130.643071VF380_IIV3/8-473.022236ZvZl8mCP8M
28leafVF378_IIV6/4-38140.315302VF378_IIV6/4-382.526023OBk4WSlGu7
29leafO41158_PBCV1/63-96140.460768O41158_PBCV1/63-962.671490PO4PsryR5V
30leafQ0E553_SFAVA/14-48101.588348Q0E553_SFAVA/14-483.294444P3GhB4vdqL
32leafQ8QUQ5_ISKNN/164-198150.639072Q8QUQ5_ISKNN/164-1982.32877956dIugXUfd
33leafQ8QUQ5_ISKNN/7-42150.967432Q8QUQ5_ISKNN/7-422.657139G193mLw0d7
\n", 1435 | "
" 1436 | ], 1437 | "text/plain": [ 1438 | " type id parent length label \\\n", 1439 | "1 leaf Q8QUQ5_ISKNN/45-79 0 0.383764 Q8QUQ5_ISKNN/45-79 \n", 1440 | "2 leaf Q8QUQ6_ISKNN/37-75 0 0.934733 Q8QUQ6_ISKNN/37-75 \n", 1441 | "4 leaf Q8QUQ5_ISKNN/123-157 1 1.145829 Q8QUQ5_ISKNN/123-157 \n", 1442 | "6 leaf Q0E553_SFAVA/142-176 2 0.943087 Q0E553_SFAVA/142-176 \n", 1443 | "8 leaf Q0E553_SFAVA/184-218 3 0.989771 Q0E553_SFAVA/184-218 \n", 1444 | "10 leaf Q0E553_SFAVA/60-94 4 0.957061 Q0E553_SFAVA/60-94 \n", 1445 | "14 leaf 019R_FRG3G/5-39 7 0.067233 019R_FRG3G/5-39 \n", 1446 | "16 leaf 019R_FRG3G/139-172 8 0.056904 019R_FRG3G/139-172 \n", 1447 | "18 leaf 019R_FRG3G/249-283 9 0.957730 019R_FRG3G/249-283 \n", 1448 | "19 leaf 019R_FRG3G/302-336 9 0.583613 019R_FRG3G/302-336 \n", 1449 | "22 leaf VF232_IIV6/64-98 11 0.773389 VF232_IIV6/64-98 \n", 1450 | "25 leaf VF380_IIV6/7-45 13 0.561336 VF380_IIV6/7-45 \n", 1451 | "26 leaf VF380_IIV3/8-47 13 0.643071 VF380_IIV3/8-47 \n", 1452 | "28 leaf VF378_IIV6/4-38 14 0.315302 VF378_IIV6/4-38 \n", 1453 | "29 leaf O41158_PBCV1/63-96 14 0.460768 O41158_PBCV1/63-96 \n", 1454 | "30 leaf Q0E553_SFAVA/14-48 10 1.588348 Q0E553_SFAVA/14-48 \n", 1455 | "32 leaf Q8QUQ5_ISKNN/164-198 15 0.639072 Q8QUQ5_ISKNN/164-198 \n", 1456 | "33 leaf Q8QUQ5_ISKNN/7-42 15 0.967432 Q8QUQ5_ISKNN/7-42 \n", 1457 | "\n", 1458 | " distance uid \n", 1459 | "1 0.383764 bhUZpMzqaw \n", 1460 | "2 0.934733 AGoLMJy4qb \n", 1461 | "4 1.635229 CQmpogxXrH \n", 1462 | "6 1.591172 W89uwOl3sK \n", 1463 | "8 1.909546 gDFNACm9Vx \n", 1464 | "10 2.083262 fRZuaBG9S3 \n", 1465 | "14 2.080298 mclkZI6LJJ \n", 1466 | "16 2.198117 6qtDyUu3Xx \n", 1467 | "18 3.718631 ZM0EOpcIQT \n", 1468 | "19 3.344514 WQi85K0XJ9 \n", 1469 | "22 2.684999 HKAm1CcD5f \n", 1470 | "25 2.940501 rrJHPnwZSf \n", 1471 | "26 3.022236 ZvZl8mCP8M \n", 1472 | "28 2.526023 OBk4WSlGu7 \n", 1473 | "29 2.671490 PO4PsryR5V \n", 1474 | "30 3.294444 P3GhB4vdqL \n", 1475 | "32 2.328779 56dIugXUfd \n", 1476 | "33 2.657139 G193mLw0d7 " 1477 | ] 1478 | }, 1479 | "execution_count": 9, 1480 | "metadata": {}, 1481 | "output_type": "execute_result" 1482 | } 1483 | ], 1484 | "source": [ 1485 | "df2" 1486 | ] 1487 | }, 1488 | { 1489 | "cell_type": "markdown", 1490 | "metadata": {}, 1491 | "source": [ 1492 | "# Here is where the real magic happens!" 1493 | ] 1494 | }, 1495 | { 1496 | "cell_type": "markdown", 1497 | "metadata": {}, 1498 | "source": [ 1499 | "## Reading Sequence *and* Tree Data" 1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "markdown", 1504 | "metadata": {}, 1505 | "source": [ 1506 | "Phylopandas has the ability to combine sequence and tree data in a single DataFrame." 1507 | ] 1508 | }, 1509 | { 1510 | "cell_type": "code", 1511 | "execution_count": 10, 1512 | "metadata": {}, 1513 | "outputs": [], 1514 | "source": [ 1515 | "# Read sequences.\n", 1516 | "df = ph.read_fasta('PF08793_seed.fasta')\n", 1517 | "\n", 1518 | "# Read tree.\n", 1519 | "df = df.phylo.read_newick('PF08793_seed.newick', combine_on='id')\n", 1520 | "#df" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "markdown", 1525 | "metadata": {}, 1526 | "source": [ 1527 | "This enables us to build phylogenetics tools around a single, core dataframe. 
" 1528 | ] 1529 | }, 1530 | { 1531 | "cell_type": "markdown", 1532 | "metadata": {}, 1533 | "source": [ 1534 | "## View an interactive Tree" 1535 | ] 1536 | }, 1537 | { 1538 | "cell_type": "markdown", 1539 | "metadata": {}, 1540 | "source": [ 1541 | "**You must have PhyloVega installed!**\n", 1542 | "\n", 1543 | "https://github.com/Zsailer/phylovega" 1544 | ] 1545 | } 1546 | ], 1547 | "metadata": { 1548 | "kernelspec": { 1549 | "display_name": "Python 3", 1550 | "language": "python", 1551 | "name": "python3" 1552 | }, 1553 | "language_info": { 1554 | "codemirror_mode": { 1555 | "name": "ipython", 1556 | "version": 3 1557 | }, 1558 | "file_extension": ".py", 1559 | "mimetype": "text/x-python", 1560 | "name": "python", 1561 | "nbconvert_exporter": "python", 1562 | "pygments_lexer": "ipython3", 1563 | "version": "3.7.3" 1564 | } 1565 | }, 1566 | "nbformat": 4, 1567 | "nbformat_minor": 2 1568 | } 1569 | --------------------------------------------------------------------------------