├── .gitignore ├── .pylintrc ├── .readthedocs.yaml ├── CHANGELOG.rst ├── LICENSE.txt ├── README.md ├── Snakefile ├── config_example.yaml ├── config_example2.yaml ├── config_example2_lorelei.yaml ├── data ├── example │ ├── ali.fa.gz │ ├── correct_only_genelist.txt │ ├── forest.nhx │ ├── genes │ │ ├── genes.Amia.calva.bed │ │ ├── genes.Astyanax.mexicanus.bed │ │ ├── genes.Danio.rerio.bed │ │ ├── genes.Esox.lucius.bed │ │ ├── genes.Gasterosteus.aculeatus.bed │ │ ├── genes.Lepisosteus.oculatus.bed │ │ ├── genes.Oncorhynchus.mykiss.bed │ │ ├── genes.Oreochromis.niloticus.bed │ │ ├── genes.Oryzias.latipes.bed │ │ ├── genes.Poecilia.formosa.bed │ │ ├── genes.Salmo.salar.bed │ │ ├── genes.Salvelinus.alpinus.bed │ │ ├── genes.Takifugu.rubripes.bed │ │ ├── genes.Tetraodon.nigroviridis.bed │ │ └── genes.Xiphophorus.maculatus.bed │ ├── genes_sp_mapping.txt │ ├── lowcov │ └── species_tree.nwk └── example2 │ ├── ali.fa.gz │ ├── genes │ ├── genes.Amia.calva.list │ ├── genes.Arapaima.gigas.list │ ├── genes.Danio.rerio.list │ ├── genes.Gasterosteus.aculeatus.list │ ├── genes.Lepisosteus.oculatus.list │ ├── genes.Oryzias.latipes.list │ ├── genes.Paramormyrops.kingsleyae.list │ └── genes.Scleropages.formosus.list │ ├── preduplication_ancgenes.tsv │ ├── species_tree.nwk │ └── trees.nhx ├── doc ├── Makefile ├── _static │ ├── Metropolis-Medium.otf │ └── style.css ├── conf.py ├── getting_started_installation.rst ├── getting_started_usage.rst ├── img │ ├── basic_sptree.png │ ├── diagnostic_by_homeo_bowfin.png │ ├── diagnostic_on_medaka.png │ ├── example_cor_27.png │ ├── example_ori_27.png │ ├── lore_on_medaka.png │ ├── scorpios_illustrated.png │ └── sptree_lore.png ├── index.rst ├── input_building_a_dataset.rst ├── input_description.rst ├── input_formatting.rst ├── input_your_configuration_file.rst ├── lorelei_configuration_file.rst ├── lorelei_introduction.rst ├── lorelei_usage.rst ├── make.bat ├── output_advanced.rst ├── output_genetrees.rst ├── output_treeviz.rst ├── 
project_changelog.rst ├── project_info.rst ├── requirements.txt ├── scripts.graphs.rst ├── scripts.lorelei.rst ├── scripts.rst ├── scripts.synteny.rst └── scripts.trees.rst ├── envs ├── graphs.yaml ├── plots.yaml ├── polytomysolver.yaml ├── rideogram.yaml └── scorpios.yaml ├── iterate_scorpios.sh ├── module_build_trees.smk ├── module_correct_trees.smk ├── module_graphs_orthogroups.smk ├── module_lorelei_diagnostic.smk ├── module_lorelei_lktests.smk ├── module_orthology_table.smk ├── module_synteny_ortho_para.smk ├── scorpios_lorelei.smk └── scripts ├── __init__.py ├── correct_subtrees_treebest.sh ├── graphs ├── __init__.py ├── combine_outgroups.py └── orthogroups.py ├── lorelei ├── __init__.py ├── constrained_aore_lore_topologies.py ├── fix_rideogram.py ├── homeologs_pairs_from_ancestor.py ├── homeologs_tree_conflicts.py ├── make_rideograms_inputs.py ├── plot_genome.R └── write_ancgenes_treeclass.py ├── make_lk_test_consel.sh ├── prototype_au_test3.sh ├── synteny ├── __init__.py ├── duplicated_families.py ├── f1_score_optimization.py ├── filter_no_synteny_genes.py ├── filter_regions.py ├── missed_orthologies.py ├── mygenome.py ├── pairwise_orthology_synteny.py ├── syntenycompare.py └── utilities.py └── trees ├── __init__.py ├── build_treebest_trees.py ├── convert_ids.py ├── cut_subtrees.py ├── genetree.py ├── inconsistent_trees.py ├── iteration_nhx_tags.py ├── make_tree_images.py ├── merge_subtrees.py ├── orthologs.py ├── parse_au_test.py ├── regraft_subtrees.py ├── speciestree.py └── utilities.py /.gitignore: -------------------------------------------------------------------------------- 1 | .snakemake/ 2 | SCORPiOs_*/ 3 | *__pycache__/ 4 | *.pyc 5 | data/* 6 | !data/example/ 7 | !data/example2/ 8 | data/example/genes/*.bz2 9 | config* 10 | !config_example* 11 | doc/build/ -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read 
the Docs configuration file 2 | 3 | # Required 4 | version: 2 5 | 6 | # Build documentation in the doc/ directory with Sphinx 7 | sphinx: 8 | configuration: doc/conf.py 9 | fail_on_warning: false 10 | 11 | # Optionally set the env required to build your docs 12 | python: 13 | version: 3.6 14 | install: 15 | - requirements: doc/requirements.txt -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | All notable changes to SCORPiOs, added after the first released version v1.0.0, will be documented here. 2 | 3 | [Version 2.0.0] - 13/12/2021 - `v2.0.0 `_ 4 | ---------------------------- 5 | 6 | Introducing SCORPiOs LORelEi (Lineage-specific Ohnolog Resolution Extension), an extension to SCORPiOs to detect LORe. Note that the scorpios environment has to be updated and that additional snakemake arguments are required to run scorpios (--scheduler=greedy and --cores, see the docs for updated usage instructions). 7 | 8 | Added 9 | ^^^^^ 10 | - SCORPiOs LORelEi (LORe Extension): an extension to diagnose potential cases of lineage-specific rediploidisation. 11 | - New experimental option to recompute branch-lengths with RAxML after all subtrees corrections (instead of treebest phyml). 12 | - Removed the deprecated :code:`--buffer_size` argument from the configuration file. 13 | - Sphinx autodoc for the API. 14 | 15 | Changed 16 | ^^^^^^^ 17 | - Updated SCORPiOs conda environment to snakemake version 6.6.1 and added dependency to the roman python package. 18 | 19 | 20 | [Version 1.3.0] - 23/11/2020 - `v1.3.0 `_ 21 | ------------------------------------------- 22 | 23 | Further workflow updates to improve computational efficiency and simplify its usage. Note that the main conda environment has been updated and needs to be reinstalled if you have the previous version. 
24 | 25 | Added 26 | ^^^^^ 27 | - Default output trees in iterative mode now have tags for corrected internal WGD nodes (in addition to corresponding descending leaves). 28 | 29 | Changed 30 | ^^^^^^^ 31 | - **RAxML** replaces PhyML to compute site likelihood (CONSEL input for likelihood AU-tests). 32 | - Simplified usage for iterative correction: the wrapper script directly parses the YAML configuration file so that providing the jobname is no longer required (see the updated :ref:`usage instructions `). 33 | 34 | 35 | Version 1.2.0 - 18/10/2020 - `v1.2.0 `_ 36 | ------------------------------------------- 37 | 38 | Minor workflow updates to improve computational efficiency and scalability to large datasets. 39 | 40 | Added 41 | ^^^^^ 42 | - Option to perform community detection with spectral clustering instead of Girvan-Newman, for improved computational efficiency on large datasets. (See the :ref:`documentation` for usage instructions). 43 | 44 | Changed 45 | ^^^^^^^ 46 | - treebest distmat now replaces fastdist to build input distance matrices for profileNJ. 47 | - :ref:`Installation instructions ` now recommend mamba for a faster dependency solving process. 48 | 49 | Version 1.1.0 - 19/05/2020 - `v1.1.0 `_ 50 | ------------------------------------- 51 | 52 | Developed a new visualization tool to inspect tree corrections. 53 | 54 | Added 55 | ^^^^^ 56 | - New tool to generate .png images for corrected trees. 57 | - New html documentation. 58 | 59 | Fixed 60 | ^^^^^ 61 | - Fixed high memory usage in subtrees reinsertion step. 62 | - Minor python code speed-ups. 63 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | SCORPiOs corrects orthologies and paralogy relationships in gene trees, so that they are consistent 4 | with a known WGD event. To do so, the method takes advantage of synteny conservation patterns. 
5 | """ 6 | 7 | import os 8 | import glob 9 | import itertools 10 | import sys 11 | from scripts.trees import speciestree as spt 12 | from scripts.synteny import filter_regions 13 | 14 | 15 | ## Set all output names 16 | def out_name(name, jobname, iteration, wcard_wgd=False, wcard_outgr=False): 17 | """ 18 | Generates output names with jobname directory prefix and iteration suffix. 19 | Also adds required wildcards. 20 | """ 21 | if wcard_wgd: 22 | name+='_{wgd}' 23 | 24 | if wcard_outgr: 25 | name+='_{outgr}' 26 | 27 | name = "SCORPiOs_"+jobname+'/'+name+'_'+str(iteration) 28 | return name 29 | 30 | JNAME = config["jobname"] 31 | ITER = config['current_iter'] 32 | 33 | if "trees" in config and os.path.isfile(config["trees"]): 34 | input_trees = config["trees"] 35 | elif "trees" in config and not os.path.isfile(config["trees"]): 36 | trees_err = config["trees"] 37 | sys.stderr.write(f"Error: {trees_err} trees file does not exist.\n") 38 | sys.exit(1) 39 | else: 40 | input_trees = out_name("input_forest", JNAME, ITER)+'.nhx' 41 | 42 | OrthoTableStrict = out_name("Families/HomologsStrict", JNAME, ITER, True, True) 43 | Chr = out_name("Families/Chr", JNAME, ITER, True, True) 44 | OrthoTable = out_name("Families/Homologs", JNAME, ITER, True, True) 45 | OrthoTableF = out_name("Families/HomologsFilter", JNAME, ITER, True, True) 46 | TreesOrthologies = out_name("TreesOrthologies", JNAME, ITER) 47 | SyntenyOrthoPred = out_name("SyntenyOrthoPred", JNAME, ITER, True, True) 48 | Sorted_SyntenyOrthoPred = out_name("Synteny/Sorted_SyntenyOrthoPred", JNAME, ITER, True, True) 49 | GraphsOrthogroups = out_name("Graphs/GraphsOrthogroups", JNAME, ITER, True, True) 50 | Summary = out_name("Graphs/Summary", JNAME, ITER, True, True) 51 | outcombin = out_name("Graphs/outcombin", JNAME, ITER, True) 52 | Acc = out_name("Corrections/Accepted_Trees", JNAME, ITER, True) 53 | MULTIGENIC = out_name("Trees/Multigenic", JNAME, ITER, True) 54 | TREES_SUMMARY = 
out_name("Corrections/Trees_summary", JNAME, ITER, True) 55 | SUBTREES = out_name("Trees/subtrees", JNAME, ITER) 56 | SUBALIS = out_name("Trees/subalis", JNAME, ITER) 57 | CTREES = out_name("Trees/ctrees", JNAME, ITER) 58 | Pairwise_SyntenyOrthoPred = out_name("Pairwise_SyntenyOrthoPred", JNAME, ITER) 59 | PolyS = out_name("Corrections/PolyS", JNAME, ITER) 60 | OutPolylk = out_name("Corrections/Res_polylk", JNAME, ITER) 61 | outTrees = out_name("SCORPiOs_output", JNAME, ITER)+'.nhx' 62 | treeB = out_name("Corrections/TreeB", JNAME, ITER) 63 | OuttreeBlk = out_name("Corrections/Res_treeBlk", JNAME, ITER) 64 | outTmpTrees = out_name("Corrections/tmp_whole_trees", JNAME, ITER) 65 | UNCERTAIN = out_name("Families/UNCERTAIN", JNAME, ITER, True, True) 66 | regions = out_name("Families/tmp_iter_updated_regions", JNAME, ITER, True, True) 67 | NO_ANC_TREE = out_name("sptree_no_anc", JNAME, ITER)+'.nwk' 68 | tmp_matrix = out_name("fastdist_mat", JNAME, ITER, True) 69 | fam_no_graph = out_name("Families/Summary_fam_no_graph", JNAME, ITER, True, True) 70 | orthologs = out_name("Families/orthologs", JNAME, ITER, True, True) 71 | paralogs = out_name("Families/paralogs", JNAME, ITER, True, True) 72 | Threshold = out_name("Families/threshold", JNAME, ITER, True, True) 73 | 74 | # get correct input names and params in iterative mode 75 | args_autho = '' 76 | OrthoTable_prev = '' 77 | Acc_prev = '' 78 | incombin = '' 79 | if int(ITER) > 1: 80 | Acc_prev = out_name("Corrections/Accepted_Trees", JNAME, int(ITER)-1, True, False) 81 | OrthoTable_prev = out_name("Families/Homologs", JNAME, int(ITER)-1, True, True) 82 | input_trees = out_name("SCORPiOs_output", JNAME, int(ITER)-1)+'.nhx' 83 | args_autho = '-filter '+regions 84 | incombin = out_name("Graphs/outcombin", JNAME, int(ITER)-1, True) 85 | 86 | 87 | arg_brlength = '-br '+str(config.get('brlength', 'y')) 88 | #if in iterative mode we force re-computation of branch-lengths 89 | # if int(ITER) > 0: 90 | # arg_brlength = '-br y' 91 | 
92 | ## Set parameters from config 93 | config["genes_sp_mapping"] = config.get("genes_sp_mapping", "") 94 | config["windowSize"] = config.get("windowSize", 15) 95 | config["cutoff"] = config.get("cutoff", 0) 96 | config["ignoreSingleGeneCom"] = config.get("ignoreSingleGeneCom", 'y') 97 | config["save_subtrees_lktest"] = config.get("save_subtrees_lktest", 'y') 98 | config["save_tmp_trees"] = config.get("save_tmp_trees", "n") 99 | 100 | # Set genes file format if dyogen format is specified (otherwise bed is assumed) 101 | config["genes_format"] = config.get("genes_format", "bed") 102 | 103 | #all WGDs in trees 104 | anc_other = '' 105 | anc_arg = '-ow '+','.join(config['WGDs'].keys()) 106 | 107 | #"lowcoverage" species, place them in families but don't use them in the graphs 108 | lowcov = '' 109 | lowcov_arg = '' 110 | if "lowcov_sp" in config: 111 | with open(config["lowcov_sp"], 'r') as f: 112 | lowcov = (',').join([sp.strip() for sp in f]) 113 | lowcov_arg = '-l '+lowcov 114 | 115 | wildcard_constraints: 116 | pairwise="[A-Za-z.0-9]+_[A-Za-z.0-9]+", #no underscore in sp names, as also required in ensembl 117 | wgd="[A-Za-z.0-9]+", 118 | outgr="[A-Za-z.0-9]+" 119 | 120 | config["filter_otable_nosynteny"] = config.get("filter_otable_nosynteny", 'n') 121 | 122 | ### WORKFLOW 123 | 124 | #Final output 125 | rule Target: 126 | input: "SCORPiOs_"+config['jobname']+"/.cleanup_"+str(ITER) 127 | 128 | #include the 5 modules 129 | include: "module_build_trees.smk" 130 | include: "module_orthology_table.smk" 131 | include: "module_synteny_ortho_para.smk" 132 | include: "module_graphs_orthogroups.smk" 133 | include: "module_correct_trees.smk" 134 | -------------------------------------------------------------------------------- /config_example.yaml: -------------------------------------------------------------------------------- 1 | #=========================== SCORPiOs EXAMPLE CONFIGURATION FILE ============================# 2 | ##For a full description of supported 
settings, please take a look at SCORPiOs documentation.# 3 | 4 | 5 | #----------------------------------------- INPUTS -------------------------------------------# 6 | 7 | # INPUT1 - The gene trees to correct, as a single file in New Hampshire Extended (.nhx) format. 8 | trees: data/example/forest.nhx 9 | 10 | # INPUT2 - The multiple sequence alignments used to build the trees, as a single fasta file. 11 | alis: data/example/ali.fa.gz 12 | 13 | # INPUT3 - The genes coordinates for all duplicated species and outgroup(s). 14 | #one file per species, in BED (.bed) format. Can also be in 'dyogen' format. 15 | genes: data/example/genes/genes.%s.bed 16 | 17 | # Uncomment if genes coordinates are in dyogen format, otherwise .bed is assumed. 18 | #genes_format: dyogen 19 | 20 | # INPUT4 - The species tree in Newick format with labelled internal nodes (ancestor names). 21 | species_tree: data/example/species_tree.nwk 22 | 23 | # Gene-to-species mapping file : a single text file with two columns: gene_name species_name. 24 | #genes_sp_mapping: data/example/genes_sp_mapping.txt 25 | 26 | # Restrict correction to gene trees that contain the gene id listed in the following file: 27 | # (any gene id within a given tree can be listed and will induce correction of this tree) 28 | # subset_to_correct: 'data/example/correct_only_genelist.txt' 29 | 30 | 31 | #----------------------------------------- OUTPUTS ------------------------------------------# 32 | 33 | # Provide a job name, which will be appended to the output folder name. 34 | jobname: 'example' 35 | 36 | 37 | #---------------------------------------- PARAMETERS ----------------------------------------# 38 | 39 | # Whole-genome duplication(s) and outgroup(s) 40 | # Outgroups, if multiple, should be monophyletic if you intend to run LORelEi. 
41 | WGDs: 42 | Clupeocephala: 'Lepisosteus.oculatus,Amia.calva' 43 | Salmonidae: 'Esox.lucius,Gasterosteus.aculeatus,Oryzias.latipes' 44 | 45 | # Whether the average number of syntenic orthologs to include genes in the Orthology Table 46 | # should be optimized: yes ('y') or no ('n'). Default threshold value if not optimized: 2.0. 47 | optimize_synteny_support_threshold: 'y' 48 | 49 | # Whether orthologs without synteny support should be discarded from the synteny analysis: 50 | # yes ('y') or no ('n') 51 | filter_otable_nosynteny: 'n' 52 | 53 | # Size of the sliding window used in the pairwise synteny analysis: 54 | windowSize: 15 55 | 56 | # Cut-off on pairwise synteny similarity scores (between 0 and 1): 57 | cutoff: 0 58 | 59 | # Whether branch-lengths should be recomputed after corrections: yes ('y') or no ('n'). 60 | brlength: 'y' 61 | 62 | # Software to recompute the branch-lengths: 'treebest phyml' (default) or 'raxml'. 63 | ## Uncomment to use RAxML## 64 | #brlength_tool: raxml 65 | 66 | # Any species with a poorer assembly quality that should be discarded for synteny analysis. 67 | ## Comment out if you want to use all species in the synteny analysis.## 68 | lowcov_sp: 'data/example/lowcov' 69 | 70 | # Whether to ignore tree-synteny inconsistencies when an orthogroup graph community contains 71 | # only a single gene: yes ('y') or no ('n'). These are poorly-supported WGD duplication nodes. 72 | ignoreSingleGeneCom: 'y' 73 | 74 | # Whether each individual corrected tree and its non-corrected counterpart should be saved, 75 | # each in a .nhx file: yes ('y') or no ('n'). This facilitates direct inspection of corrections. 76 | save_tmp_trees: 'y' 77 | 78 | # Whether more detailed intermediary outputs should be saved: yes ('y') or no ('n'). 
79 | # Setting `save_subtrees_lktest` to 'y' saves, in addition to default outputs: 80 | # - constrained tree topologies 81 | # - profileNJ and TreeBeST synteny-aware trees 82 | # - AU-tests outputs 83 | # Should be set to 'y' if you intend to run LORelEi. 84 | save_subtrees_lktest: 'y' 85 | 86 | 87 | #Skip profileNJ solution and only resolve synteny-derived multifurcated trees with treebest 88 | #skip_profilenj: 'y' 89 | 90 | 91 | # Optionally, use spectral clustering instead of Girvan-Newman for graph community detection. 92 | spectral: 'n' 93 | 94 | # Iterative-correction related option, automatically updated by the wrapper iterate_scorpios.sh. 95 | ## DO NOT MODIFY MANUALLY even if using iterative mode.## 96 | current_iter: 0 97 | 98 | 99 | #---------------------------------------- RESOURCES -----------------------------------------# 100 | 101 | # Maximum number of threads (will never use more than this number). 102 | # It will be restricted to the number specified via --cores (1 if --cores is not invoked). 103 | ncores: 14 104 | 105 | # Use a parallelization scheme specific to large jobs: yes ('y') or no ('n'). 106 | parallel_scheme_large_job: 'n' 107 | 108 | # Limit number of cores for the branch length computation (after all corrections). 109 | ## Uncomment to reduce RAM usage.## 110 | #limit_threads_for_branch_lengths: 37 -------------------------------------------------------------------------------- /config_example2.yaml: -------------------------------------------------------------------------------- 1 | #=========================== SCORPiOs EXAMPLE CONFIGURATION FILE ============================# 2 | ##For a full description of supported settings, please take a look at SCORPiOs documentation.# 3 | 4 | 5 | #----------------------------------------- INPUTS -------------------------------------------# 6 | 7 | # INPUT1 - The gene trees to correct, as a single file in New Hampshire Extended (.nhx) format. 
8 | trees: data/example2/trees.nhx 9 | 10 | # INPUT2 - The multiple sequence alignments used to build the trees, as a single fasta file. 11 | alis: data/example2/ali.fa.gz 12 | 13 | # INPUT3 - The genes coordinates for all duplicated species and outgroup(s). 14 | #one file per species, in BED (.bed) format. Can also be in 'dyogen' format. 15 | genes: data/example2/genes/genes.%s.list 16 | genes_format: 'dyogen' 17 | 18 | # INPUT4 - The species tree in Newick format with labelled internal nodes (ancestor names). 19 | species_tree: data/example2/species_tree.nwk 20 | 21 | 22 | #----------------------------------------- OUTPUTS ------------------------------------------# 23 | 24 | # Provide a job name, which will be appended to the output folder name. 25 | jobname: 'example2' 26 | 27 | 28 | #---------------------------------------- PARAMETERS ----------------------------------------# 29 | 30 | # Whole-genome duplication(s) and outgroup(s) 31 | # Outgroups, if multiple, should be monophyletic if you intend to run LORelEi. 32 | WGDs: 33 | Osteoglossocephalai: 'Amia.calva' 34 | 35 | # Whether the average number of syntenic orthologs to include genes in the Orthology Table 36 | # should be optimized: yes ('y') or no ('n'). Default threshold value if not optimized: 2.0. 37 | optimize_synteny_support_threshold: 'n' 38 | 39 | # Whether branch-lengths should be recomputed after corrections: yes ('y') or no ('n'). 40 | brlength: 'n' 41 | 42 | # Whether more detailed intermediary outputs should be saved: yes ('y') or no ('n'). 43 | # Setting `save_subtrees_lktest` to 'y' saves, in addition to default outputs: 44 | # - constrained tree topologies 45 | # - profileNJ and TreeBeST synteny-aware trees 46 | # - AU-tests outputs 47 | # Should be set to 'y' if you intend to run LORelEi. 48 | save_subtrees_lktest: 'y' 49 | 50 | # Iterative-correction related option, automatically updated by the wrapper iterate_scorpios.sh. 
51 | ## DO NOT MODIFY MANUALLY even if using iterative mode.## 52 | current_iter: 0 53 | 54 | 55 | #---------------------------------------- RESOURCES -----------------------------------------# 56 | 57 | # Maximum number of threads (will never use more than this number). 58 | # It will be restricted to the number specified via --cores (1 if --cores is not invoked). 59 | ncores: 14 60 | 61 | -------------------------------------------------------------------------------- /config_example2_lorelei.yaml: -------------------------------------------------------------------------------- 1 | #======================== SCORPiOs LORelEi EXAMPLE CONFIGURATION FILE =========================# 2 | ## For a full description of supported settings, please take a look at SCORPiOs documentation.## 3 | 4 | #------------------------------------ REQUIRED PARAMETERS -------------------------------------# 5 | # (all modes) # 6 | 7 | #Configuration file for SCORPiOs correction run. 8 | scorpios_config: "config_example2.yaml" 9 | 10 | #SCORPiOs LORelEi mode, can be 'diagnostic', 'likelihood_tests'. 11 | mode: "diagnostic" 12 | 13 | # Duplicated genome to plot conflicted gene families or LORe/AORe families on. 14 | # The corresponding gene coordinates file will be found using the path given in scorpios config. 15 | dup_genome: "Oryzias.latipes" 16 | 17 | #------------------------------------ OPTIONAL PARAMETERS -------------------------------------# 18 | # (all modes) # 19 | 20 | # Optional, SCORPiOs iteration to use (0 for a simple run, 1 recommended in iterative SCORPiOs). 21 | # Default is 0, so you can omit it in normal mode. 22 | #iter: 0 23 | 24 | # Optional, WGD to consider, you can omit it if you ran SCORPiOs on this WGD only. 25 | #lore_wgd: "Osteoglossocephalai" 26 | 27 | # Optional, to append a lorelei jobname to the scorpios jobname, 28 | # (useful to run different lorelei jobs on the same SCORPiOs main job). 
29 | #jname: "myrun1" 30 | 31 | 32 | #----------------------------------- Diagnostic PARAMETERS ------------------------------------# 33 | 34 | # Optional, outgroup OR ancestral reconstruction to use as a proxy to the pre-WGD karyotype. 35 | # You can omit it if you used only 1 outgroup for SCORPiOs correction and want to use it here. 36 | pre_dup_proxy: 37 | use_outgr: "Amia.calva" 38 | # use_anc: "data/example2/preduplication_ancgenes.tsv" 39 | 40 | 41 | #------------------------------------ LK-TESTS PARAMETERS -------------------------------------# 42 | 43 | # Optional, set raxml random seed (for reproducibility) 44 | #raxml_seed: 1234 45 | 46 | # Optional, to prune gene trees and retain only a subset of the duplicated species 47 | #sp_to_keep: "Arapaima.gigas Danio.rerio Scleropages.formosus Oryzias.latipes" 48 | -------------------------------------------------------------------------------- /data/example/ali.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/data/example/ali.fa.gz -------------------------------------------------------------------------------- /data/example/correct_only_genelist.txt: -------------------------------------------------------------------------------- 1 | 102687506_Lepisosteus.oculatus 2 | 102695880_Lepisosteus.oculatus 3 | 102687097_Lepisosteus.oculatus 4 | 102697250_Lepisosteus.oculatus -------------------------------------------------------------------------------- /data/example/genes/genes.Amia.calva.bed: -------------------------------------------------------------------------------- 1 | Aca_scaf_21 13626383 13652135 AMCP00000390_Amia.calva 2 | Aca_scaf_21 13684027 13687008 AMCP00000391_Amia.calva 3 | Aca_scaf_21 15678702 15694287 AMCP00000484_Amia.calva 4 | Aca_scaf_17 2394089 2399355 AMCP00000763_Amia.calva 5 | Aca_scaf_17 4957139 4982651 AMCP00000838_Amia.calva 6 | Aca_scaf_17 10725849 
10737276 AMCP00000962_Amia.calva 7 | Aca_scaf_17 13514684 13519469 AMCP00001067_Amia.calva 8 | Aca_scaf_17 13939711 13960043 AMCP00001077_Amia.calva 9 | Aca_scaf_17 14006307 14007332 AMCP00001079_Amia.calva 10 | Aca_scaf_17 14343748 14351478 AMCP00001083_Amia.calva 11 | Aca_scaf_17 14328300 14333025 AMCP00001085_Amia.calva 12 | Aca_scaf_17 14619523 14623851 AMCP00001086_Amia.calva 13 | Aca_scaf_17 21971299 21992332 AMCP00001260_Amia.calva 14 | Aca_scaf_16 1420067 1430684 AMCP00002043_Amia.calva 15 | Aca_scaf_16 8921914 8933126 AMCP00002243_Amia.calva 16 | Aca_scaf_16 14976106 15005344 AMCP00002453_Amia.calva 17 | Aca_scaf_16 15508448 15534012 AMCP00002460_Amia.calva 18 | Aca_scaf_16 15617380 15623746 AMCP00002462_Amia.calva 19 | Aca_scaf_16 15607041 15616265 AMCP00002463_Amia.calva 20 | Aca_scaf_16 15626198 15630089 AMCP00002464_Amia.calva 21 | Aca_scaf_16 23033808 23036859 AMCP00002625_Amia.calva 22 | Aca_scaf_19 5438380 5443809 AMCP00002883_Amia.calva 23 | Aca_scaf_19 8174912 8176893 AMCP00002981_Amia.calva 24 | Aca_scaf_19 14631357 14632106 AMCP00003213_Amia.calva 25 | Aca_scaf_1 18348135 18349366 AMCP00003797_Amia.calva 26 | Aca_scaf_1 48932465 48934416 AMCP00004419_Amia.calva 27 | Aca_scaf_14 7829952 7848529 AMCP00006527_Amia.calva 28 | Aca_scaf_14 14948171 14963480 AMCP00006699_Amia.calva 29 | Aca_scaf_14 21032398 21048481 AMCP00006814_Amia.calva 30 | Aca_scaf_14 25336345 25340058 AMCP00006905_Amia.calva 31 | Aca_scaf_3 23133811 23139521 AMCP00007585_Amia.calva 32 | Aca_scaf_3 35625810 35637920 AMCP00007875_Amia.calva 33 | Aca_scaf_3 38692843 38704205 AMCP00007970_Amia.calva 34 | Aca_scaf_6 8369707 8383963 AMCP00008670_Amia.calva 35 | Aca_scaf_6 8357428 8364543 AMCP00008671_Amia.calva 36 | Aca_scaf_6 9704582 9735738 AMCP00008721_Amia.calva 37 | Aca_scaf_6 11602415 11618855 AMCP00008782_Amia.calva 38 | Aca_scaf_6 13299306 13302628 AMCP00008858_Amia.calva 39 | Aca_scaf_6 13815425 13823056 AMCP00008880_Amia.calva 40 | Aca_scaf_6 14017350 14021503 
AMCP00008891_Amia.calva 41 | Aca_scaf_6 14103351 14109000 AMCP00008892_Amia.calva 42 | Aca_scaf_6 14089653 14092404 AMCP00008893_Amia.calva 43 | Aca_scaf_6 21294324 21295469 AMCP00009040_Amia.calva 44 | Aca_scaf_6 29573069 29587854 AMCP00009230_Amia.calva 45 | Aca_scaf_6 40992447 40994537 AMCP00009540_Amia.calva 46 | Aca_scaf_6 45281202 45283020 AMCP00009747_Amia.calva 47 | Aca_scaf_9 6908886 6920689 AMCP00009957_Amia.calva 48 | Aca_scaf_9 24197476 24214335 AMCP00010322_Amia.calva 49 | Aca_scaf_9 24833071 24837552 AMCP00010333_Amia.calva 50 | Aca_scaf_9 28339880 28371632 AMCP00010409_Amia.calva 51 | Aca_scaf_9 31114522 31119247 AMCP00010483_Amia.calva 52 | Aca_scaf_9 32252441 32283770 AMCP00010493_Amia.calva 53 | Aca_scaf_9 39656951 39661536 AMCP00010709_Amia.calva 54 | Aca_scaf_9 40662277 40678028 AMCP00010740_Amia.calva 55 | Aca_scaf_13 28951310 28967738 AMCP00011523_Amia.calva 56 | Aca_scaf_18 254881 260591 AMCP00011583_Amia.calva 57 | Aca_scaf_18 2858086 2863415 AMCP00011720_Amia.calva 58 | Aca_scaf_5 17559761 17561006 AMCP00013027_Amia.calva 59 | Aca_scaf_8 11088747 11096433 AMCP00014364_Amia.calva 60 | Aca_scaf_8 25149803 25163635 AMCP00014644_Amia.calva 61 | Aca_scaf_12 8608850 8612864 AMCP00015377_Amia.calva 62 | Aca_scaf_12 26306627 26308732 AMCP00015872_Amia.calva 63 | Aca_scaf_2 25103687 25106170 AMCP00016568_Amia.calva 64 | Aca_scaf_2 25553480 25559207 AMCP00016586_Amia.calva 65 | Aca_scaf_2 25534049 25542045 AMCP00016587_Amia.calva 66 | Aca_scaf_2 25515479 25520165 AMCP00016588_Amia.calva 67 | Aca_scaf_2 25449909 25483710 AMCP00016594_Amia.calva 68 | Aca_scaf_2 25611563 25642333 AMCP00016597_Amia.calva 69 | Aca_scaf_2 25725253 25758926 AMCP00016598_Amia.calva 70 | Aca_scaf_2 25706081 25726547 AMCP00016599_Amia.calva 71 | Aca_scaf_2 25676569 25703327 AMCP00016600_Amia.calva 72 | Aca_scaf_2 25824746 25825846 AMCP00016601_Amia.calva 73 | Aca_scaf_2 25961546 25973771 AMCP00016602_Amia.calva 74 | Aca_scaf_2 25879346 25887229 AMCP00016603_Amia.calva 75 | 
Aca_scaf_2 25929577 25933057 AMCP00016604_Amia.calva 76 | Aca_scaf_2 25956131 25958833 AMCP00016605_Amia.calva 77 | Aca_scaf_2 25906124 25918145 AMCP00016606_Amia.calva 78 | Aca_scaf_2 25948060 25948410 AMCP00016607_Amia.calva 79 | Aca_scaf_2 34761492 34773639 AMCP00016923_Amia.calva 80 | Aca_scaf_2 35010365 35014666 AMCP00016926_Amia.calva 81 | Aca_scaf_2 35033815 35081248 AMCP00016927_Amia.calva 82 | Aca_scaf_2 35097590 35116668 AMCP00016928_Amia.calva 83 | Aca_scaf_2 35137549 35141789 AMCP00016929_Amia.calva 84 | Aca_scaf_2 35121947 35131005 AMCP00016930_Amia.calva 85 | Aca_scaf_2 35091453 35096502 AMCP00016931_Amia.calva 86 | Aca_scaf_2 35205880 35212613 AMCP00016932_Amia.calva 87 | Aca_scaf_2 35220562 35232529 AMCP00016933_Amia.calva 88 | Aca_scaf_2 35158597 35204230 AMCP00016934_Amia.calva 89 | Aca_scaf_2 35247507 35260480 AMCP00016935_Amia.calva 90 | Aca_scaf_2 35235592 35245651 AMCP00016936_Amia.calva 91 | Aca_scaf_11 2713157 2726827 AMCP00017423_Amia.calva 92 | Aca_scaf_11 9893228 9895433 AMCP00017552_Amia.calva 93 | Aca_scaf_11 17928433 17942165 AMCP00017684_Amia.calva 94 | Aca_scaf_11 20603355 20616229 AMCP00017733_Amia.calva 95 | Aca_scaf_11 23767371 23770666 AMCP00017800_Amia.calva 96 | Aca_scaf_11 23781868 23793676 AMCP00017801_Amia.calva 97 | Aca_scaf_11 27427479 27432576 AMCP00017895_Amia.calva 98 | Aca_scaf_11 31659901 31675037 AMCP00017981_Amia.calva 99 | Aca_scaf_11 31645825 31649592 AMCP00017982_Amia.calva 100 | Aca_scaf_15 7883207 7894569 AMCP00018310_Amia.calva 101 | Aca_scaf_15 10299003 10299554 AMCP00018425_Amia.calva 102 | Aca_scaf_4 11888005 11901076 AMCP00019273_Amia.calva 103 | Aca_scaf_4 29581830 29583506 AMCP00019558_Amia.calva 104 | Aca_scaf_4 34563039 34565981 AMCP00019642_Amia.calva 105 | Aca_scaf_4 35742867 35763241 AMCP00019661_Amia.calva 106 | Aca_scaf_4 38842142 38865670 AMCP00019712_Amia.calva 107 | Aca_scaf_4 45019140 45033185 AMCP00019800_Amia.calva 108 | Aca_scaf_4 45206216 45237401 AMCP00019803_Amia.calva 109 | Aca_scaf_7 
7407657 7411975 AMCP00020030_Amia.calva 110 | Aca_scaf_7 18171620 18175979 AMCP00020409_Amia.calva 111 | Aca_scaf_7 25579705 25588751 AMCP00020600_Amia.calva 112 | Aca_scaf_7 36114629 36117694 AMCP00020906_Amia.calva 113 | Aca_scaf_10 24215868 24220678 AMCP00021660_Amia.calva 114 | Aca_scaf_10 25074347 25085082 AMCP00021680_Amia.calva 115 | Aca_scaf_10 25406702 25411562 AMCP00021686_Amia.calva 116 | -------------------------------------------------------------------------------- /data/example/genes/genes.Astyanax.mexicanus.bed: -------------------------------------------------------------------------------- 1 | NC_035898.1 4167953 4239353 103034298_Astyanax.mexicanus 2 | NC_035898.1 4244176 4250400 103027236_Astyanax.mexicanus 3 | NC_035898.1 4263799 4285939 103033979_Astyanax.mexicanus 4 | NC_035898.1 5422178 5450200 103045387_Astyanax.mexicanus 5 | NC_035898.1 5474814 5500346 103032392_Astyanax.mexicanus 6 | NC_035898.1 5539941 5686852 103044182_Astyanax.mexicanus 7 | NC_035898.1 5699599 5704961 103043349_Astyanax.mexicanus 8 | NC_035898.1 5706208 5727621 103043863_Astyanax.mexicanus 9 | NC_035898.1 27799416 27842056 103034016_Astyanax.mexicanus 10 | NC_035899.1 22030937 22035018 103033822_Astyanax.mexicanus 11 | NC_035899.1 47918346 47926402 103032605_Astyanax.mexicanus 12 | NC_035900.1 8867476 8887318 103024169_Astyanax.mexicanus 13 | NC_035900.1 18725892 18806982 103023065_Astyanax.mexicanus 14 | NC_035900.1 18814772 18827717 103035736_Astyanax.mexicanus 15 | NC_035900.1 18829967 18865676 103023371_Astyanax.mexicanus 16 | NC_035900.1 18869146 18878812 103036263_Astyanax.mexicanus 17 | NC_035900.1 24376681 24385379 111192423_Astyanax.mexicanus 18 | NC_035900.1 25022191 25025894 103032189_Astyanax.mexicanus 19 | NC_035900.1 25070634 25134961 103038297_Astyanax.mexicanus 20 | NC_035900.1 25176862 25193886 103024802_Astyanax.mexicanus 21 | NC_035900.1 25266942 25270117 103023029_Astyanax.mexicanus 22 | NC_035900.1 25294253 25333929 103022724_Astyanax.mexicanus 23 
| NC_035900.1 25359459 25364883 103022410_Astyanax.mexicanus 24 | NC_035901.1 3757357 3763477 103023027_Astyanax.mexicanus 25 | NC_035901.1 19722602 19790148 103045377_Astyanax.mexicanus 26 | NC_035902.1 26095111 26529204 103038282_Astyanax.mexicanus 27 | NC_035902.1 30064340 30068306 103045547_Astyanax.mexicanus 28 | NC_035902.1 51355943 51366715 103031389_Astyanax.mexicanus 29 | NC_035903.1 13271177 13303112 103035159_Astyanax.mexicanus 30 | NC_035903.1 21932769 21956278 103040701_Astyanax.mexicanus 31 | NC_035903.1 36970717 36973142 103026488_Astyanax.mexicanus 32 | NC_035905.1 4624101 4665487 103045299_Astyanax.mexicanus 33 | NC_035905.1 8650628 8655017 111193185_Astyanax.mexicanus 34 | NC_035905.1 35264933 35269468 103041454_Astyanax.mexicanus 35 | NC_035905.1 39811767 39858569 103033221_Astyanax.mexicanus 36 | NC_035906.1 5875704 5926455 103042448_Astyanax.mexicanus 37 | NC_035906.1 5881559 5895535 103043281_Astyanax.mexicanus 38 | NC_035906.1 5929824 5988498 103043891_Astyanax.mexicanus 39 | NC_035906.1 14300309 14304383 103041562_Astyanax.mexicanus 40 | NC_035906.1 30949216 30956947 103032107_Astyanax.mexicanus 41 | NC_035907.1 9754233 9770056 103026715_Astyanax.mexicanus 42 | NC_035908.1 17113310 17117221 103032094_Astyanax.mexicanus 43 | NC_035908.1 17867016 17877231 103028343_Astyanax.mexicanus 44 | NC_035909.1 9358502 9370742 103029059_Astyanax.mexicanus 45 | NC_035909.1 18232082 18282278 103041078_Astyanax.mexicanus 46 | NC_035909.1 28849141 28864783 103033422_Astyanax.mexicanus 47 | NC_035909.1 32064797 32071785 103029109_Astyanax.mexicanus 48 | NC_035909.1 37023391 37031363 103023810_Astyanax.mexicanus 49 | NC_035910.1 17744012 17751562 103030328_Astyanax.mexicanus 50 | NC_035910.1 18896021 18899622 103047465_Astyanax.mexicanus 51 | NC_035910.1 26518665 26540810 103038081_Astyanax.mexicanus 52 | NC_035910.1 27402150 27408332 103045072_Astyanax.mexicanus 53 | NC_035910.1 35970109 35975545 103043841_Astyanax.mexicanus 54 | NC_035910.1 35976784 35980086 
103043535_Astyanax.mexicanus 55 | NC_035910.1 38149766 38159925 103028989_Astyanax.mexicanus 56 | NC_035911.1 4748328 4779920 103036708_Astyanax.mexicanus 57 | NC_035911.1 6262897 6443935 103043500_Astyanax.mexicanus 58 | NC_035911.1 6480484 6483848 103044603_Astyanax.mexicanus 59 | NC_035912.1 11498055 11513466 103024055_Astyanax.mexicanus 60 | NC_035912.1 12472804 12571966 103027860_Astyanax.mexicanus 61 | NC_035912.1 23780252 23905504 103035300_Astyanax.mexicanus 62 | NC_035912.1 31230447 31243063 103042161_Astyanax.mexicanus 63 | NC_035913.1 4508636 4512310 103029581_Astyanax.mexicanus 64 | NC_035913.1 12081317 12095138 103041402_Astyanax.mexicanus 65 | NC_035913.1 16570167 16591498 103031786_Astyanax.mexicanus 66 | NC_035913.1 20455216 20520468 103039967_Astyanax.mexicanus 67 | NC_035914.1 21081241 21106649 103039329_Astyanax.mexicanus 68 | NC_035914.1 31381217 31514641 103040352_Astyanax.mexicanus 69 | NC_035914.1 31559170 31583047 103038585_Astyanax.mexicanus 70 | NC_035915.1 780792 809901 103026821_Astyanax.mexicanus 71 | NC_035915.1 11237767 11242057 111194895_Astyanax.mexicanus 72 | NC_035915.1 25784545 25794814 103030310_Astyanax.mexicanus 73 | NC_035915.1 27448518 27463953 103034209_Astyanax.mexicanus 74 | NC_035915.1 29342297 29351215 103035006_Astyanax.mexicanus 75 | NC_035915.1 29994851 30038807 103031296_Astyanax.mexicanus 76 | NC_035917.1 509651 540134 103047322_Astyanax.mexicanus 77 | NC_035918.1 5779891 5798591 103024133_Astyanax.mexicanus 78 | NC_035918.1 13451926 13456441 103028318_Astyanax.mexicanus 79 | NC_035918.1 21412689 21428618 103027815_Astyanax.mexicanus 80 | NC_035919.1 11108041 11115372 103039714_Astyanax.mexicanus 81 | NC_035919.1 11144696 11169857 103040244_Astyanax.mexicanus 82 | NC_035919.1 11176542 11196955 103040563_Astyanax.mexicanus 83 | NC_035919.1 11200297 11239603 103040863_Astyanax.mexicanus 84 | NC_035919.1 11244956 11284551 103041779_Astyanax.mexicanus 85 | NC_035919.1 11291176 11317552 103042086_Astyanax.mexicanus 86 | 
NC_035919.1 11321902 11327936 103042600_Astyanax.mexicanus 87 | NC_035919.1 28736802 28744567 103029066_Astyanax.mexicanus 88 | NC_035919.1 28747087 28755459 103030436_Astyanax.mexicanus 89 | NC_035919.1 28784642 28802718 103030751_Astyanax.mexicanus 90 | NC_035920.1 13751413 13764223 103039142_Astyanax.mexicanus 91 | NC_035920.1 37113010 37114597 103034033_Astyanax.mexicanus 92 | NC_035920.1 41200075 41202618 103036286_Astyanax.mexicanus 93 | NC_035921.1 14639866 14640543 111195974_Astyanax.mexicanus 94 | NW_019170789.1 109253 122490 103043383_Astyanax.mexicanus 95 | NW_019170789.1 224023 227252 103043082_Astyanax.mexicanus 96 | NW_019170833.1 214100 223604 103047431_Astyanax.mexicanus 97 | NW_019171246.1 429119 466673 103032727_Astyanax.mexicanus 98 | NW_019171246.1 478555 496966 103033270_Astyanax.mexicanus 99 | NW_019171406.1 7200 11225 103042554_Astyanax.mexicanus 100 | NW_019171514.1 96889 125632 103047126_Astyanax.mexicanus 101 | NW_019171857.1 13462 26036 103043047_Astyanax.mexicanus 102 | NW_019171857.1 31088 79687 103042216_Astyanax.mexicanus 103 | NW_019171900.1 132397 156581 103027170_Astyanax.mexicanus 104 | NW_019171921.1 246704 275671 103047346_Astyanax.mexicanus 105 | NW_019172032.1 124079 162800 103023898_Astyanax.mexicanus 106 | NW_019172238.1 479315 534452 103035950_Astyanax.mexicanus 107 | NW_019172676.1 33603 40538 103032653_Astyanax.mexicanus 108 | NW_019172676.1 455160 740251 103023674_Astyanax.mexicanus 109 | NW_019172823.1 486150 490486 103030776_Astyanax.mexicanus 110 | NW_019172828.1 1359317 1361921 103043375_Astyanax.mexicanus 111 | NW_019172836.1 113570 146312 103021343_Astyanax.mexicanus 112 | NW_019172841.1 162684 168000 103025435_Astyanax.mexicanus 113 | NW_019172863.1 1163776 1173221 103046920_Astyanax.mexicanus 114 | NW_019172875.1 187720 215532 103034004_Astyanax.mexicanus 115 | NW_019172875.1 216889 233276 103037791_Astyanax.mexicanus 116 | NW_019172906.1 6124632 6132740 103042052_Astyanax.mexicanus 117 | NW_019172945.1 3631324 
3634237 103021382_Astyanax.mexicanus 118 | -------------------------------------------------------------------------------- /data/example/genes/genes.Danio.rerio.bed: -------------------------------------------------------------------------------- 1 | 1 43652480 43684584 ENSDARG00000052712_Danio.rerio 2 | 10 5667401 5684326 ENSDARG00000103600_Danio.rerio 3 | 11 3301277 3308569 ENSDARG00000026577_Danio.rerio 4 | 11 11974708 11979854 ENSDARG00000086998_Danio.rerio 5 | 11 39075613 39105253 ENSDARG00000071212_Danio.rerio 6 | 12 9449665 9468618 ENSDARG00000057531_Danio.rerio 7 | 12 13905286 13957037 ENSDARG00000045129_Danio.rerio 8 | 12 13961268 13966184 ENSDARG00000045130_Danio.rerio 9 | 12 13973058 14078070 ENSDARG00000055652_Danio.rerio 10 | 12 14920088 14922955 ENSDARG00000016854_Danio.rerio 11 | 12 15078124 15082367 ENSDARG00000116039_Danio.rerio 12 | 12 15273304 15311999 ENSDARG00000075340_Danio.rerio 13 | 12 17754859 17756905 ENSDARG00000045166_Danio.rerio 14 | 12 47945664 47959911 ENSDARG00000038226_Danio.rerio 15 | 13 25761279 25762391 ENSDARG00000016951_Danio.rerio 16 | 14 9497032 9535094 ENSDARG00000114748_Danio.rerio 17 | 14 9507291 9522364 ENSDARG00000037555_Danio.rerio 18 | 14 26703331 26704829 ENSDARG00000056130_Danio.rerio 19 | 16 7214005 7239508 ENSDARG00000058597_Danio.rerio 20 | 16 11992498 12006828 ENSDARG00000057879_Danio.rerio 21 | 16 20294976 20305104 ENSDARG00000040695_Danio.rerio 22 | 16 43353106 43356018 ENSDARG00000113257_Danio.rerio 23 | 16 49647402 49661406 ENSDARG00000007257_Danio.rerio 24 | 16 49729339 49846359 ENSDARG00000086162_Danio.rerio 25 | 17 654759 686887 ENSDARG00000112136_Danio.rerio 26 | 17 691453 702817 ENSDARG00000103933_Danio.rerio 27 | 17 4386649 4396692 ENSDARG00000005814_Danio.rerio 28 | 17 7510220 7521586 ENSDARG00000115489_Danio.rerio 29 | 17 7522777 7531632 ENSDARG00000112170_Danio.rerio 30 | 17 19419989 19466319 ENSDARG00000001129_Danio.rerio 31 | 17 20923691 20929461 ENSDARG00000087554_Danio.rerio 32 | 17 24747209 
24787808 ENSDARG00000074004_Danio.rerio 33 | 18 25027495 25051846 ENSDARG00000018788_Danio.rerio 34 | 19 4828251 4851411 ENSDARG00000075567_Danio.rerio 35 | 19 4856351 4905342 ENSDARG00000063726_Danio.rerio 36 | 19 4912817 4962402 ENSDARG00000076280_Danio.rerio 37 | 19 4968947 4985996 ENSDARG00000017809_Danio.rerio 38 | 19 4990320 4999232 ENSDARG00000011076_Danio.rerio 39 | 19 17377346 17385548 ENSDARG00000099527_Danio.rerio 40 | 19 18310386 18313381 ENSDARG00000104170_Danio.rerio 41 | 19 18423378 18492557 ENSDARG00000104089_Danio.rerio 42 | 19 18493782 18597568 ENSDARG00000099183_Danio.rerio 43 | 19 20708460 20722368 ENSDARG00000018602_Danio.rerio 44 | 19 40248697 40328549 ENSDARG00000070228_Danio.rerio 45 | 19 42591150 42607451 ENSDARG00000005023_Danio.rerio 46 | 19 42609132 42617723 ENSDARG00000018010_Danio.rerio 47 | 19 42660158 42682947 ENSDARG00000012135_Danio.rerio 48 | 2 21634128 21734733 ENSDARG00000062251_Danio.rerio 49 | 2 31475772 31519662 ENSDARG00000055565_Danio.rerio 50 | 2 31838442 31853345 ENSDARG00000045421_Danio.rerio 51 | 2 50608099 50629155 ENSDARG00000020794_Danio.rerio 52 | 2 52782056 52795722 ENSDARG00000115832_Danio.rerio 53 | 21 244503 248613 ENSDARG00000102477_Danio.rerio 54 | 21 1780790 1799265 ENSDARG00000045301_Danio.rerio 55 | 21 5209476 5240595 ENSDARG00000036584_Danio.rerio 56 | 21 6136981 6150568 ENSDARG00000044811_Danio.rerio 57 | 21 26991198 26996035 ENSDARG00000044540_Danio.rerio 58 | 21 38732945 38739728 ENSDARG00000054032_Danio.rerio 59 | 22 24389135 24482035 ENSDARG00000103026_Danio.rerio 60 | 23 14097508 14180307 ENSDARG00000001881_Danio.rerio 61 | 23 26946429 26959206 ENSDARG00000076030_Danio.rerio 62 | 23 28185883 28294763 ENSDARG00000009899_Danio.rerio 63 | 23 28343988 28347039 ENSDARG00000003469_Danio.rerio 64 | 23 44797665 44819100 ENSDARG00000054211_Danio.rerio 65 | 24 984822 1010467 ENSDARG00000076791_Danio.rerio 66 | 24 1023839 1036384 ENSDARG00000059412_Danio.rerio 67 | 24 1042594 1100918 
ENSDARG00000077878_Danio.rerio 68 | 24 21640635 21642858 ENSDARG00000071583_Danio.rerio 69 | 24 32411753 32474949 ENSDARG00000040008_Danio.rerio 70 | 24 33767771 33780387 ENSDARG00000056683_Danio.rerio 71 | 25 17479264 17494364 ENSDARG00000076425_Danio.rerio 72 | 25 18766165 18778356 ENSDARG00000076191_Danio.rerio 73 | 25 21064019 21066136 ENSDARG00000034497_Danio.rerio 74 | 25 34956368 34973211 ENSDARG00000034256_Danio.rerio 75 | 3 16157120 16227683 ENSDARG00000002167_Danio.rerio 76 | 3 16229911 16232442 ENSDARG00000013307_Danio.rerio 77 | 3 16265924 16285259 ENSDARG00000021195_Danio.rerio 78 | 3 17615599 17651899 ENSDARG00000026712_Danio.rerio 79 | 3 17689419 17710221 ENSDARG00000089463_Danio.rerio 80 | 3 17744339 17860047 ENSDARG00000058154_Danio.rerio 81 | 3 17867126 17871860 ENSDARG00000052594_Danio.rerio 82 | 3 17878124 17910375 ENSDARG00000058148_Danio.rerio 83 | 3 17910569 17928233 ENSDARG00000058140_Danio.rerio 84 | 3 17933132 17946949 ENSDARG00000070822_Danio.rerio 85 | 3 17951790 18009316 ENSDARG00000099079_Danio.rerio 86 | 3 21240594 21242460 ENSDARG00000007344_Danio.rerio 87 | 3 28634467 28665291 ENSDARG00000060915_Danio.rerio 88 | 4 8609517 8611841 ENSDARG00000045801_Danio.rerio 89 | 4 26303613 26322836 ENSDARG00000098221_Danio.rerio 90 | 4 74087575 74109792 ENSDARG00000100752_Danio.rerio 91 | 5 52000163 52010213 ENSDARG00000051916_Danio.rerio 92 | 5 67980187 67993086 ENSDARG00000030267_Danio.rerio 93 | 6 7428613 7439490 ENSDARG00000037000_Danio.rerio 94 | 6 14086688 14146979 ENSDARG00000062174_Danio.rerio 95 | 6 49551614 49580086 ENSDARG00000015807_Danio.rerio 96 | 7 11595264 11605185 ENSDARG00000076117_Danio.rerio 97 | 7 20779026 20796935 ENSDARG00000017874_Danio.rerio 98 | 7 71792290 71837265 ENSDARG00000099045_Danio.rerio 99 | 8 1284784 1304505 ENSDARG00000042072_Danio.rerio 100 | 8 7494941 7504201 ENSDARG00000003867_Danio.rerio 101 | 8 44703854 44707937 ENSDARG00000006137_Danio.rerio 102 | 8 53948194 53960349 ENSDARG00000102546_Danio.rerio 103 | 
9 307863 321351 ENSDARG00000098883_Danio.rerio 104 | 9 8887528 8895243 ENSDARG00000005049_Danio.rerio 105 | 9 27750490 27790405 ENSDARG00000016059_Danio.rerio 106 | 9 29484043 29497916 ENSDARG00000041110_Danio.rerio 107 | 9 38074082 38130085 ENSDARG00000009738_Danio.rerio 108 | 9 42728269 42730672 ENSDARG00000010962_Danio.rerio 109 | 9 43243212 43538328 ENSDARG00000006065_Danio.rerio 110 | 9 44291721 44295071 ENSDARG00000019566_Danio.rerio 111 | 9 44430705 44489978 ENSDARG00000038941_Danio.rerio 112 | 9 51147380 51217396 ENSDARG00000018553_Danio.rerio 113 | -------------------------------------------------------------------------------- /data/example/genes/genes.Lepisosteus.oculatus.bed: -------------------------------------------------------------------------------- 1 | NC_023179.1 7592053 7597115 102693734_Lepisosteus.oculatus 2 | NC_023180.1 5016037 5053202 107079475_Lepisosteus.oculatus 3 | NC_023180.1 9034952 9041531 102690580_Lepisosteus.oculatus 4 | NC_023180.1 9786463 9799881 102696162_Lepisosteus.oculatus 5 | NC_023180.1 15690550 15702414 102690457_Lepisosteus.oculatus 6 | NC_023180.1 16803751 16829340 102684584_Lepisosteus.oculatus 7 | NC_023180.1 30641063 30700831 102682418_Lepisosteus.oculatus 8 | NC_023180.1 32051580 32364345 102684309_Lepisosteus.oculatus 9 | NC_023180.1 40451903 40462701 102695445_Lepisosteus.oculatus 10 | NC_023180.1 45828542 45867500 102698191_Lepisosteus.oculatus 11 | NC_023180.1 59424623 59427711 102693600_Lepisosteus.oculatus 12 | NC_023180.1 61026664 61036403 102683785_Lepisosteus.oculatus 13 | NC_023181.1 5550776 5554877 102684053_Lepisosteus.oculatus 14 | NC_023181.1 38014137 38020919 102687885_Lepisosteus.oculatus 15 | NC_023181.1 48112026 48127666 102688439_Lepisosteus.oculatus 16 | NC_023182.1 7140768 7172154 102690930_Lepisosteus.oculatus 17 | NC_023182.1 13063254 13089265 102695385_Lepisosteus.oculatus 18 | NC_023182.1 15030281 15047640 102694083_Lepisosteus.oculatus 19 | NC_023182.1 17813365 17898021 
102692071_Lepisosteus.oculatus 20 | NC_023182.1 17921436 17929757 102692397_Lepisosteus.oculatus 21 | NC_023182.1 18154380 18210945 102698540_Lepisosteus.oculatus 22 | NC_023182.1 18914945 18919377 102694003_Lepisosteus.oculatus 23 | NC_023182.1 43811038 43824210 102685185_Lepisosteus.oculatus 24 | NC_023182.1 66622408 66641434 102686328_Lepisosteus.oculatus 25 | NC_023183.1 7063722 7069880 102694349_Lepisosteus.oculatus 26 | NC_023183.1 29387572 29389417 102698205_Lepisosteus.oculatus 27 | NC_023184.1 34127386 34129940 102692818_Lepisosteus.oculatus 28 | NC_023184.1 38389374 38401505 102686676_Lepisosteus.oculatus 29 | NC_023185.1 308352 390118 102684602_Lepisosteus.oculatus 30 | NC_023185.1 10139153 10201468 102694611_Lepisosteus.oculatus 31 | NC_023186.1 20708713 20778713 102688455_Lepisosteus.oculatus 32 | NC_023186.1 27730696 27754659 102699089_Lepisosteus.oculatus 33 | NC_023186.1 40432181 40435110 102695013_Lepisosteus.oculatus 34 | NC_023186.1 45703326 45729227 102694617_Lepisosteus.oculatus 35 | NC_023187.1 1484571 1499102 102683456_Lepisosteus.oculatus 36 | NC_023187.1 31228938 31385577 102690009_Lepisosteus.oculatus 37 | NC_023187.1 33989437 34008402 102686424_Lepisosteus.oculatus 38 | NC_023187.1 36413433 36536126 102690339_Lepisosteus.oculatus 39 | NC_023187.1 36565686 36590668 107078266_Lepisosteus.oculatus 40 | NC_023187.1 36634556 36671104 102695539_Lepisosteus.oculatus 41 | NC_023187.1 47757044 47760692 102687362_Lepisosteus.oculatus 42 | NC_023189.1 3661347 3678492 102686627_Lepisosteus.oculatus 43 | NC_023189.1 3680622 3690917 102683944_Lepisosteus.oculatus 44 | NC_023189.1 3692040 3706197 102684148_Lepisosteus.oculatus 45 | NC_023189.1 3767632 3841420 102684741_Lepisosteus.oculatus 46 | NC_023189.1 13073239 13164607 102686771_Lepisosteus.oculatus 47 | NC_023189.1 17950012 17955070 102695475_Lepisosteus.oculatus 48 | NC_023189.1 19663558 19671368 102696266_Lepisosteus.oculatus 49 | NC_023189.1 19867056 19978721 102690888_Lepisosteus.oculatus 50 | 
NC_023189.1 20427953 20433214 102691292_Lepisosteus.oculatus 51 | NC_023189.1 31081840 31085419 102690222_Lepisosteus.oculatus 52 | NC_023190.1 4542861 4594114 102689881_Lepisosteus.oculatus 53 | NC_023190.1 7092044 7098892 102696857_Lepisosteus.oculatus 54 | NC_023190.1 9780864 9786849 102693104_Lepisosteus.oculatus 55 | NC_023190.1 13480038 13488978 102698360_Lepisosteus.oculatus 56 | NC_023190.1 13786243 13911225 102683400_Lepisosteus.oculatus 57 | NC_023190.1 14276261 14279724 102684022_Lepisosteus.oculatus 58 | NC_023190.1 14351641 14378020 102682655_Lepisosteus.oculatus 59 | NC_023190.1 18382023 18406041 102693765_Lepisosteus.oculatus 60 | NC_023190.1 33078388 33087634 102686483_Lepisosteus.oculatus 61 | NC_023191.1 13346160 13374242 102687502_Lepisosteus.oculatus 62 | NC_023191.1 15525792 15526318 102692704_Lepisosteus.oculatus 63 | NC_023192.1 9180949 9249953 102697448_Lepisosteus.oculatus 64 | NC_023193.1 5073372 5096139 102695880_Lepisosteus.oculatus 65 | NC_023193.1 5115420 5126713 102685073_Lepisosteus.oculatus 66 | NC_023193.1 5162280 5340665 102685269_Lepisosteus.oculatus 67 | NC_023193.1 5295190 5303523 107079293_Lepisosteus.oculatus 68 | NC_023193.1 5356544 5361634 102696279_Lepisosteus.oculatus 69 | NC_023193.1 5362908 5379121 102696472_Lepisosteus.oculatus 70 | NC_023193.1 5381761 5389851 102685469_Lepisosteus.oculatus 71 | NC_023193.1 5391006 5395911 102685682_Lepisosteus.oculatus 72 | NC_023193.1 5401093 5435931 102685883_Lepisosteus.oculatus 73 | NC_023193.1 5438099 5443753 102696668_Lepisosteus.oculatus 74 | NC_023193.1 5445956 5451804 102686083_Lepisosteus.oculatus 75 | NC_023193.1 5452934 5464686 102686286_Lepisosteus.oculatus 76 | NC_023193.1 5465463 5475884 102686488_Lepisosteus.oculatus 77 | NC_023193.1 5478358 5488441 102686693_Lepisosteus.oculatus 78 | NC_023193.1 5526182 5532545 102687097_Lepisosteus.oculatus 79 | NC_023193.1 5534029 5536258 102687297_Lepisosteus.oculatus 80 | NC_023193.1 5540630 5543503 102696867_Lepisosteus.oculatus 
81 | NC_023193.1 5543498 5556171 102697058_Lepisosteus.oculatus 82 | NC_023193.1 5557284 5599203 102687506_Lepisosteus.oculatus 83 | NC_023193.1 5579161 5606265 102687712_Lepisosteus.oculatus 84 | NC_023193.1 5615182 5621012 102697250_Lepisosteus.oculatus 85 | NC_023193.1 5670400 5699514 102687912_Lepisosteus.oculatus 86 | NC_023193.1 5700713 5716858 102688118_Lepisosteus.oculatus 87 | NC_023193.1 5724367 5739931 102688324_Lepisosteus.oculatus 88 | NC_023193.1 5755549 5789017 102688531_Lepisosteus.oculatus 89 | NC_023193.1 5811706 5819695 102688733_Lepisosteus.oculatus 90 | NC_023193.1 5821151 5831330 107075355_Lepisosteus.oculatus 91 | NC_023193.1 5836005 5841458 102697450_Lepisosteus.oculatus 92 | NC_023193.1 5847480 5897952 102697641_Lepisosteus.oculatus 93 | NC_023195.1 13446911 13476496 102682256_Lepisosteus.oculatus 94 | NC_023195.1 24152704 24162229 102683408_Lepisosteus.oculatus 95 | NC_023196.1 13961021 13974966 102694573_Lepisosteus.oculatus 96 | NC_023199.1 13433712 13439251 102690842_Lepisosteus.oculatus 97 | NC_023201.1 5480514 5494039 102692244_Lepisosteus.oculatus 98 | NC_023201.1 12770099 12777415 102697978_Lepisosteus.oculatus 99 | NC_023203.1 1579609 1603015 102685978_Lepisosteus.oculatus 100 | NC_023203.1 11490498 11505337 102688623_Lepisosteus.oculatus 101 | NC_023204.1 5055242 5056941 102682678_Lepisosteus.oculatus 102 | NC_023204.1 12637418 12650591 102689642_Lepisosteus.oculatus 103 | NW_006270225.1 60371 63795 102697468_Lepisosteus.oculatus 104 | -------------------------------------------------------------------------------- /data/example/genes/genes.Oryzias.latipes.bed: -------------------------------------------------------------------------------- 1 | NC_019859.2 14944162 14966540 101158169_Oryzias.latipes 2 | NC_019859.2 27133833 27135192 101169902_Oryzias.latipes 3 | NC_019860.2 24184750 24190946 101163462_Oryzias.latipes 4 | NC_019860.2 24212953 24225387 100192310_Oryzias.latipes 5 | NC_019861.2 24629112 24635991 
101168357_Oryzias.latipes 6 | NC_019862.2 8680476 8733985 101170663_Oryzias.latipes 7 | NC_019862.2 17993351 17994458 101159253_Oryzias.latipes 8 | NC_019863.2 3990929 4000677 101172649_Oryzias.latipes 9 | NC_019863.2 4198641 4242713 101166104_Oryzias.latipes 10 | NC_019863.2 4252198 4255124 101167928_Oryzias.latipes 11 | NC_019863.2 10135169 10159675 101161796_Oryzias.latipes 12 | NC_019863.2 13831713 13842767 101174498_Oryzias.latipes 13 | NC_019863.2 24003311 24062016 101174585_Oryzias.latipes 14 | NC_019863.2 24452936 24468190 101172073_Oryzias.latipes 15 | NC_019863.2 27092473 27096591 101161880_Oryzias.latipes 16 | NC_019863.2 29857375 29860405 111947469_Oryzias.latipes 17 | NC_019864.2 18218816 18230300 100125461_Oryzias.latipes 18 | NC_019864.2 21032113 21034517 101162045_Oryzias.latipes 19 | NC_019864.2 27666587 27676699 101169200_Oryzias.latipes 20 | NC_019864.2 28111294 28118047 101157774_Oryzias.latipes 21 | NC_019865.2 7550351 7554560 101165142_Oryzias.latipes 22 | NC_019865.2 11585706 11602248 101164307_Oryzias.latipes 23 | NC_019865.2 12009473 12012305 101168209_Oryzias.latipes 24 | NC_019865.2 12023811 12091167 101170514_Oryzias.latipes 25 | NC_019865.2 19114819 19147021 101157698_Oryzias.latipes 26 | NC_019865.2 33769026 33784046 101166132_Oryzias.latipes 27 | NC_019866.2 5367260 5400938 101157217_Oryzias.latipes 28 | NC_019866.2 5407871 5410178 101157455_Oryzias.latipes 29 | NC_019866.2 5413824 5426359 100125462_Oryzias.latipes 30 | NC_019866.2 5748689 5750078 101159505_Oryzias.latipes 31 | NC_019866.2 5755817 5758604 101159745_Oryzias.latipes 32 | NC_019866.2 5759631 5767882 101168713_Oryzias.latipes 33 | NC_019866.2 5776683 5795965 101168957_Oryzias.latipes 34 | NC_019866.2 5805415 5811013 101159992_Oryzias.latipes 35 | NC_019866.2 5833230 5846518 101169207_Oryzias.latipes 36 | NC_019866.2 5847037 5863870 101160411_Oryzias.latipes 37 | NC_019866.2 5866710 5893180 101169458_Oryzias.latipes 38 | NC_019866.2 5907024 5908964 
100049510_Oryzias.latipes 39 | NC_019866.2 5933825 5940228 101160814_Oryzias.latipes 40 | NC_019866.2 5941116 5948127 101169948_Oryzias.latipes 41 | NC_019866.2 5948216 5951883 101161059_Oryzias.latipes 42 | NC_019866.2 5952685 5957373 101161313_Oryzias.latipes 43 | NC_019866.2 5957973 5977779 101170190_Oryzias.latipes 44 | NC_019866.2 5978856 5982860 101161559_Oryzias.latipes 45 | NC_019866.2 5984629 5988556 101170435_Oryzias.latipes 46 | NC_019866.2 5988627 5996132 101161810_Oryzias.latipes 47 | NC_019866.2 5996123 5998750 101162212_Oryzias.latipes 48 | NC_019866.2 6002786 6126560 101170682_Oryzias.latipes 49 | NC_019866.2 6143648 6147917 101162602_Oryzias.latipes 50 | NC_019866.2 6161175 6172007 101162840_Oryzias.latipes 51 | NC_019866.2 22683676 22687679 110015526_Oryzias.latipes 52 | NC_019866.2 26051810 26079117 101156636_Oryzias.latipes 53 | NC_019867.2 1923928 1930038 100529163_Oryzias.latipes 54 | NC_019867.2 7791084 8016161 101163825_Oryzias.latipes 55 | NC_019867.2 13788525 13794142 101155202_Oryzias.latipes 56 | NC_019867.2 18679999 18681561 100125473_Oryzias.latipes 57 | NC_019867.2 29758389 29761397 111947936_Oryzias.latipes 58 | NC_019868.2 3393056 3394299 101164039_Oryzias.latipes 59 | NC_019868.2 4536218 4543528 101168275_Oryzias.latipes 60 | NC_019869.2 11110881 11113061 101158866_Oryzias.latipes 61 | NC_019869.2 22137159 22155213 101157070_Oryzias.latipes 62 | NC_019869.2 22174196 22180531 101157712_Oryzias.latipes 63 | NC_019869.2 22181188 22192874 101157467_Oryzias.latipes 64 | NC_019869.2 22423319 22439314 101159450_Oryzias.latipes 65 | NC_019869.2 22466154 22476287 101159835_Oryzias.latipes 66 | NC_019870.2 8748021 8761174 100049383_Oryzias.latipes 67 | NC_019870.2 28954238 28975189 100529162_Oryzias.latipes 68 | NC_019870.2 30295466 30300543 101158180_Oryzias.latipes 69 | NC_019871.2 14269572 14272184 111948470_Oryzias.latipes 70 | NC_019872.2 24590296 24598098 101167315_Oryzias.latipes 71 | NC_019873.2 1587845 1590647 
100049478_Oryzias.latipes 72 | NC_019873.2 29953803 29954804 110016562_Oryzias.latipes 73 | NC_019874.2 16102950 16104628 101156431_Oryzias.latipes 74 | NC_019874.2 20527848 20534860 101171860_Oryzias.latipes 75 | NC_019874.2 25924781 25939761 101160939_Oryzias.latipes 76 | NC_019874.2 29902468 30015024 101167149_Oryzias.latipes 77 | NC_019874.2 30049829 30058066 101157247_Oryzias.latipes 78 | NC_019875.2 413039 415307 111949054_Oryzias.latipes 79 | NC_019875.2 4812074 4825777 101173252_Oryzias.latipes 80 | NC_019875.2 8566044 8579230 101165840_Oryzias.latipes 81 | NC_019875.2 29769824 29793061 101163277_Oryzias.latipes 82 | NC_019876.2 17560679 17567749 101168077_Oryzias.latipes 83 | NC_019877.2 4806888 4859136 101167570_Oryzias.latipes 84 | NC_019877.2 4861209 4870100 101160690_Oryzias.latipes 85 | NC_019877.2 4871382 4889715 101160440_Oryzias.latipes 86 | NC_019877.2 4894378 4898227 101160193_Oryzias.latipes 87 | NC_019877.2 4899357 4905498 101159944_Oryzias.latipes 88 | NC_019877.2 4905159 4911852 101159701_Oryzias.latipes 89 | NC_019877.2 4911920 4918374 101167319_Oryzias.latipes 90 | NC_019877.2 7428051 7433726 101169071_Oryzias.latipes 91 | NC_019877.2 9604579 9606577 101170879_Oryzias.latipes 92 | NC_019877.2 9616696 9636138 101171286_Oryzias.latipes 93 | NC_019877.2 9647276 9670935 101168992_Oryzias.latipes 94 | NC_019877.2 9690517 9693738 101171533_Oryzias.latipes 95 | NC_019877.2 9697833 9702483 101171783_Oryzias.latipes 96 | NC_019877.2 9703096 9704878 101172030_Oryzias.latipes 97 | NC_019877.2 10816723 10819514 101171949_Oryzias.latipes 98 | NC_019878.2 17489649 17492971 101160854_Oryzias.latipes 99 | NC_019878.2 20089182 20113185 101161015_Oryzias.latipes 100 | NC_019878.2 22485096 22511409 101166834_Oryzias.latipes 101 | NC_019878.2 22513024 22518676 101168902_Oryzias.latipes 102 | NC_019878.2 22525412 22533984 101169150_Oryzias.latipes 103 | NC_019879.2 4346613 4382742 101158981_Oryzias.latipes 104 | NC_019879.2 6620859 6685788 
101165358_Oryzias.latipes 105 | NC_019879.2 6762032 6766160 101157169_Oryzias.latipes 106 | NC_019879.2 16563716 16567992 101161017_Oryzias.latipes 107 | NC_019879.2 21087601 21100578 101175445_Oryzias.latipes 108 | NC_019879.2 23291754 23304048 101166264_Oryzias.latipes 109 | NC_019879.2 25976636 25984035 101162882_Oryzias.latipes 110 | NC_019879.2 30611562 30615602 101156879_Oryzias.latipes 111 | NC_019880.2 4283050 4317127 101164359_Oryzias.latipes 112 | NC_019880.2 9530524 9564994 101171121_Oryzias.latipes 113 | NC_019880.2 9980748 10011144 101173350_Oryzias.latipes 114 | NC_019881.2 14011260 14015174 101161934_Oryzias.latipes 115 | NC_019881.2 23076823 23084945 101158319_Oryzias.latipes 116 | -------------------------------------------------------------------------------- /data/example/lowcov: -------------------------------------------------------------------------------- 1 | Xiphophorus.maculatus 2 | -------------------------------------------------------------------------------- /data/example/species_tree.nwk: -------------------------------------------------------------------------------- 1 | (((Amia.calva,Lepisosteus.oculatus)Holostei,(((Esox.lucius,(Salmo.salar,(Salvelinus.alpinus,Oncorhynchus.mykiss)Name3)Salmonidae)Protacanthopterygii,((((Xiphophorus.maculatus,Poecilia.formosa)Poeciliinae,Oryzias.latipes)Atherinomorphae,Oreochromis.niloticus)Ovalentaria,(Gasterosteus.aculeatus,(Takifugu.rubripes,Tetraodon.nigroviridis)Tetraodontidae)Name15)Name17)Euteleosteomorpha,(Astyanax.mexicanus,Danio.rerio)Otophysi)Clupeocephala)Neopterygii,((Homo.sapiens,Mus.musculus)Euarchontoglires,(Columba.livia,Gallus.gallus)Name1)Amniota)Bilateria; -------------------------------------------------------------------------------- /data/example2/ali.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/data/example2/ali.fa.gz 
-------------------------------------------------------------------------------- /data/example2/species_tree.nwk: -------------------------------------------------------------------------------- 1 | (Ciona.intestinalis,(((((Monodelphis.domestica,(Homo.sapiens,Mus.musculus)Euarchontoglires)Theria,Gallus.gallus)Amniota,Xenopus.tropicalis)Tetrapoda,Latimeria.chalumnae)Sarcopterygii,((Amia.calva,Lepisosteus.oculatus)Holostei,((Paramormyrops.kingsleyae,(Scleropages.formosus,Arapaima.gigas)Osteoglossidae)Osteoglossiformes,(Danio.rerio,(Oryzias.latipes,Gasterosteus.aculeatus)Ancestor9)Clupeocephala)Osteoglossocephalai)Neopterygii)Euteleostomi)Olfactores; -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/_static/Metropolis-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/_static/Metropolis-Medium.otf -------------------------------------------------------------------------------- /doc/_static/style.css: -------------------------------------------------------------------------------- 1 | .wy-body-for-nav { 2 | background: #404040; 3 | } 4 | 5 | .wy-side-nav-search .wy-dropdown > a { 6 | color: #404040; 7 | } 8 | 9 | .wy-side-nav-search { 10 | background-color: #57F1B1; 11 | } 12 | 13 | 14 | .wy-side-nav-search input[type="text"] { 15 | border-color: #57F1B1; 16 | } 17 | 18 | 19 | .wy-menu-vertical header, .wy-menu-vertical p.caption { 20 | color: #57F1B1; 21 | } 22 | 23 | .wy-side-nav-search > a, .wy-side-nav-search .wy-dropdown > a { 24 | color: #404040 25 | } 26 | 27 | 28 | .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal { 29 | color: #210A72; 30 | } 31 | 32 | @font-face { 33 | font-family: "Metropolis"; 34 | src: url("./Metropolis-Medium.otf"); 35 | } 36 | 37 | h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend { 38 | font-family: "Metropolis","ff-tisa-web-pro","Georgia",Arial,sans-serif; 39 | margin-top: 5mm; 40 | } 41 | 42 | .wy-nav-top a { 43 | color: #404040; 44 | } 45 | 46 | .wy-nav-top i { 47 | color: #404040; 48 | } 49 | 50 | .wy-nav-top { 51 | background: #57F1B1; 52 | } 53 | 54 | .wy-menu-vertical a:active { 55 | background-color:#66CCAA; 56 | cursor:pointer; 57 | color:#fff 58 | } 59 | 60 | .wy-side-nav-search > div.version { 61 | color: #404040; 62 | } 63 | -------------------------------------------------------------------------------- /doc/conf.py: 
-------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'SCORPiOs' 21 | copyright = '2020, Elise Parey & Camille Berthelot' 22 | author = 'Elise Parey & Camille Berthelot' 23 | 24 | release = '' 25 | version = '' 26 | 27 | master_doc = 'index' 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ["sphinx_rtd_theme", "sphinx-prompt", "sphinx.ext.autosectionlabel", 35 | "sphinx.ext.autodoc", 'sphinx.ext.napoleon'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | html_static_path = ["_static"] 40 | 41 | # List of patterns, relative to source directory, that match files and 42 | # directories to ignore when looking for source files. 43 | # This pattern also affects html_static_path and html_extra_path. 
44 | exclude_patterns = [] 45 | 46 | autodoc_mock_imports = ["matplotlib", 'seaborn', 'roman', "networkx", "svgutils", 47 | "ete3", "sklearn", "numpy", "pandas", "scipy", 48 | "statsmodels"] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | import sphinx_rtd_theme 54 | 55 | 56 | # The theme to use for HTML and HTML Help pages. See the documentation for 57 | # a list of builtin themes. 58 | # 59 | # html_theme = 'sphinx_pdj_theme' 60 | html_theme = "sphinx_rtd_theme" 61 | 62 | 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | def setup(app): 68 | app.add_css_file('style.css') 69 | -------------------------------------------------------------------------------- /doc/getting_started_installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | SCORPiOs is implemented as a `Snakemake `_ pipeline. Snakemake is a python-based language to build scalable and reproducible workflows. We take advantage of Snakemake's integration with the package manager `Conda `_ to ship all SCORPiOs dependencies. The following instructions will help you get a running copy of the pipeline and set up your environnement. 6 | 7 | Installing conda 8 | ================= 9 | 10 | The Conda package management system manages all SCORPiOs dependencies, including python packages and other software. 11 | 12 | To install Conda: 13 | 14 | * Download Miniconda3 installer for your system `here `_ 15 | 16 | .. |br| raw:: html 17 | 18 |
19 | 20 | * Run the installation script: :code:`bash Miniconda3-latest-Linux-x86_64.sh` or :code:`bash Miniconda3-latest-MacOSX-x86_64.sh`, and accept the defaults 21 | 22 | .. |br| raw:: html 23 | 24 |
25 | 26 | * Open a new terminal, run :code:`conda update conda` and press :code:`y` to confirm updates 27 | 28 | 29 | Installing SCORPiOs 30 | ==================== 31 | 32 | * Clone the repository: 33 | 34 | .. prompt:: bash 35 | 36 | git clone https://github.com/DyogenIBENS/SCORPIOS.git 37 | 38 | * Go to SCORPiOs root folder: 39 | 40 | .. prompt:: bash 41 | 42 | cd SCORPIOS 43 | 44 | * Create the main conda environment. We recommend using `Mamba `_ for a faster installation: 45 | 46 | .. prompt:: bash 47 | 48 | conda install -c conda-forge mamba 49 | mamba env create -f envs/scorpios.yaml 50 | 51 | * **Alternatively,** you can use conda directly: 52 | 53 | .. prompt:: bash 54 | 55 | conda env create -f envs/scorpios.yaml 56 | 57 | .. note:: Once the conda environment is successfully created, the installation process is complete. You can proceed to the next section and test your installation on example data. Before running SCORPiOs, remember to activate the conda environment with :code:`conda activate scorpios`. 58 | 59 | Updating SCORPiOs conda environment 60 | ==================================== 61 | 62 | * As of SCORPiOs v2.0.0, the conda environment was updated and needs to be reinstalled for users who have a previous version: 63 | 64 | .. prompt:: bash 65 | 66 | conda env remove --name scorpios 67 | mamba env create -f envs/scorpios.yaml 68 | 69 | Reference 70 | ========== 71 | 72 | - `Snakemake: `_ Köster and Rahmann (2012) Snakemake - A scalable bioinformatics workflow engine. Bioinformatics, 28, 2520–2522. 73 | -------------------------------------------------------------------------------- /doc/getting_started_usage.rst: -------------------------------------------------------------------------------- 1 | Usage instructions 2 | ================== 3 | 4 | .. important:: 5 | Before using SCORPiOs, you should go to the SCORPiOs root folder and activate the conda environment with the command :code:`conda activate scorpios`.
6 | 7 | Running SCORPiOs on example data 8 | -------------------------------- 9 | 10 | We recommend running a test with our example data to ensure that installation was successful and to get familiar with the pipeline, inputs and outputs. 11 | 12 | SCORPiOs uses a YAML configuration file to specify inputs and parameters for each run. An example configuration file is provided: `config_example.yaml `_. This configuration file executes SCORPiOs on toy example `data `_, that you can use as reference for input formats. We explain how to format your own configuration file and input files in more details in the next chapter (see :ref:`Data file formats` and :ref:`Configuration file`). 13 | 14 | Here, we present the main commands to run SCORPiOs. 15 | 16 | Example 1: Simple SCORPiOs run 17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 18 | 19 | The only required Snakemake arguments to run SCORPiOs are :code:`--configfile`, the :code:`--use-conda` flag and the :code:`--scheduler=greedy` option. You also need to specify the number of threads via :code:`--cores`. For more advanced options, you can look at the `Snakemake documentation `_. 20 | 21 | To run SCORPiOs on example data, go to the SCORPiOs root folder and run: 22 | 23 | .. prompt:: bash 24 | 25 | snakemake --configfile config_example.yaml --use-conda --cores 4 --scheduler=greedy 26 | 27 | The following output should be generated: :code:`SCORPiOs_example/SCORPiOs_output_0.nhx`. 28 | 29 | To separate stdout and stderr (recommended, as SCORPiOs writes statistics on key steps of the workflow to the standard output): 30 | 31 | .. prompt:: bash 32 | 33 | snakemake --configfile config_example.yaml --use-conda --cores 4 --scheduler=greedy >out 2>err 34 | 35 | Example 2: Iterative SCORPiOs run 36 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 37 | 38 | SCORPiOs can run in iterative mode: SCORPiOs improves the gene trees a first time, and then uses the corrected set of gene trees again as input for a new correction run, until convergence. 
Correcting gene trees improves orthologies accuracy, which in turn makes synteny conservation patterns more informative, improving the gene tree reconstructions after successive runs. Usually, a small number of iterations (2-3) suffice to reach convergence. 39 | 40 | To run SCORPiOs in iterative mode on example data, execute the wrapper bash script :code:`iterate_scorpios.sh` as follows: 41 | 42 | .. prompt:: bash 43 | 44 | bash iterate_scorpios.sh --snake_args="--configfile config_example.yaml --cores 4 --scheduler=greedy" > out 2>err 45 | 46 | 47 | The following output should be generated: :code:`SCORPiOs_example/SCORPiOs_output_2_with_tags.nhx`. 48 | 49 | Command-line arguments for :code:`iterate_scorpios.sh` 50 | """""""""""""""""""""""""""""""""""""""""""""""""""""" 51 | 52 | **Required:** 53 | 54 | --snake_args=snakemake_arguments Snakemake arguments, should at minimum contain :code:`--configfile`, :code:`--cores` and :code:`--scheduler=greedy`. 55 | 56 | **Optional:** 57 | 58 | --max_iter=maxiter Maximum number of iterations to run (default=5). 59 | 60 | --min_corr=mincorr Minimum number of corrected subtrees to continue to the next iteration (default=1). 61 | 62 | --starting_iter=iter Starting iteration, to resume a run at a given iteration (default=1). 63 | 64 | 65 | Running SCORPiOs on your data 66 | ----------------------------- 67 | 68 | To run SCORPiOs on your data, you have to create a new configuration file for your SCORPiOs run. You will need to format your input data adequately and write your configuration file, using the provided example `config_example.yaml `_ as a guide. 69 | 70 | * Copy the example config file :code:`cp config_example.yaml config.yaml` 71 | * Open and edit :code:`config.yaml` to specify paths, files and parameters for your data 72 | 73 | To check your configuration, you can execute a dry-run with :code:`-n`. 74 | 75 | .. 
prompt:: bash 76 | 77 | snakemake --configfile config.yaml --use-conda -n 78 | 79 | Finally, you can run SCORPiOs as described above: 80 | 81 | .. prompt:: bash 82 | 83 | snakemake --configfile config.yaml --use-conda --cores 4 --scheduler=greedy 84 | 85 | or in iterative mode: 86 | 87 | .. prompt:: bash 88 | 89 | bash iterate_scorpios.sh --snake_args="--configfile config.yaml --cores 4 --scheduler=greedy" 90 | -------------------------------------------------------------------------------- /doc/img/basic_sptree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/img/basic_sptree.png -------------------------------------------------------------------------------- /doc/img/diagnostic_by_homeo_bowfin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/img/diagnostic_by_homeo_bowfin.png -------------------------------------------------------------------------------- /doc/img/diagnostic_on_medaka.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/img/diagnostic_on_medaka.png -------------------------------------------------------------------------------- /doc/img/example_cor_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/img/example_cor_27.png -------------------------------------------------------------------------------- /doc/img/example_ori_27.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/img/example_ori_27.png -------------------------------------------------------------------------------- /doc/img/lore_on_medaka.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/img/lore_on_medaka.png -------------------------------------------------------------------------------- /doc/img/scorpios_illustrated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/img/scorpios_illustrated.png -------------------------------------------------------------------------------- /doc/img/sptree_lore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/doc/img/sptree_lore.png -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. SCORPiOs documentation master file, created by 2 | sphinx-quickstart on Mon May 11 13:13:57 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to SCORPiOs documentation! 7 | ================================== 8 | 9 | 10 | .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3727519.svg 11 | :target: https://doi.org/10.5281/zenodo.3727519 12 | 13 | .. image:: https://img.shields.io/badge/License-GPLv3-blue.svg 14 | :target: https://www.gnu.org/licenses/gpl-3.0 15 | 16 | .. image:: https://img.shields.io/badge/snakemake-≥6.6.1-brightgreen.svg 17 | :target: https://snakemake.bitbucket.io 18 | 19 | .. 
image:: https://readthedocs.org/projects/scorpios/badge/?version=latest 20 | :target: https://scorpios.readthedocs.io/en/latest/?badge=latest 21 | 22 | SCORPiOs is a **synteny-guided gene tree correction pipeline** for clades that have undergone a whole-genome duplication event. SCORPiOs identifies gene trees where the whole-genome duplication is **missing** or **incorrectly placed**, based on the genomic locations of the duplicated genes across the different species. SCORPiOs then builds an **optimized gene tree** consistent with the known WGD event, the species tree, local synteny context, as well as gene sequence evolution. 23 | 24 | 25 | SCORPiOs is implemented as a `Snakemake `_ pipeline. SCORPiOs takes as input either gene trees or multiple alignments, and outputs the corresponding optimized gene trees. 26 | 27 | For more information, you can take a look at `SCORPiOs publication `_. 28 | 29 | .. image:: https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/master/doc/img/scorpios_illustrated.png 30 | 31 | 32 | References 33 | ---------- 34 | 35 | SCORPiOs uses the following tools to build and test gene trees: 36 | 37 | * `ProfileNJ `_: Noutahi et al. (2016) Efficient Gene Tree Correction Guided by Genome Evolution. PLOS ONE, 11, e0159559. 38 | 39 | 40 | * `RAxML `_: Stamatakis (2014) RAxML version 8: a tool for phylogenetic analysis and post-analysis of large phylogenies. Bioinformatics, 30, 1312–1313. 41 | 42 | 43 | * `PhyML `_: Guindon et al. (2010) New Algorithms and Methods to Estimate Maximum-Likelihood Phylogenies: Assessing the Performance of PhyML 3.0. Syst Biol, 59, 307–321. 44 | 45 | 46 | * `TreeBeST `_: Vilella et al. (2009) EnsemblCompara GeneTrees: Complete, duplication-aware phylogenetic trees in vertebrates. Genome Res., 19, 327–335. 47 | 48 | 49 | * `CONSEL `_: Shimodaira and Hasegawa (2001) CONSEL: for assessing the confidence of phylogenetic tree selection. Bioinformatics, 17, 1246–1247. 50 | 51 | 52 | .. 
toctree:: 53 | :caption: Quick start 54 | :name: quick_start 55 | :hidden: 56 | :maxdepth: 1 57 | 58 | getting_started_installation.rst 59 | getting_started_usage.rst 60 | 61 | 62 | .. toctree:: 63 | :caption: Input Data Preparation 64 | :name: input_data_preparation 65 | :hidden: 66 | :maxdepth: 1 67 | 68 | input_description.rst 69 | input_formatting.rst 70 | input_your_configuration_file.rst 71 | input_building_a_dataset.rst 72 | 73 | 74 | .. toctree:: 75 | :caption: Outputs Description 76 | :name: outputs_description 77 | :hidden: 78 | :maxdepth: 1 79 | 80 | output_genetrees.rst 81 | output_treeviz.rst 82 | output_advanced.rst 83 | 84 | 85 | .. toctree:: 86 | :caption: LORelEi (LORe Extension) 87 | :name: lorelei 88 | :hidden: 89 | :maxdepth: 1 90 | 91 | lorelei_introduction.rst 92 | lorelei_usage.rst 93 | lorelei_configuration_file.rst 94 | 95 | .. toctree:: 96 | :caption: Project Information 97 | :name: project_information 98 | :hidden: 99 | :maxdepth: 1 100 | 101 | scripts.rst 102 | project_changelog.rst 103 | project_info.rst 104 | -------------------------------------------------------------------------------- /doc/input_building_a_dataset.rst: -------------------------------------------------------------------------------- 1 | Building a dataset 2 | =================== 3 | 4 | If you do not have gene alignments available for your study species, you will need to start by building an input dataset for SCORPiOs. This implies that you will need to: 5 | 6 | 1. List the duplicated study species that you want to include in your dataset, while taking care to also select adequate non-duplicated outgroups. You can then generate the corresponding species phylogeny. 7 | 8 | .. |br| raw:: html 9 | 10 |
11 | 12 | 2. Extract all genes along with their CDS nucleotide sequences, for all your study species. 13 | 14 | .. |br| raw:: html 15 | 16 |
17 | 18 | 3. Group genes from all species into homologous gene families, i.e genes that descend from a single ancestral gene. 19 | 20 | .. |br| raw:: html 21 | 22 |
23 | 24 | 4. Build multiple sequence alignments for all homologous gene families. The most appropriate way to build gene multiple sequence alignments requires translating the CDS regions of genes into their protein sequences, aligning the protein sequences, then back-translating to the nucleotide sequences. 25 | 26 | .. |br| raw:: html 27 | 28 |
29 | 30 | 5. Filter multiple sequence alignments for poorly aligned or degenerated regions. 31 | 32 | .. |br| raw:: html 33 | 34 |
35 | 36 | 6. Optionally, build a gene tree for each family. Alternatively, SCORPiOs can build starting trees for you. In this case, you would need to write a file giving the gene to species correspondence. 37 | 38 | 39 | We've listed above the main steps required to build an input dataset for SCORPiOs. However, assembling genome-wide phylogenetic datasets is a complex process, frequently refined by all leading comparative genomic databases. As a state-of-the-art reference, we recommend the `GeneSeqToFamily paper `_, which describes how Ensembl Compara builds gene families, alignment and gene trees for a given set of species. In addition, the authors developed a Galaxy workflow `GeneSeqToFamily `_, to interactively run each step of the pipeline. 40 | 41 | Other well-curated pipelines such as `OrthoFinder `_ are appropriate to build a dataset that can then be used with SCORPiOs. 42 | 43 | Reference 44 | --------- 45 | 46 | - `GeneSeqToFamily `_: Thanki et al. (2018) GeneSeqToFamily: a Galaxy workflow to find gene families based on the Ensembl Compara GeneTrees pipeline. GigaScience 7. 47 | 48 | - `OrthoFinder `_: Emms and Kelly (2019) OrthoFinder: phylogenetic orthology inference for comparative genomics. Genome Biology 20. 49 | -------------------------------------------------------------------------------- /doc/input_description.rst: -------------------------------------------------------------------------------- 1 | Data preparation 2 | ================ 3 | 4 | 5 | SCORPiOs is a flexible gene tree correction pipeline: it can either start from a set of precomputed, phylogeny-reconciled gene trees, or build one from a set of gene multiple alignments using `TreeBeST `_. 6 | 7 | If you do not have gene trees or gene alignments readily available for your study species, please refer to the :ref:`Building a dataset` section. 8 | 9 | .. warning:: 10 | Because SCORPiOs leverages local synteny similarity, i.e. evolution of neighboring genes, it requires **genome-wide data**.
11 | 12 | Input files 13 | ----------- 14 | 15 | SCORPiOs requires four input files, which are: 16 | 17 | 1. A set of phylogeny-reconciled gene trees as a single file in NHX format (extended Newick format, see `example `_ and :ref:`description`). 18 | 19 | **OR** 20 | 21 | 1. (bis) A genes-to-species mapping file, if starting the process from gene alignments only (see `example `_ and :ref:`description`). 22 | 23 | .. |br| raw:: html 24 | 25 |
26 | 27 | 2. The gene multiple alignments corresponding to the gene trees, as a single file in FASTA format (can be compressed with gzip, see `example `_ and :ref:`description`). 28 | 29 | .. |br| raw:: html 30 | 31 |
32 | 33 | 3. Gene coordinates files for each species in BED format (see `example `_) or dyogen format (see `example `_). See also :ref:`description`. 34 | 35 | .. |br| raw:: html 36 | 37 |
38 | 39 | 4. A species tree in NEWICK format, with names of ancestral species indicated at internal nodes (see `example `_ and :ref:`description`). 40 | 41 | 42 | For a detailed description of expected formats please refer to the :ref:`Data file formats` section. 43 | 44 | .. note:: 45 | If starting from gene trees, SCORPiOS uses the NHX :code:`S` (species name) tag to build the gene-species mapping. Otherwise, it uses the gene-to-species mapping file. 46 | 47 | 48 | SCORPiOs parameters 49 | -------------------- 50 | 51 | All parameters for a SCORPiOs run have to be indicated in the YAML configuration file, as shown in `config_example.yaml `_. 52 | 53 | A critical parameter is the position(s) of WGD(s) in the species tree and the species to use as outgroup(s). They both have to be specified together using the :code:`WGDs` keyword. The WGD position should be indicated with the name of the last common ancestor of all duplicated species. 54 | 55 | For instance, consider the species tree below: 56 | 57 | :: 58 | 59 | (spotted_gar, (zebrafish, (medaka, (tetraodon, fugu)Tetraodontidae)Euteleosteomorpha)Clupeocephala)Neopterygii; 60 | 61 | .. image:: https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/master/doc/img/basic_sptree.png 62 | 63 | The fish WGD occurred in the branch leading to the "Clupeocephala" ancestor, and we wish to use the spotted_gar as outgroup. This should be specified in the configuration file as: 64 | 65 | .. code-block:: yaml 66 | 67 | WGDs: 68 | Clupeocephala: spotted_gar 69 | 70 | For a detailed description of all parameters available in SCORPiOs please refer to the :ref:`Configuration file` section. 71 | 72 | 73 | Complex configurations 74 | ---------------------- 75 | 76 | SCORPiOs can correct gene trees that contain **more than one whole-genome duplication event**. 77 | 78 | In this case, each WGD is treated independently, starting from the more recent one (closer to the leaves) going up towards the more ancient one (closer to the root). 
If the WGDs are nested, the subtrees from the more recent events are ignored while correcting for the older WGD event(s), and reinserted after correction using their outgroup as a branching point. 79 | 80 | SCORPiOs can also use more than one reference outgroup to correct gene trees. Outgroup(s), separated by commas if more than one, are to be indicated for each WGDs. 81 | 82 | For instance, in the example `config_example.yaml `_, WGDs to correct are specified by: 83 | 84 | .. code-block:: yaml 85 | 86 | WGDs: 87 | Clupeocephala: Lepisosteus.oculatus,Amia.calva 88 | Salmonidae: Esox.lucius,Gasterosteus.aculeatus,Oryzias.latipes 89 | 90 | This specifies that gene trees have to be corrected for the teleost WGD (species below the Clupeocephala ancestor in the `species tree `_) and for the salmonids WGD (species below the Salmonidae ancestor in the `species tree `_). Lepisosteus.oculatus and Amia.calva should be used as outgroups to the teleost WGD and Esox.lucius, Gasterosteus.aculeatus and Oryzias.latipes as outgroups to the salmonid WGD. 91 | -------------------------------------------------------------------------------- /doc/input_formatting.rst: -------------------------------------------------------------------------------- 1 | Data file formats 2 | ================= 3 | 4 | Input files should typically follow the general conventions for their file format. Here is a more detailed overview of what SCORPiOs expects to find within each input file. 5 | 6 | 7 | Gene tree file 8 | -------------- 9 | 10 | All gene trees should be listed into a single NHX (New Hampshire Extended) file, separated by '//'. 11 | 12 | The gene trees should be phylogeny-reconciled: internal nodes should be tagged with the :code:`D` attribute to specify if they correspond to a speciation or a duplication (e.g. :code:`D=N` or :code:`D=Y`). Leaves should be tagged using the :code:`S` attribute to indicate the species (e.g. :code:`S=Danio.rerio`). 
Optionally, the :code:`DD` and :code:`DCS` attributes can help flag dubious duplications at internal nodes (:code:`DD=Y` or :code:`DCS=0` for dubious duplications). 13 | 14 | See an example gene tree file `here `_. 15 | 16 | Obtaining phylogeny-reconciled trees 17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 18 | 19 | Phylogenetic reconciliation compares gene trees to the species history, to annotate whether internal nodes in the gene tree correspond to speciation or duplication events. 20 | For many taxa, precomputed phylogeny-reconciled gene trees can be downloaded from public comparative genomics databases, such as Ensembl. 21 | 22 | If phylogeny-reconciled gene trees are not available for your study species, we suggest running SCORPiOs from gene alignments. SCORPiOs will then build an initial set of gene trees from the alignments using TreeBeST, and will optimize them using synteny information. 23 | 24 | Alternatively, you can use TreeBeST to reconcile NEWICK gene trees computed with another program to the species phylogeny. 25 | 26 | .. tip :: 27 | 28 | TreeBeST is one of SCORPiOs' dependencies, therefore you can invoke it without having to re-install it. Simply make sure you have activated the scorpios conda environment before running the command below (activation command: :code:`conda activate scorpios`). 29 | 30 | Suppose you have a species tree in the newick format and a gene tree in newick format that you want to convert to the phylogeny-reconciled (.nhx) format. You will then need to: 31 | 32 | * append "_"+speciesname to gene names in your gene tree 33 | 34 | * run :code:`treebest sdi` to reconcile the gene tree: 35 | 36 | .. prompt:: bash 37 | 38 | treebest sdi my_gene_tree.nwk -s my_species_tree.nwk > my_gene_tree.nhx 39 | 40 | .. note:: 41 | 42 | Unreconciled tree formats (such as Newick) will raise an error when running SCORPiOs. If you do not have reconciled trees available, we recommend using gene alignments as your primary SCORPiOs input.
43 | 44 | Converting to NHX format 45 | ^^^^^^^^^^^^^^^^^^^^^^^^ 46 | 47 | If you already have phylogeny-reconciled gene trees, but they are not in NHX format, you will need to convert them. 48 | We recommend the `Phylo module `_ in Biopython, which will handle a variety of tree formats. 49 | 50 | 51 | Gene multiple alignment file 52 | ----------------------------- 53 | 54 | The multiple sequence alignments used to build the trees should be provided as a single file in fasta (.fa) format. These should be nucleotide sequence alignments (CDS alignments, back-translated from protein alignments). The file can be gzipped (.gz) or not. 55 | Alignments should be separated by '//' and should appear in the same order as their corresponding trees. Gene names should match those used in the trees. 56 | 57 | This file can also be used as the primary input for SCORPiOs if phylogeny-reconciled trees are not available. SCORPiOs will then use these alignments to build the initial set of trees. 58 | 59 | See an example gene multiple alignment file `here `_. 60 | 61 | Building a gene sequence multiple alignment 62 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 63 | 64 | Precomputed gene sequence alignments can be downloaded from a variety of public sources (databases, publications) for many species sets. However, if you cannot find precomputed gene alignments that suit your analysis, you will need to start the work from scratch and build your own gene alignments before you can use SCORPiOs. We provide a dedicated outline here: :ref:`Building a dataset`. 65 | 66 | .. warning:: 67 | Building gene multiple alignments is non-trivial and we recommend you use precomputed alignments if possible, if you are not familiar with this task. 68 | 69 | Gene coordinates files 70 | ---------------------- 71 | 72 | All genes and their coordinates should be provided as a separate file per species in BED format. Files can be bzipped2 (.bz2). 
Files must be in a minimal BED format with 4 columns: chromosome; start; end; gene_name. 73 | All file names should follow similar conventions, to be retrieved with a regular expression. Gene names should match those used in the trees and alignments. 74 | 75 | See an example gene coordinates file `here `_. 76 | 77 | Alternatively, gene coordinates files can be provided in 'dyogen' format: a tab-separated file with 5 columns (chromosome; start; end; strand; gene_name). 78 | 79 | See an example gene coordinates file in 'dyogen' format `here `_. 80 | 81 | Genes-to-species mapping file 82 | ----------------------------- 83 | 84 | This is an alternative input to the gene tree file, when phylogeny-reconciled gene trees are unavailable and gene alignments are used as the primary input for SCORPiOs. 85 | 86 | Correspondences between gene IDs and species names should be provided as a single text file with two columns: gene_name; species_name. Genes from the same family should appear consecutively in the file. Gene families should be separated by '//'. Families should appear in the same order as their corresponding alignment in the alignments file. Gene names and species names should be the same as in the alignment and species tree, respectively. 87 | 88 | See an example genes-to-species mapping file `here `_. 89 | 90 | Species tree file 91 | ----------------- 92 | 93 | The species tree should be provided in NEWICK format, with names of ancestral species indicated at internal nodes. The species tree should contain all species included in the gene trees. Species names should not contain underscores '_'. For optimal tree reconstruction with `TreeBeST `_, the tree should not contain polytomies. 94 | 95 | See an example species tree file `here `_.
-------------------------------------------------------------------------------- /doc/lorelei_introduction.rst: -------------------------------------------------------------------------------- 1 | Introducing SCORPiOs LORelEi 2 | ============================= 3 | 4 | SCORPiOs LORelEi (Lineage-specific Ohnolog Resolution Extension) was introduced in SCORPiOs v2.0.0, to diagnose potential cases of delayed rediploidisation following a whole-genome duplication. 5 | 6 | LORelEi does not affect how the SCORPiOs gene tree correction runs (see more details in the :ref:`next section `, along with complete LORelEi usage instructions). 7 | 8 | Here, we explain the evolutionary models of post-WGD rediploidisation, and how LORelEi attempts to diagnose them. 9 | 10 | Delayed meiosis rediploidisation 11 | --------------------------------- 12 | 13 | Delayed meiosis rediploidisation after a whole-genome duplication corresponds to a prolonged period of recombination between duplicated chromosome pairs. Since recombination delays the divergence of duplicated gene sequences, it typically causes sequence-based gene trees to underestimate the time of duplication. Yet synteny does suggest their WGD-derived origin since the duplicated genes are found in doubly-conserved synteny with a non-duplicated outgroup genome. 14 | 15 | In practice, in the presence of delayed meiosis rediploidisation, SCORPiOs attempts to correct sequence-based gene trees to correctly position the duplication, as suggested by synteny. However, since SCORPiOs only corrects gene trees if it is able to find a solution that is both sequence and synteny consistent, the correction might be rejected (if it induces a significant drop in the sequence-based tree likelihood). 16 | 17 | LORelEi aims to better analyze these sequence-synteny conflicts (i.e. rejected synteny-guided tree corrections), to identify potential cases of delayed meiosis rediploidisation.
18 | 19 | SCORPiOs LORelEi 20 | ----------------- 21 | 22 | Diagnostic mode 23 | ^^^^^^^^^^^^^^^^ 24 | 25 | The :code:`diagnostic` mode extracts sequence-synteny conflicts revealed by the gene tree correction performed by SCORPiOs, and analyzes their spatial distribution in modern and ancestral genomes. The underlying rationale is that if conflicts are mainly artifacts due to noise in the data, they should be randomly distributed along genomes. Conversely, if conflicted gene families are in close genomic proximity or belong to specific duplicated chromosome pairs, conflicts can indicate Lineage-specific Ohnolog Resolution, i.e. prolonged recombination of specific genomic regions or chromosomes. 26 | 27 | LORelEi leverages SCORPiOs intermediary outputs (the :ref:`gene tree correction summaries ` and the :ref:`synteny-guided constrained tree topologies `) to identify conflicted gene trees. 28 | 29 | We provide a toy example dataset to illustrate what LORelEi can do. The toy example dataset is comprised of only 41 gene families including 6 teleost genomes. These gene families come from 2 distinct WGD-duplicated chromosome pairs. LORelEi reveals that sequence-synteny conflicts are over-represented (not significantly) on one of the two pairs of duplicated chromosomes (the pair corresponding to the outgroup *Amia calva* chromosome 17). 30 | 31 | While an outgroup genome can be used as a proxy for the pre-duplication karyotype, LORelEi also accepts ancestral karyotype reconstruction. 32 | 33 | Finally, LORelEi plots conflicted gene families on the karyotype of a duplicated genome, in the example the medaka fish (*Oryzias latipes*). Here are example plots generated by LORelEi diagnostic on the toy dataset: 34 | 35 | .. image:: https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/master/doc/img/diagnostic_by_homeo_bowfin.png 36 | 37 | .. 
image:: https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/master/doc/img/diagnostic_on_medaka.png 38 | 39 | 40 | Full usage instructions to reproduce the example plots are presented in the :ref:`next section `. 41 | 42 | Likelihood-tests mode 43 | ^^^^^^^^^^^^^^^^^^^^^^ 44 | 45 | Two classes of gene trees can be defined depending on the relative timing of meiosis resolution and species divergence. AORe (Ancestral Ohnolog Resolution) gene trees correspond to gene trees for which meiosis was resolved **before** speciation, while LORe (Lineage-specific Ohnolog Resolution) gene trees are trees for which meiosis was resolved **after** speciation. 46 | 47 | The :code:`likelihood_tests` mode directly confronts and tests the AORe versus LORe gene tree hypotheses, and displays the location of AORe and LORe gene families on the karyotype of a duplicated genome. The :code:`likelihood_tests` mode requires more computation time and resources because it computes the likelihood of the trees in each model, whereas the :code:`diagnostic` mode only extracts outputs already provided by SCORPiOs. We recommend running LORelEi in :code:`diagnostic` mode first and then in :code:`likelihood_tests` mode to confirm the results, if the conflicted gene families appear non-randomly distributed and conflicted gene tree topologies are compatible with LORe. 48 | 49 | LORelEi tests the rediploidisation status at the earliest speciation time point after the WGD. The example below shows the AORe and LORe gene tree topologies confronted in LORelEi for the teleost species phylogeny of the toy dataset: 50 | 51 | .. image:: https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/master/doc/img/sptree_lore.png 52 | :align: center 53 | 54 | In the toy example dataset, LORelEi :code:`likelihood_tests` confirms that conflicted gene families revealed by LORelEi :code:`diagnostic` can be explained by LORe, while revealing additional LORe gene families: 55 | 56 | .. 
image:: https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/master/doc/img/lore_on_medaka.png 57 | :align: center 58 | 59 | 60 | Full usage instructions to reproduce the example plot are presented in the :ref:`next section `. 61 | 62 | -------------------------------------------------------------------------------- /doc/lorelei_usage.rst: -------------------------------------------------------------------------------- 1 | LORelEi usage instructions 2 | ========================== 3 | 4 | SCORPiOs LORelEi is implemented as an extension to the main SCORPiOs pipeline, from which it depends as a snakemake subworkflow. LORelEi can be run after a previously completed SCORPiOs job or be directly invoked, in which case the main SCORPiOs job will be automatically run first (except for more advanced usage, see the :ref:`Example 3 ` below). 5 | 6 | In all cases, you will need to prepare two configuration files: one for the main SCORPiOs job and a second for LORelEi. Note, however, that the configuration file for LORelEi is a lot simpler than the one for SCORPiOs, and that it requires no additional input data. 7 | 8 | .. warning:: 9 | LORelEi was introduced in SCORPiOs v2.0.0. The implementation of subworkflows required a more recent version of Snakemake, which was updated from version 5.5.4 to 6.6.1 in `SCORPiOs conda environment `_. Thus, users having a previous version of the environment need to update it (as explained :ref:`here `). The update of Snakemake also implies that additional command-line arguments are required to run SCORPiOs, see the :ref:`updated usage instructions`. 10 | 11 | Running SCORPiOs LORelEi on example data 12 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 13 | An example configuration file for SCORPiOs LORelEi is provided: `config_example2_lorelei.yaml `_. This configuration file executes SCORPiOs LORelEi on toy example `data `_. We explain how to format your own configuration file in the next chapter (see :ref:`LORelEi configuration file`). 
14 | 15 | 16 | As introduced in the previous chapter, two modes are available to run SCORPiOs LORelEi: 17 | :code:`diagnostic` (:ref:`Example 1 `) and :code:`likelihood_tests` (:ref:`Example 2 `). We also show how to use SCORPiOs iterative correction in conjunction with LORelEi (:ref:`Example 3 `). 18 | 19 | 20 | .. important:: 21 | Remember that you should go to the SCORPiOs root folder and activate the conda environment with the command :code:`conda activate scorpios` before running SCORPiOs and/or LORelEi. 22 | 23 | 24 | Example 1: SCORPiOs LORelEi diagnostic mode 25 | -------------------------------------------- 26 | 27 | To run SCORPiOs LORelEi on example data in :code:`diagnostic` mode: 28 | 29 | .. prompt:: bash 30 | 31 | snakemake -s scorpios_lorelei.smk --configfile config_example2_lorelei.yaml --use-conda --cores 4 --scheduler=greedy 32 | 33 | Outputs for the main SCORPiOs job should be generated in :code:`SCORPiOs_example2/`, while LORelEi results are stored in :code:`SCORPiOs_LORelEi_example2/`. 34 | 35 | The following LORelEi output figures should be generated: :code:`SCORPiOs-LORelEi_example2/diagnostic/seq_synteny_conflicts_by_homeologs.svg` and :code:`SCORPiOs-LORelEi_example2/diagnostic/seq_synteny_conflicts_on_genome.svg`. 36 | 37 | 38 | Example 2: SCORPiOs LORelEi likelihood_tests mode 39 | -------------------------------------------------- 40 | 41 | If you ran the example in :code:`diagnostic` mode previously, the SCORPiOs main job will not be re-run and LORelEi will re-use pre-computed outputs from the :code:`SCORPiOs_example2/` folder. 42 | 43 | The example configuration file contains all necessary arguments to also run LORelEi in :code:`likelihood_tests` mode, you only need to update the :code:`mode` parameter, either in the configuration file or directly in the command-line as follows: 44 | 45 | .. 
prompt:: bash 46 | 47 | snakemake -s scorpios_lorelei.smk --configfile config_example2_lorelei.yaml --config mode=likelihood_tests --use-conda --cores 4 --scheduler=greedy 48 | 49 | Since the configuration is very simple here, you can even omit the configuration file and provide only the three required arguments in the command-line: 50 | 51 | .. prompt:: bash 52 | 53 | snakemake -s scorpios_lorelei.smk --config scorpios_config=config_example2.yaml mode=likelihood_tests dup_genome=Oryzias.latipes --use-conda --cores 4 --scheduler=greedy 54 | 55 | The following LORelEi outputs should be generated: :code:`SCORPiOs-LORelEi_example2/lktests/lore_aore_on_genome.svg` (figure) and :code:`SCORPiOs-LORelEi_example2/lktests/lore_aore_summary.tsv` (summary of LORe and AORe gene families). 56 | 57 | Example 3: SCORPiOs iterative and LORelEi 58 | ------------------------------------------ 59 | 60 | To run LORelEi in conjunction with SCORPiOs iterative gene tree correction, you will need to run SCORPiOs iterative correction first and then LORelEi, specifying the iteration you want to analyze sequence-synteny conflicts on. We recommend using iteration 1 (or 2) of an iterative run for LORelEi, since the number of gene trees considered for correction by SCORPiOs - and thus by LORelEi afterwards - typically decreases a lot in later iterations. 61 | 62 | .. prompt:: bash 63 | 64 | bash iterate_scorpios.sh --snake_args="--configfile config_example2.yaml --cores 4 --scheduler=greedy" 65 | snakemake -s scorpios_lorelei.smk --configfile config_example2_lorelei.yaml --config iter=1 --use-conda --cores 4 --scheduler=greedy 66 | 67 | The following LORelEi outputs should be generated: :code:`SCORPiOs-LORelEi_example2/diagnostic/seq_synteny_conflicts_by_homeologs.svg` and :code:`SCORPiOs-LORelEi_example2/diagnostic/seq_synteny_conflicts_on_genome.svg`. You can change the :code:`jname` parameter to not overwrite previous results (see :ref:`LORelEi configuration file`). 
68 | 69 | Running SCORPiOs LORelEi on your data 70 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 71 | 72 | Like for SCORPiOs, you have to create a new configuration file to run LORelEi on your own data. You can use the example configuration file as a guide to write your own (see :ref:`LORelEi configuration file`) and then run: 73 | 74 | .. prompt:: bash 75 | 76 | snakemake -s scorpios_lorelei.smk --configfile config_lorelei.yaml --use-conda --cores 4 --scheduler=greedy -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/output_advanced.rst: -------------------------------------------------------------------------------- 1 | Intermediary outputs 2 | ==================== 3 | 4 | Beyond description statistics printed to the standard output and final corrected trees, you may want to investigate step-by-step results of SCORPiOs for one or several specific gene families. 5 | 6 | .. important:: 7 | A gene family in SCORPiOs consists of a non-duplicated outgroup gene and all potential orthologous gene copies in WGD-duplicated species, based on the uncorrected gene trees. For each family, SCORPiOs computes a synteny-derived orthology graph, then a constrained tree topology based on synteny, and finally, if necessary, a synteny-aware corrected tree. Through each of these steps, a gene family is identified by the outgroup gene name. 8 | 9 | .. tip:: 10 | Several suffixes such as the name of the corrected WGD, the outgroup species and SCORPiOs iteration number are added to each output file, in order to precisely identify outputs, even in case of complex configurations. 11 | 12 | Comprehensive list of orthologs 13 | -------------------------------- 14 | The **orthology relationships** between genes of **duplicated species and outgroup** are stored in a single file, whose name starts with :code:`Homologs_`, and located inside the :code:`Families/` sub-folder. This table retains all gene copies since the ingroup/outgroup speciation node, as well as any other homologs with a loosely similar syntenic context. 
15 | 16 | In the example run, one such file is: 17 | 18 | :code:`SCORPiOs_example/Families/Homologs_Salmonidae_Esox.lucius_0`. 19 | 20 | The first three columns of the file describe outgroup genes (chromosome, index of the gene on the chromosome, gene name). Other columns give the predicted orthologous genes in duplicated species, with the same information. 21 | 22 | Pairwise synteny orthology predictions 23 | -------------------------------------- 24 | Raw **synteny-predicted orthologies** amongst duplicated species are stored in a single file, whose name starts with :code:`Sorted_SyntenyOrthoPred_`, and located in :code:`Synteny/`. 25 | 26 | In the example run, one such file is: 27 | 28 | :code:`SCORPiOs_example/Synteny/Sorted_SyntenyOrthoPred_Salmonidae_Esox.lucius_0.gz`. 29 | 30 | It is a 4-column gzipped (.gz) file, giving orthologous genes predicted between duplicated species, after the pairwise synteny analysis. The first column shows a gene in a duplicated species 1, the second gives its predicted ortholog in duplicated species 2, the third gives the associated :math:`{\Delta}S` synteny score and the fourth the outgroup gene name. 31 | 32 | .. note:: 33 | In this file, species names are appended to the gene names. 34 | 35 | Orthogroups in synteny graphs 36 | ------------------------------ 37 | Predicted **orthogroups** based on community detection in **synteny graphs** are stored in a single file (:code:`GraphsOrthogroups_`) in :code:`Graphs/`, along with a summary of the community detection step (:code:`Summary_`). 38 | 39 | In the example run, the following file gives predicted orthogroups: 40 | 41 | :code:`SCORPiOs_example/Graphs/GraphsOrthogroups_Clupeocephala_Lepisosteus.oculatus_0`. 42 | 43 | The first column gives the name of the outgroup gene with an appended "a" or "b" letter to uniquely identify each of the two post-WGD orthogroups. Other columns give the duplicated species gene members. 
44 | 45 | In addition, :code:`SCORPiOs_example/Graphs/Summary_Clupeocephala_Lepisosteus.oculatus_0` is a simple 3-columns table describing the community detection step. The outgroup gene is indicated in the first column, followed by the algorithm used for community detection and the number of graph edges removed in the second and third columns, respectively. 46 | 47 | 48 | Subtree corrections 49 | ------------------- 50 | 51 | Correction summary 52 | ^^^^^^^^^^^^^^^^^^^ 53 | 54 | The :code:`Corrections/` folder stores two files, one detailing **trees vs synteny consistency** and another with the list of **successfully corrected subtrees**. 55 | 56 | In the example run, the following file gives an inconsistency summary (with respect to the Clupeocephala WGD): 57 | 58 | :code:`SCORPiOs_example/Corrections/Trees_summary_Clupeocephala_0`. 59 | 60 | In addition, the following file lists corresponding accepted corrections: 61 | 62 | :code:`SCORPiOs_example/Corrections/Accepted_Trees_Clupeocephala_0`. 63 | 64 | 65 | 66 | Subtree corrections (additional) 67 | -------------------------------- 68 | Additional files can be saved if specified in the configuration file (see the configuration keyword :code:`save_subtrees_lktest`). 69 | 70 | 71 | Constrained tree topologies 72 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 73 | **Constrained tree topologies** are stored in the :code:`Trees/ctrees_0/` folder (:code:`Trees/ctrees_i/` for each iteration i in iterative mode). 74 | 75 | In the example, one constrained topology file is: 76 | 77 | :code:`SCORPiOs_example/Trees/ctrees_0/Clupeocephala/C_102697250_Lepisosteus.oculatus.nh` 78 | 79 | This file gives the constrained tree topology for the gene family identified by the outgroup gene :code:`102697250_Lepisosteus.oculatus`, in the newick format. 
80 | 81 | profileNJ and TreeBeST solutions 82 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 83 | 84 | Synteny-aware trees built with `ProfileNJ `_ (an extension of the PolytomySolver package) and `TreeBeST phyml `_, using the **constrained tree topology**, are stored in the :code:`Corrections/PolyS_0/` and :code:`Corrections/TreeB_0/` folders, respectively. 85 | 86 | 87 | In the example, one ProfileNJ tree file is: 88 | 89 | :code:`SCORPiOs_example/Corrections/PolyS_0/Clupeocephala/102697250_Lepisosteus.oculatus.nh`. 90 | 91 | Trees are in Newick format. 92 | 93 | .. note:: 94 | 95 | SCORPiOs does not build a TreeBeST tree if the ProfileNJ solution is accepted. In this case, TreeBeST tree files will be empty. 96 | 97 | Likelihood AU-tests 98 | ^^^^^^^^^^^^^^^^^^^^ 99 | Outputs of the likelihood AU-tests are stored in the :code:`Corrections/Res_polylk_0/` and :code:`Corrections/Res_treeBlk_0/` folders. These are direct outputs from the `CONSEL `_ software. 100 | 101 | In the example, the following file gives **AU-test likelihood tests** results for the **original subtree** vs the corresponding synteny-aware tree resolved with **profileNJ**: 102 | 103 | :code:`SCORPiOs_example/Corrections/Res_polylk_0/Clupeocephala/Res_102697250_Lepisosteus.oculatus.txt` 104 | 105 | Similarly, files in the :code:`SCORPiOs_example/Corrections/Res_treeBlk_0/Clupeocephala/` folder store comparisons of **original subtree vs TreeBeST phyml** solution. 106 | 107 | .. note:: 108 | 109 | AU-test result files for TreeBeST solutions will be empty if the profileNJ solution was accepted. 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /doc/output_genetrees.rst: -------------------------------------------------------------------------------- 1 | Corrected gene trees 2 | ==================== 3 | 4 | All outputs from SCORPiOs are stored in a folder named :code:`SCORPiOs_jobname/` (with the jobname as specified in the configuration file). 
5 | 6 | The main output is the **SCORPiOs-optimized gene trees**. Gene trees are provided as a single file in NHX format. We explain in the next section how to visualize SCORPiOs corrections. 7 | 8 | The run commands generate: 9 | 10 | - :code:`SCORPiOs_example/SCORPiOs_output_0.nhx` for the simple run, 11 | - :code:`SCORPiOs_example/SCORPiOs_output_2_with_tags.nhx` for the iterative run. 12 | 13 | Outputs are suffixed with a digit representing the iteration number. This number is set to 0 in simple mode and starts at 1 in iterative mode. 14 | 15 | .. note:: 16 | NHX tags are added to the corrected gene trees: 17 | 18 | - **NEW in 1.3.0 (iterative mode only)**, corrected WGD nodes :code:`CORR_ID_WGD=Y`, 19 | 20 | - leaves of corrected subtrees are tagged with :code:`CORR_ID_WGD=ID`, 21 | 22 | - leaves rearranged to reinsert subtrees are tagged with :code:`MOVED_ID_WGD=ID`; 23 | 24 | where WGD will be replaced by the name of the corrected WGD and members of the same corrected subtree will be given the same ID. 25 | 26 | For instance, two leaves, with the tag :code:`CORR_ID_Clupeocephala=1` for one and :code:`CORR_ID_Clupeocephala=2` for the other, belong to two different subtrees corrected for the Clupeocephala WGD. 27 | 28 | In iterative mode, correction tags are additionally suffixed with the iteration number of the first correction (e.g. :code:`CORR_ID_Clupeocephala_1=1`). 29 | 30 | .. warning:: 31 | 32 | Please note that :code:`MOVED_ID_` is currently **not** supported in the final output in iterative mode. However, they **are** stored in individual, by iteration, corrected trees (see the entry :code:`save_tmp_trees` of the :ref:`Configuration file`), in the exact same format as in the non-iterative run. Corrections at each iteration **can** be visualized with SCORPiOs custom tree visualization script, using all available drawing options (see :ref:`Tree visualization`). 
33 | 34 | 35 | Some intermediary outputs are also stored in different sub-folders of :code:`SCORPiOs_jobname/`. Please see the section on :ref:`Intermediary outputs` for a detailed description. 36 | 37 | In addition, SCORPiOs writes statistics on key steps of the workflow to the standard output. Thus, to separate output statistics from Snakemake logs, you can run: 38 | 39 | .. prompt:: bash 40 | 41 | snakemake --configfile config_example.yaml --use-conda --cores 4 --scheduler=greedy >out 2>err 42 | 43 | or 44 | 45 | .. prompt:: bash 46 | 47 | bash iterate_scorpios.sh --snake_args="--configfile config_example.yaml --cores 4 --scheduler=greedy" >out 2>err 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /doc/output_treeviz.rst: -------------------------------------------------------------------------------- 1 | Tree visualization 2 | =================== 3 | 4 | SCORPiOs tags corrected nodes in the gene trees to allow inspection using tree visualisation softwares (i.e `ETE Toolkit `_ or `ggtree `_). To facilitate correction visualization, we provide a custom script that generates before-and-after images for corrected trees. Alternatively, output NHX files from SCORPiOs can also directly be loaded into online tree comparison tools such as `Phylo.io `_. 5 | 6 | .. note:: 7 | 8 | The dedicated tool makes it easier to visualize specific corrections by SCORPiOs, especially for large gene families with more than one duplicated subtree. 9 | 10 | Visualizing tree corrections 11 | ---------------------------- 12 | 13 | With the default configuration, SCORPiOs saves individual corrected trees in the folder :code:`SCORPiOs_jobname/Corrections/tmp_whole_trees_0` (or :code:`SCORPiOs_jobname/Corrections/tmp_whole_trees_i` for iteration i, in iterative mode). Our custom script :code:`scripts/trees/make_tree_images.py` generates images based on the trees saved in this folder. 14 | 15 | .. 
important:: 16 | You should ensure that the SCORPiOs conda environment is activated before running :code:`scripts/trees/make_tree_images.py`. You can activate it with :code:`conda activate scorpios`. 17 | 18 | Example 19 | ^^^^^^^ 20 | 21 | For instance, after a simple SCORPiOs run on example data, the following command creates images allowing to view all corrections for the salmonids WGD: 22 | 23 | .. prompt:: bash 24 | 25 | python scripts/trees/make_tree_images.py -i SCORPiOs_example/Corrections/tmp_whole_trees_0 --wgd Salmonidae --outgr 'Esox.lucius,Gasterosteus.aculeatus,Oryzias.latipes' -o SCORPiOs_example/Corrections/trees_img 26 | 27 | Here are generated figures for a corrected tree (:code:`SCORPiOs_example/Corrections/trees_img/img_cor_27.png`, right) and its before-correction counterpart (:code:`SCORPiOs_example/Corrections/trees_img/img_ori_27.png`, left): 28 | 29 | |pic1| |pic2| 30 | 31 | .. |pic1| image:: https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/master/doc/img/example_ori_27.png 32 | :width: 48% 33 | :alt: original tree 34 | 35 | .. |pic2| image:: https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/master/doc/img/example_cor_27.png 36 | :width: 48% 37 | :alt: corrected tree 38 | 39 | Internal nodes follow color conventions: duplications in red, dubious duplications in cyan and speciation in blue. Leaves of the SCORPiOs-corrected subtree are shown in the same color in the corrected and uncorrected versions. The corrected WGD node is highlighted with a bigger circle and a grey background. 40 | 41 | Command-line arguments for :code:`scripts/trees/make_tree_images.py`: 42 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 43 | 44 | **Required named arguments:** 45 | 46 | -i INPUT, --input INPUT Folder with corrected and original trees, or a list of tree files. 47 | 48 | -w WGD, --wgd=WGD Corrected wgd to highlight. 
For instance, --wgd Clupeocephala will show only subtrees corrected for 49 | the wgd that occurred in the Clupeocephala ancestor. 50 | 51 | --outgr OUTGROUP Outgroup(s) used in SCORPiOs tree correction, comma-separated. 52 | 53 | 54 | **Optional arguments:** 55 | 56 | -o OUTPUT, --output OUTPUT Output folder, default is trees_img/ 57 | 58 | -f FORMAT, --format FORMAT Output format (pdf, svg or png). 59 | 60 | --show_moved Color non-wgd rearranged leaves, default is False 61 | 62 | --color_outgr Color the outgroup gene used by SCORPiOs, default is False 63 | 64 | 65 | Using the phylo.io web interface 66 | -------------------------------- 67 | Alternatively, users can view original and corrected trees using the `phylo.io web interface `_. 68 | 69 | This only requires pasting in (or uploading) corrected and original trees. You can either use individual gene trees stored in the :code:`SCORPiOs_example/Corrections/tmp_whole_trees_0/` or the full sets of gene trees (here :code:`data/example/forest.nhx` and :code:`SCORPiOs_example/SCORPiOs_output_0.nhx`, respectively). However, we recommend using individual trees for real datasets, as the amount of data can be quite large. With the :code:`compare` function, original and corrected trees can be inspected side-by-side, with all differences highlighted. 70 | 71 | For more details and example images, you can look at `phylo.io documentation `_. 72 | 73 | 74 | Reference 75 | ^^^^^^^^^ 76 | `Phylo.io `_: Robinson et al. (2016) Phylo.io : Interactive Viewing and Comparison of Large Phylogenetic Trees on the Web. Mol Biol Evol; 33 (8): 2163-2166. 77 | -------------------------------------------------------------------------------- /doc/project_changelog.rst: -------------------------------------------------------------------------------- 1 | Change Log 2 | ========== 3 | 4 | .. 
include:: ../CHANGELOG.rst -------------------------------------------------------------------------------- /doc/project_info.rst: -------------------------------------------------------------------------------- 1 | How to cite 2 | =========== 3 | 4 | If you use SCORPiOs, please cite: 5 | 6 | Parey E, Louis A, Cabau C, Guiguen Y, Roest Crollius H, Berthelot C, Synteny-guided resolution of gene trees clarifies the functional impact of whole genome duplications, Molecular Biology and Evolution, msaa149, https://doi.org/10.1093/molbev/msaa149. 7 | 8 | License 9 | ======= 10 | This code may be freely distributed and modified under the terms of the GNU General Public License version 3 (GPL v3) 11 | 12 | - `LICENSE GPLv3 `_ 13 | 14 | Authors 15 | ======= 16 | 17 | `DYOGEN Team (IBENS) `_ 18 | 19 | - `Elise Parey `_ 20 | - Alexandra Louis 21 | - Hugues Roest Crollius 22 | - Camille Berthelot 23 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.0.3 2 | sphinx-rtd-theme 3 | sphinx-prompt 4 | docutils==0.16 5 | -------------------------------------------------------------------------------- /doc/scripts.graphs.rst: -------------------------------------------------------------------------------- 1 | scripts.graphs package 2 | ====================== 3 | 4 | scripts.graphs.combine\_outgroups module 5 | ---------------------------------------- 6 | 7 | .. automodule:: scripts.graphs.combine_outgroups 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | scripts.graphs.orthogroups module 13 | --------------------------------- 14 | 15 | .. 
automodule:: scripts.graphs.orthogroups 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /doc/scripts.lorelei.rst: -------------------------------------------------------------------------------- 1 | scripts.lorelei package 2 | ======================= 3 | 4 | scripts.lorelei.constrained\_aore\_lore\_topologies module 5 | ---------------------------------------------------------- 6 | 7 | .. automodule:: scripts.lorelei.constrained_aore_lore_topologies 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | scripts.lorelei.fix\_rideogram module 13 | ------------------------------------- 14 | 15 | .. automodule:: scripts.lorelei.fix_rideogram 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | scripts.lorelei.homeologs\_pairs\_from\_ancestor module 21 | ------------------------------------------------------- 22 | 23 | .. automodule:: scripts.lorelei.homeologs_pairs_from_ancestor 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | scripts.lorelei.homeologs\_tree\_conflicts module 29 | ------------------------------------------------- 30 | 31 | .. automodule:: scripts.lorelei.homeologs_tree_conflicts 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | scripts.lorelei.make\_rideograms\_inputs module 37 | ----------------------------------------------- 38 | 39 | .. automodule:: scripts.lorelei.make_rideograms_inputs 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | scripts.lorelei.write\_ancgenes\_treeclass module 45 | ------------------------------------------------- 46 | 47 | .. 
automodule:: scripts.lorelei.write_ancgenes_treeclass 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | -------------------------------------------------------------------------------- /doc/scripts.rst: -------------------------------------------------------------------------------- 1 | API 2 | ==== 3 | 4 | Modules 5 | --------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | scripts.synteny 11 | scripts.trees 12 | scripts.graphs 13 | scripts.lorelei 14 | -------------------------------------------------------------------------------- /doc/scripts.synteny.rst: -------------------------------------------------------------------------------- 1 | scripts.synteny package 2 | ======================= 3 | 4 | scripts.synteny.duplicated\_families module 5 | ------------------------------------------- 6 | 7 | .. automodule:: scripts.synteny.duplicated_families 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | scripts.synteny.f1\_score\_optimization module 13 | ---------------------------------------------- 14 | 15 | .. automodule:: scripts.synteny.f1_score_optimization 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | scripts.synteny.filter\_no\_synteny\_genes module 21 | ------------------------------------------------- 22 | 23 | .. automodule:: scripts.synteny.filter_no_synteny_genes 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | scripts.synteny.filter\_regions module 29 | -------------------------------------- 30 | 31 | .. automodule:: scripts.synteny.filter_regions 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | scripts.synteny.missed\_orthologies module 37 | ------------------------------------------ 38 | 39 | .. automodule:: scripts.synteny.missed_orthologies 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | scripts.synteny.mygenome module 45 | ------------------------------- 46 | 47 | .. 
automodule:: scripts.synteny.mygenome 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | scripts.synteny.pairwise\_orthology\_synteny module 53 | --------------------------------------------------- 54 | 55 | .. automodule:: scripts.synteny.pairwise_orthology_synteny 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | scripts.synteny.syntenycompare module 61 | ------------------------------------- 62 | 63 | .. automodule:: scripts.synteny.syntenycompare 64 | :members: 65 | :undoc-members: 66 | :show-inheritance: 67 | 68 | scripts.synteny.utilities module 69 | -------------------------------- 70 | 71 | .. automodule:: scripts.synteny.utilities 72 | :members: 73 | :undoc-members: 74 | :show-inheritance: 75 | 76 | -------------------------------------------------------------------------------- /doc/scripts.trees.rst: -------------------------------------------------------------------------------- 1 | scripts.trees package 2 | ===================== 3 | 4 | scripts.trees.build\_treebest\_trees module 5 | ------------------------------------------- 6 | 7 | .. automodule:: scripts.trees.build_treebest_trees 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | scripts.trees.convert\_ids module 13 | --------------------------------- 14 | 15 | .. automodule:: scripts.trees.convert_ids 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | scripts.trees.cut\_subtrees module 21 | ---------------------------------- 22 | 23 | .. automodule:: scripts.trees.cut_subtrees 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | scripts.trees.genetree module 29 | ----------------------------- 30 | 31 | .. automodule:: scripts.trees.genetree 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | scripts.trees.inconsistent\_trees module 37 | ---------------------------------------- 38 | 39 | .. 
automodule:: scripts.trees.inconsistent_trees 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | scripts.trees.iteration\_nhx\_tags module 45 | ----------------------------------------- 46 | 47 | .. automodule:: scripts.trees.iteration_nhx_tags 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | scripts.trees.make\_tree\_images module 53 | --------------------------------------- 54 | 55 | .. automodule:: scripts.trees.make_tree_images 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | scripts.trees.merge\_subtrees module 61 | ------------------------------------ 62 | 63 | .. automodule:: scripts.trees.merge_subtrees 64 | :members: 65 | :undoc-members: 66 | :show-inheritance: 67 | 68 | scripts.trees.orthologs module 69 | ------------------------------ 70 | 71 | .. automodule:: scripts.trees.orthologs 72 | :members: 73 | :undoc-members: 74 | :show-inheritance: 75 | 76 | scripts.trees.parse\_au\_test module 77 | ------------------------------------ 78 | 79 | .. automodule:: scripts.trees.parse_au_test 80 | :members: 81 | :undoc-members: 82 | :show-inheritance: 83 | 84 | scripts.trees.regraft\_subtrees module 85 | -------------------------------------- 86 | 87 | .. automodule:: scripts.trees.regraft_subtrees 88 | :members: 89 | :undoc-members: 90 | :show-inheritance: 91 | 92 | scripts.trees.speciestree module 93 | -------------------------------- 94 | 95 | .. automodule:: scripts.trees.speciestree 96 | :members: 97 | :undoc-members: 98 | :show-inheritance: 99 | 100 | scripts.trees.utilities module 101 | ------------------------------ 102 | 103 | .. 
automodule:: scripts.trees.utilities 104 | :members: 105 | :undoc-members: 106 | :show-inheritance: 107 | -------------------------------------------------------------------------------- /envs/graphs.yaml: -------------------------------------------------------------------------------- 1 | name: graphs 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - networkx=2.3 8 | - numpy=1.18.1 9 | - python=3.6.8 10 | - scikit-learn=0.22.1 11 | - six=1.12.0 12 | -------------------------------------------------------------------------------- /envs/plots.yaml: -------------------------------------------------------------------------------- 1 | name: plots 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - pandas 7 | - matplotlib 8 | - seaborn 9 | - numpy 10 | - scipy 11 | - statsmodels 12 | - pingouin 13 | - svgutils -------------------------------------------------------------------------------- /envs/polytomysolver.yaml: -------------------------------------------------------------------------------- 1 | name: polytomysolver 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=2.7 8 | - pip=19.1.1 9 | - six=1.12.0 10 | - ete3=3.1.1 11 | - lxml=4.3.4 12 | 13 | - pip: 14 | - profileNJ 15 | 16 | #profileNJ triggers the download of the latest numpy version, this forces a compatible version 17 | - numpy==1.10.4 18 | -------------------------------------------------------------------------------- /envs/rideogram.yaml: -------------------------------------------------------------------------------- 1 | name: rideogram 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - r-rideogram 6 | - r-optparse -------------------------------------------------------------------------------- /envs/scorpios.yaml: -------------------------------------------------------------------------------- 1 | name: scorpios 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | - eparey 7 | 
dependencies: 8 | - consel=0.20 9 | - emboss=6.5.7 10 | - ete3=3.1.1 11 | - numpy=1.18.1 12 | - python=3.6.8 13 | - raxml=8.2.12 14 | - roman 15 | - six=1.12.0 16 | - smart_open=5.2 17 | - snakemake=6.6.1 18 | - snakemake-minimal=6.6.1 19 | - shyaml 20 | - tabulate=0.8 21 | - treebest=1.9.2.post1 22 | -------------------------------------------------------------------------------- /iterate_scorpios.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ############################################################################################# 4 | # Bash wrapper script to run SCORPiOs in iterative mode # 5 | # # 6 | # Example usage: bash iterate_scorpios.sh --snake_args="--configfile config_example.yaml" # 7 | # [--max_iter=5] [--min_corr=1] # 8 | # [--starting_iter=1] # 9 | ############################################################################################# 10 | 11 | 12 | #Set default values for parameters (only --snake_args is required) 13 | max_iter=5 14 | min_correction=5 #default for --min_corr; same variable as set by the option and tested in the iteration loop 15 | iteration=1 16 | # snake_args="--configfile config_v89.yaml --cores 35" 17 | 18 | ### Command-line argument parsing ### 19 | while [ $# -gt 0 ]; do 20 | 21 | case "$1" in 22 | 23 | #maximum number of iteration to run 24 | --max_iter=*) 25 | max_iter="${1#*=}" 26 | ;; 27 | 28 | #stop if less than min_corr trees have been corrected 29 | --min_corr=*) 30 | min_correction="${1#*=}" 31 | ;; 32 | 33 | #starting iteration, if we want to resume a SCORPiOs run for a certain iter 34 | --starting_iter=*) 35 | iteration="${1#*=}" 36 | ;; 37 | 38 | #arguments for snakemake execution 39 | --snake_args=*) 40 | snake_args="${1#*=}" 41 | ;; 42 | 43 | *) 44 | 45 | echo "*********************************" >&2 46 | echo " ArgumentError: Invalid argument:" >&2 47 | echo " $1 " >&2 48 | echo "*********************************" >&2 49 | exit 1 50 | 51 | esac 52 | 53 | shift 54 | 55 | done 56 | 57 | #check that required arg is set 58 | if [ -z 
"$snake_args" ]; then 59 | echo "******************************************" >&2 60 | echo " ArgumentError: --snake_args is required " >&2 61 | echo "******************************************" >&2 62 | exit 1 63 | fi 64 | 65 | #extract config related args from snakemake args, to invoke the --config option correctly below 66 | snake_config_args=${snake_args#*--config } 67 | 68 | if [ "$snake_config_args" == "$snake_args" ]; then 69 | 70 | snake_config_args="--config " 71 | 72 | else 73 | 74 | snake_config_args=${snake_config_args%% -*} 75 | snake_config_args="--config ${snake_config_args}" 76 | snake_args="${snake_args/${snake_config_args}/}" 77 | 78 | fi 79 | 80 | j=0 81 | 82 | #get configfile name to then extract the jobname and the species tree from it 83 | configfile=${snake_args#*--configfile=} 84 | 85 | if [ "$configfile" == "$snake_args" ]; then 86 | 87 | configfile=${snake_args#*--configfile } 88 | 89 | fi 90 | 91 | configfile=${configfile%% *} 92 | configfile=${configfile%% --configfile} 93 | sptree=$(cat $configfile | shyaml get-value species_tree) 94 | job_name=$(cat $configfile | shyaml get-value jobname) 95 | 96 | #run SCORPiOs iteratively 97 | for i in $(seq $iteration $max_iter); do 98 | 99 | #if it is not the first iteration, print names of corrected subtrees to file tmp_corrected_prev_iter 100 | if (( $i!=1 )) 101 | then 102 | cat "SCORPiOs_"${job_name}/Corrections/Accepted_Trees*$((i-1)) > .tmp_corrected_prev_iter_${job_name} 103 | else 104 | touch .tmp_corrected_prev_iter_${job_name} 105 | fi 106 | 107 | #run SCORPiOs if first iteration or if number of corrections in previous iter > min_correction 108 | if (( $i==1 )) || [[ $(wc -l <.tmp_corrected_prev_iter_${job_name}) -gt $min_correction ]] 109 | then 110 | echo "----------------" 111 | echo " Iteration: $i" 112 | echo "----------------" 113 | echo "Iteration: $i" >&2 114 | snakemake $snake_args $snake_config_args current_iter=$i --use-conda 115 | j=$i 116 | else 117 | break 118 | fi 119 | 120 
| done 121 | 122 | #remove temp 123 | rm .tmp_corrected_prev_iter_${job_name} 124 | 125 | #If output exists (i.e. no raised errors above) write .nhx correction tags and exit 126 | if [ -f "SCORPiOs_${job_name}/SCORPiOs_output_${j}.nhx" ]; then 127 | 128 | echo "Termination after $j correction iterations ">&2 129 | echo "Browsing the corrected forests of each iteration to write final .nhx correction tags ">&2 130 | 131 | input="SCORPiOs_${job_name}/SCORPiOs_output_%d.nhx" 132 | output="SCORPiOs_${job_name}/SCORPiOs_output_${j}_with_tags.nhx" 133 | 134 | python -m scripts.trees.iteration_nhx_tags -o $output -i $j -c $input --internal -sp $sptree 135 | # python -m scripts.trees.iteration_nhx_tags -o $output -i $j -c $input 136 | fi 137 | -------------------------------------------------------------------------------- /module_build_trees.smk: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | SCORPiOs snakemake module to build starting gene trees with TreeBeST. 4 | """ 5 | 6 | if ITER < 2: #allows to use --forceall in iterative mode without this rule executing at iter 2 7 | 8 | rule build_input_trees: 9 | """ 10 | Build input trees with TreeBeST best. 11 | """ 12 | input: a = config["alis"], sp = config["species_tree"], map = config["genes_sp_mapping"] 13 | output: input_trees 14 | threads: config['ncores'] 15 | shell:""" 16 | python -m scripts.trees.build_treebest_trees -a {input.a} -s {input.sp} -m {input.map}\ 17 | -nc {threads} -o {output}\ 18 | -tmp SCORPiOs_{config[jobname]}/ 19 | """ 20 | -------------------------------------------------------------------------------- /module_graphs_orthogroups.smk: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | SCORPiOs snakemake module to identify orthologous gene communities in orthology graphs and derive 4 | topological constraints on corresponding gene trees. 
5 | """ 6 | SPECTRAL = config.get('spectral', '') 7 | if SPECTRAL and SPECTRAL.lower() not in ['n', 'no', 'false']: 8 | SPECTRAL = "--spectral" 9 | else: 10 | SPECTRAL = "" 11 | 12 | SUBSET_TO_CORRECT = config.get('subset_to_correct', '') #should be a file, one gene per line (one gene per tree to correct), if provided in config 13 | if SUBSET_TO_CORRECT: 14 | SUBSET_TO_CORRECT = '--correct_only '+SUBSET_TO_CORRECT 15 | 16 | rule cut_orthology_graphs: 17 | """ 18 | Loads orthology graphs and uses community detection algorithms to identify the 2 orthogroups. 19 | """ 20 | input: Sorted_SyntenyOrthoPred+'.gz' 21 | output: a=GraphsOrthogroups, b=Summary 22 | threads: config["ncores"] 23 | params: Summary = Summary.replace("{{outgr}}", "{{wildcards.outgr}}")\ 24 | .replace("{{wgd}}","{{wildcards.wgd}}") 25 | conda: "envs/graphs.yaml" 26 | shell:""" 27 | python -m scripts.graphs.orthogroups -i {input} -o {output.a} -n {threads} -s {params.Summary}\ 28 | -ignSg {config[ignoreSingleGeneCom]} -wgd {wildcards.wgd},{wildcards.outgr} {SPECTRAL} 29 | """ 30 | 31 | 32 | def expand_graphs_outgr(wildcards): 33 | """ 34 | Gets a list of graph cut results files (one for each outgroup) 35 | """ 36 | return expand(GraphsOrthogroups.replace("{wgd}", "{{wgd}}"),\ 37 | outgr=config['WGDs'][wildcards.wgd].split(',')) 38 | 39 | 40 | def expand_graphcut_summary(wildcards): 41 | """ 42 | Gets a list of graph cut summary files (one for each outgroup) 43 | """ 44 | return expand(Summary.replace("{wgd}", "{{wgd}}"),\ 45 | outgr=config['WGDs'][wildcards.wgd].split(',')) 46 | 47 | 48 | def expand_orthotables_outgr(wildcards): 49 | """ 50 | Gets a list of orthology tables (one for each outgroup) 51 | """ 52 | return expand(OrthoTable.replace("{wgd}", "{{wgd}}"),\ 53 | outgr=config['WGDs'][wildcards.wgd].split(',')) 54 | 55 | 56 | checkpoint gene_trees_to_correct: 57 | """ 58 | Converts orthogroups into topological constraints on the gene tree. 
59 | Saves all subtrees, subalignments and constrained trees where the constraint is absent in input trees. 60 | """ 61 | input: graph_cuts=expand_graphs_outgr, graph_cuts_summaries=expand_graphcut_summary, 62 | orthotables=expand_orthotables_outgr, alis = config['alis'] 63 | 64 | output: ctrees = directory(CTREES+"/{wgd}/"), subalis = directory(SUBALIS+"/{wgd}/"), 65 | subtrees = directory(SUBTREES+"/{wgd}/"), tsum = TREES_SUMMARY 66 | 67 | params: graphs = lambda wildcards, input: ",".join(list(input.graph_cuts)), 68 | gsum = lambda wildcards, input: ",".join(list(input.graph_cuts_summaries)), 69 | otable = lambda wildcards, input: ",".join(list(input.orthotables)), 70 | outgroups = lambda wildcards: config['WGDs'][wildcards.wgd], 71 | outcombin = outcombin.replace("{{wgd}}", "{{wildcards.wgd}}"), 72 | args_subset_to_correct = SUBSET_TO_CORRECT 73 | 74 | shell:""" 75 | python -m scripts.trees.inconsistent_trees -n {params.outgroups} -i {params.graphs}\ 76 | -f {params.otable} -t {input_trees} -a {config[alis]} -oc {output.ctrees} -oa {output.subalis}\ 77 | -ot {output.subtrees} -gs {params.gsum} -s {output.tsum} -wgd {wildcards.wgd}\ 78 | -fcombin {params.outcombin} {params.args_subset_to_correct} 79 | """ 80 | -------------------------------------------------------------------------------- /module_lorelei_diagnostic.smk: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | SCORPiOs LORelEi module to analyze the genomic location of sequence-synteny conflicted families. 
4 | """ 5 | 6 | OUTFOLDER = f"SCORPiOs-LORelEi_{JOBNAME_L}/diagnostic" 7 | 8 | 9 | OUTGR_GENES = SCORPIOS_CONFIG["genes"] % LORE_OUTGR 10 | 11 | POSt_DUP = config.get("post_dup", "") 12 | 13 | COMBIN_ARG = '' 14 | if len(LORE_OUTGRS.split(',')) > 1: 15 | COMBIN_ARG += '-c '+ COMBIN 16 | 17 | if "use_outgr" in config["pre_dup_proxy"]: 18 | 19 | REF = config["pre_dup_proxy"]["use_outgr"] 20 | 21 | rule prepare_homeologs_outgr: 22 | input: 23 | fam = ORTHOTABLE, 24 | summary = SUMMARY, 25 | acc = ACCEPTED, 26 | check = f"SCORPiOs-LORelEi_{JOBNAME_L}/integrity_checkpoint.out" 27 | output: 28 | incons = f"{OUTFOLDER}/conflicts", 29 | alltrees = f"{OUTFOLDER}/trees", 30 | params: 31 | genes = OUTGR_GENES, 32 | fomt = SCORPIOS_CONFIG.get("genes_format", "bed"), 33 | combin_args = COMBIN_ARG 34 | shell: 35 | "python -m scripts.lorelei.homeologs_pairs_from_ancestor -i {input.fam} --is_outgroup " 36 | "-homeo {params.genes} -s {input.summary} -oi {output.incons} -oa {output.alltrees} " 37 | "-a {input.acc} {params.combin_args} -f {params.fomt}" 38 | 39 | else: 40 | 41 | REF_FILE = config["pre_dup_proxy"]["use_anc"] 42 | REF = "Pre-duplication" 43 | 44 | rule prepare_homeologs_anc: 45 | input: 46 | fam = ORTHOTABLES, 47 | summary = SUMMARY, 48 | acc = ACCEPTED, 49 | pm = REF_FILE, 50 | check = f"SCORPiOs-LORelEi_{JOBNAME_L}/integrity_checkpoint.out" 51 | output: incons = f"{OUTFOLDER}/conflicts", alltrees = f"{OUTFOLDER}/trees" 52 | params: postdup = POSt_DUP 53 | shell: 54 | "python -m scripts.lorelei.homeologs_pairs_from_ancestor -i {input.fam} -a {input.acc} " 55 | "-homeo {input.pm} -s {input.summary} -oi {output.incons} -oa {output.alltrees} " 56 | "{params.postdup}" 57 | 58 | rule plot_homeologs: 59 | input: incons = f"{OUTFOLDER}/conflicts", all_trees = f"{OUTFOLDER}/trees" 60 | output: f"{OUTFOLDER}/seq_synteny_conflicts_by_homeologs.svg" 61 | conda: "envs/plots.yaml" 62 | shell: 63 | "python -m scripts.lorelei.homeologs_tree_conflicts -i {input.incons} -g 
{input.all_trees} " 64 | "-o {output} --refname '{REF}'" 65 | 66 | rule prepare_genome_plot: 67 | input: ctreedir = CTREES_DIR, summary = SUMMARY, acc = ACCEPTED, check = f"SCORPiOs-LORelEi_{JOBNAME_L}/integrity_checkpoint.out" 68 | 69 | output: fam = f"{OUTFOLDER}/conflicted_gene_families.tsv" 70 | shell: 71 | "python -m scripts.lorelei.write_ancgenes_treeclass -a {input.acc} -t {input.ctreedir} " 72 | "-c {input.summary} -o {output.fam} -r 'Inconsistent'" 73 | 74 | rule prepare_input_rideogram: 75 | input: 76 | fam = f"{OUTFOLDER}/conflicted_gene_families.tsv", 77 | genes = GENES 78 | output: 79 | karyo = f"{OUTFOLDER}/karyo_ide.txt", 80 | feat = f"{OUTFOLDER}/incons_ide.txt" 81 | params: sp = SP 82 | shell: "python -m scripts.lorelei.make_rideograms_inputs -i {input.fam} -g {input.genes} " 83 | "-k {output.karyo} -o {output.feat} -f dyogen" 84 | 85 | rule plot_conflicts_on_genome: 86 | input: 87 | karyo = f"{OUTFOLDER}/karyo_ide.txt", 88 | feat = f"{OUTFOLDER}/incons_ide.txt" 89 | output: temp(f"{OUTFOLDER}/seq_synteny_conflicts_on_genome_tmp.svg") 90 | params: sp = SP 91 | conda: 'envs/rideogram.yaml' 92 | shell: 93 | "Rscript scripts/lorelei/plot_genome.R -k {input.karyo} -f {input.feat} -o {output}" 94 | 95 | rule remove_legend_rideogram: 96 | input: f"{OUTFOLDER}/seq_synteny_conflicts_on_genome_tmp.svg" 97 | output: temp(f"{OUTFOLDER}/seq_synteny_conflicts_on_genome_tmp2.svg") 98 | shell: "sed 's/Low.*//g' {input} > {output} && echo '' >> {output}" 99 | 100 | rule new_legend_and_title_rideogram: 101 | input: f"{OUTFOLDER}/seq_synteny_conflicts_on_genome_tmp2.svg" 102 | output: f"{OUTFOLDER}/seq_synteny_conflicts_on_genome.svg" 103 | params: sp = SP 104 | conda: "envs/plots.yaml" 105 | shell: 106 | "python -m scripts.lorelei.fix_rideogram -i {input} -o {output} -c 1 " 107 | "-t 'Sequence-synteny conflicts on {params.sp} chromosomes' -l 'inconsistent trees'" 108 | 109 | -------------------------------------------------------------------------------- 
/module_orthology_table.smk: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | SCORPiOs snakemake module to build an exhaustive orthology table between ingroup WGD duplicated 4 | species and one outgroup non-duplicated species. 5 | """ 6 | 7 | rule get_gene_families: 8 | """ 9 | Phylogenetic orthologs between the outgroup and ingroups (duplicated species). 10 | """ 11 | input: input_trees, config["species_tree"] 12 | output: orthology_table = OrthoTableStrict, 13 | genes_without_orthologs = UNCERTAIN, 14 | phylo_homologs = [paralogs, orthologs] 15 | shell: """ 16 | python -m scripts.synteny.duplicated_families -t {input[0]} -n {wildcards.outgr}\ 17 | -d {wildcards.wgd} {anc_arg} -s {input[1]} -g {config[genes]}\ 18 | -o {output.orthology_table} -u {output.genes_without_orthologs} -f {config[genes_format]} 19 | """ 20 | 21 | 22 | rule gene_tree_orthologies: 23 | """ 24 | Phylogenetic orthologs in all pairs of ingroups. 25 | """ 26 | input: input_trees, config["species_tree"] 27 | output: a=directory(TreesOrthologies+'/{wgd}') 28 | shell: """ 29 | python -m scripts.trees.orthologs -d {wildcards.wgd} \ 30 | -s {config[species_tree]} -t {input_trees} -o {output.a}\ 31 | {lowcov_arg} {anc_arg} 32 | """ 33 | 34 | 35 | checkpoint outgroup_chromosomes: 36 | """ 37 | Creates a file with chromosomes in the outgroup having entries in the orthology table. 
38 | """ 39 | input: OrthoTableStrict 40 | output: Chr 41 | run: 42 | with open(input[0], 'r') as infile: 43 | c_outgr = {line.split('\t')[0] for i, line in enumerate(infile) if i != 0} 44 | 45 | with open(output[0], 'w') as outfile: 46 | for chrom in c_outgr: 47 | outfile.write(chrom+'\n') 48 | 49 | if "optimize_synteny_support_threshold" in config and config["optimize_synteny_support_threshold"] == 'y': 50 | 51 | rule get_scores_ortho: 52 | input: ot = OrthoTableStrict, po = orthologs, c = Chr, f = paralogs+'_fam' 53 | 54 | output: orthologs+'_scores.pkl' 55 | shell:""" 56 | python -m scripts.synteny.missed_orthologies -i {input.ot} -u {input.po} -c {input.c}\ 57 | --optimize -u_opt_fam {input.f} 58 | """ 59 | 60 | rule get_scores_para: 61 | input: ot = OrthoTableStrict, po = paralogs, c = Chr 62 | 63 | output: s = paralogs+'_scores.pkl', f = paralogs+'_fam' 64 | shell:""" 65 | python -m scripts.synteny.missed_orthologies -i {input.ot} -u {input.po} -c {input.c}\ 66 | --optimize -opt_fam {output.f} 67 | """ 68 | 69 | rule syntenic_support_threshold: 70 | input: o = orthologs+'_scores.pkl', p = paralogs+'_scores.pkl' 71 | output: Threshold 72 | shell:""" 73 | python -m scripts.synteny.f1_score_optimization -i1 {input.o} -i2 {input.p} -out {output} 74 | """ 75 | 76 | else: 77 | 78 | rule pass_synt_support_optimization: 79 | output: touch(Threshold) 80 | 81 | rule add_families_with_synteny_evidence: 82 | """ 83 | Updates the orthology table with synteny-supported orthologies. 
84 | """ 85 | input: a=OrthoTableStrict, b=UNCERTAIN, c=Chr, t=Threshold 86 | output: OrthoTable 87 | params: no_graph = fam_no_graph.replace("{{outgr}}", "{{wildcards.outgr}}")\ 88 | .replace("{{wgd}}","{{wildcards.wgd}}"), 89 | shell: """ 90 | python -m scripts.synteny.missed_orthologies -i {input.a} -u {input.b} -c {input.c} -o {output}\ 91 | -wgd {wildcards.wgd},{wildcards.outgr} -w {config[windowSize]} -f {params.no_graph} `cat {input.t}` 92 | """ 93 | 94 | if int(ITER) > 1: 95 | 96 | rule filter_out_unchanged_regions: 97 | """ 98 | If run in iterative mode, filter orthologytable entries to regions with an updated synteny 99 | context compared to previous iteration. 100 | """ 101 | input: OrthoTable 102 | output: regions 103 | params: OrthoTable = OrthoTable.replace("{{wgd}}", "{{wildcards.wgd}}"), 104 | OrthoTable_prev = OrthoTable_prev.replace("{{wgd}}", "{{wildcards.wgd}}"), 105 | regions = regions.replace("{{outgr}}", "{{wildcards.outgr}}").replace("{{wgd}}",\ 106 | "{{wildcards.wgd}}"), 107 | Acc_prev = Acc_prev.replace("{{wgd}}", "{{wildcards.wgd}}"), 108 | no_graph = fam_no_graph.replace("{{outgr}}", "{{wildcards.outgr}}")\ 109 | .replace("{{wgd}}","{{wildcards.wgd}}"), 110 | wgd = lambda wildcards: wildcards.wgd+","+wildcards.outgr, 111 | combin = incombin.replace("{{wgd}}", "{{wildcards.wgd}}") 112 | 113 | run: 114 | filter_regions.make_region_file(params.OrthoTable, params.OrthoTable_prev,\ 115 | params.Acc_prev, params.regions, config['windowSize'], 116 | params.no_graph, params.wgd, params.combin) 117 | else: 118 | 119 | rule pass_filter: 120 | """ 121 | If run in iterative mode, filter orthologytable entries to regions with an updated synteny 122 | context compared to previous iteration. 
123 | """ 124 | input: OrthoTable 125 | output: regions 126 | shell: """ 127 | touch {output} 128 | """ 129 | 130 | if config["filter_otable_nosynteny"] == 'y': 131 | rule filter_table: 132 | input: OrthoTable 133 | output: OrthoTableF 134 | params: Chr = Chr.replace("{{outgr}}", "{{wildcards.outgr}}").replace("{{wgd}}",\ 135 | "{{wildcards.wgd}}"), 136 | shell:""" 137 | python -m scripts.synteny.filter_no_synteny_genes -i {input} -chr {params.Chr}\ 138 | -w {config[windowSize]} -o {output} -wgd {wildcards.wgd},{wildcards.outgr} 139 | """ 140 | 141 | else: 142 | rule pass_ofilter: 143 | input: OrthoTable 144 | output: OrthoTableF 145 | shell:""" 146 | cp {input} {output} 147 | """ 148 | -------------------------------------------------------------------------------- /module_synteny_ortho_para.smk: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | SCORPiOs snakemake module to predict gene orthologies and paralogies using synteny conservation 4 | patterns, for all pairs of duplicated species. 5 | """ 6 | 7 | ## parallelization scheme --> snakemake parallelization is inefficient with a large number of jobs 8 | ## if many duplicated species, I convert lots of small jobs into a smaller number of bigger jobs 9 | args_pairwise_parallel = "" 10 | BIG_RUN = False 11 | if "parallel_scheme_large_job" in config and config["parallel_scheme_large_job"] == 'y': 12 | BIG_RUN = True 13 | args_pairwise_parallel = "-chr_list "+Chr 14 | 15 | 16 | rule synteny_orthologies_paralogies: 17 | """ 18 | Identifies post-WGD orthologs and paralogs in all pairs of duplicated species, using synteny. 19 | Executed in parallel for each chromosome in the outgroup and each pair of duplicated species. 
20 | """ 21 | input: a=OrthoTableF, d=TreesOrthologies+'/{wgd}', e=regions 22 | output: temp(Pairwise_SyntenyOrthoPred+"/{pairwise}_{chr_outgr}_{outgr}_{wgd}.txt") 23 | params: args_autho = args_autho.replace("{{outgr}}", "{{wildcards.outgr}}").replace("{{wgd}}",\ 24 | "{{wildcards.wgd}}"), 25 | args_parallel = args_pairwise_parallel.replace("{{outgr}}", "{{wildcards.outgr}}")\ 26 | .replace("{{wgd}}","{{wildcards.wgd}}") 27 | shell:""" 28 | python -m scripts.synteny.pairwise_orthology_synteny -i {input.a} \ 29 | -p {wildcards.pairwise} -chr {wildcards.chr_outgr} \ 30 | -ortho {input.d} -o {output} -w {config[windowSize]} \ 31 | -cutoff={config[cutoff]} {params.args_autho} {params.args_parallel} 32 | """ 33 | 34 | 35 | def all_output_species_pair_outgr_chr(wildcards): 36 | """ 37 | Expands the chr_outgr wildcards and returns the list of expected outputs of rule 38 | synteny_orthologies_paralogies. 39 | """ 40 | chr_outgr = [] 41 | with open(checkpoints.outgroup_chromosomes.get(**wildcards).output[0]) as infile: 42 | 43 | chr_outgr += [line.strip() for line in infile] 44 | 45 | if BIG_RUN: 46 | 47 | chr_outgr = ["all_chrom"] 48 | 49 | ALL_DUPLICATED_SPECIES = spt.get_species(config["species_tree"], wildcards.wgd, 50 | ','.join(config["WGDs"].keys()), lowcov) 51 | ALL_DUPLICATED_SPECIES = list(itertools.combinations(ALL_DUPLICATED_SPECIES, 2)) 52 | ALL_SPECIES_PAIRS = [('_').join((i, j)) for (i, j) in ALL_DUPLICATED_SPECIES] 53 | 54 | return expand(Pairwise_SyntenyOrthoPred+"/{pairwise}_{chr_outgr}_{{outgr}}_{{wgd}}.txt", 55 | pairwise=ALL_SPECIES_PAIRS, chr_outgr=chr_outgr) 56 | 57 | 58 | rule orthology_graphs: 59 | """ 60 | Concatenates all pairwise outputs into a single file. 61 | Note: A simple 'cat' command could fail with error 'Argument list too long', if too many inputs. 
62 | """ 63 | input: all_output_species_pair_outgr_chr 64 | output: o=temp(SyntenyOrthoPred) 65 | run: 66 | with open(output.o, 'w') as outfile: 67 | for fname in input: 68 | if wildcards.outgr in fname: 69 | with open(fname) as infile: 70 | outfile.write(infile.read()) 71 | 72 | 73 | rule orthology_graphs_sort_gzip: 74 | """ 75 | Sorts and compresses the file with all concatenated orthologies, for more efficient loading. 76 | """ 77 | input: SyntenyOrthoPred 78 | output: Sorted_SyntenyOrthoPred + '.gz' 79 | params: out = Sorted_SyntenyOrthoPred.replace("{{outgr}}", "{{wildcards.outgr}}")\ 80 | .replace("{{wgd}}", "{{wildcards.wgd}}"), 81 | tmp = "SCORPiOs_"+config["jobname"]+'/' 82 | shell:""" 83 | sort -k 4 {input} -o {params.out} -T {params.tmp} && gzip {params.out}; 84 | """ 85 | -------------------------------------------------------------------------------- /scorpios_lorelei.smk: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | SCORPiOs LORelEi is an extension to the main SCORPiOs pipeline; it analyzes sequence-synteny 4 | conflicts in gene trees. 
5 | """ 6 | 7 | from snakemake.io import load_configfile 8 | import sys 9 | 10 | 11 | # Get SCORPiOs config 12 | SCORPIOS_CONFIGFILE = config["scorpios_config"] 13 | 14 | # Update main SCORPiOs current_iter if necessary 15 | ITERATION = config.get("iter", 0) 16 | 17 | if ITERATION != 0: 18 | SCORPIOS_CONFIGFILE_COPY = '.'.join(SCORPIOS_CONFIGFILE.split(".")[:-1]) + '.copy.yaml' 19 | with open(SCORPIOS_CONFIGFILE, 'r') as infile, open(SCORPIOS_CONFIGFILE_COPY, 'w') as outfile: 20 | iter_updated = False 21 | for line in infile: 22 | if 'current_iter' in line: 23 | outfile.write(f"current_iter: {ITERATION}\n") 24 | iter_updated = True 25 | else: 26 | outfile.write(line) 27 | if not iter_updated: 28 | outfile.write(f"current_iter: {ITERATION}\n") 29 | 30 | SCORPIOS_CONFIGFILE = SCORPIOS_CONFIGFILE_COPY 31 | 32 | # SCORPiOs LORelEi is an extension of SCORPiOs, and depends on SCORPiOs as a subworkflow. 33 | subworkflow scorpios: 34 | workdir: 35 | '.' 36 | configfile: 37 | SCORPIOS_CONFIGFILE 38 | 39 | 40 | # Set all output names 41 | def out_name(name, JOBNAME_S, iteration, wcard_wgd='', wcard_outgr=''): 42 | """ 43 | Generates output names with JOBNAME_S directory prefix and iteration suffix. 44 | Also adds required wildcards. 45 | """ 46 | if wcard_wgd: 47 | name+="_"+wcard_wgd 48 | 49 | if wcard_outgr: 50 | name+="_"+wcard_outgr 51 | 52 | name = "SCORPiOs_"+JOBNAME_S+'/'+name+'_'+str(iteration) 53 | return name 54 | 55 | # Check LORelEi mode (defaults to "diagnostic" when `mode` is not set in config) 56 | assert config.get("mode", "diagnostic").lower() in ["likelihood_tests", "diagnostic"],\ 57 | "Invalid `mode`, please check your config." 58 | 59 | MODE = config.get("mode", "diagnostic").lower() 60 | 61 | # Check that configs are consistent 62 | SCORPIOS_CONFIG = load_configfile(SCORPIOS_CONFIGFILE) 63 | 64 | if len(SCORPIOS_CONFIG["WGDs"]) == 1: 65 | LORE_WGD = list(SCORPIOS_CONFIG["WGDs"].keys())[0] 66 | assert (not config.get("lore_wgd", "") or config.get("lore_wgd", "") == LORE_WGD),\ 67 | "Invalid `lore_wgd`, please check your config." 
68 | 69 | else: 70 | assert config.get("lore_wgd", ""), "SCORPiOs was run to correct several WGDs, you should\ 71 | specify in config the one you want to run LORe analyses on." 72 | LORE_WGD = config["lore_wgd"] 73 | 74 | 75 | LORE_OUTGRS = SCORPIOS_CONFIG["WGDs"][LORE_WGD] 76 | LORE_OUTGR = LORE_OUTGRS.split(',')[0] 77 | 78 | if MODE == "diagnostic": 79 | if len(LORE_OUTGRS.split(',')) > 1: 80 | assert "use_anc" in config["pre_dup_proxy"] or "use_outgr" in config["pre_dup_proxy"] 81 | if "use_outgr" in config["pre_dup_proxy"]: 82 | LORE_OUTGR = config["pre_dup_proxy"]["use_outgr"] 83 | else: 84 | if "use_anc" not in config.get("pre_dup_proxy", {}): #`pre_dup_proxy` may be omitted with a single outgroup 85 | config["pre_dup_proxy"] = config.get("pre_dup_proxy", {}) 86 | config["pre_dup_proxy"]["use_outgr"] = config.get("use_outgr", LORE_OUTGR) 87 | 88 | 89 | # Get SCORPiOs outputs 90 | JOBNAME_S = SCORPIOS_CONFIG["jobname"] 91 | 92 | CONSTREES = scorpios(out_name("Trees/ctrees", JOBNAME_S, ITERATION)) 93 | ACCEPTED = scorpios(out_name("Corrections/Accepted_Trees", JOBNAME_S, ITERATION, LORE_WGD)) 94 | ORTHOTABLE = scorpios(out_name("Families/Homologs", JOBNAME_S, ITERATION, LORE_WGD, LORE_OUTGR)) 95 | 96 | ORTHOTABLES = [] 97 | for OUTGROUP in LORE_OUTGRS.split(','): 98 | ORTHOTABLES.append(scorpios(out_name("Families/Homologs", JOBNAME_S, ITERATION, LORE_WGD, OUTGROUP))) 99 | SUMMARY = scorpios(out_name("Corrections/Trees_summary", JOBNAME_S, ITERATION, LORE_WGD)) 100 | RES = scorpios(out_name("Corrections/Res_polylk", JOBNAME_S, ITERATION)) 101 | SCORPIOS_CORRTREES = scorpios(out_name("SCORPiOs_output", JOBNAME_S, ITERATION)+'.nhx') 102 | COMBIN = out_name("Graphs/outcombin", JOBNAME_S, ITERATION, LORE_WGD) 103 | 104 | SPTREE = SCORPIOS_CONFIG["species_tree"] 105 | 106 | CTREES_DIR = scorpios(CONSTREES+"/"+LORE_WGD+"/") 107 | 108 | JOBNAME_L = JOBNAME_S 109 | if "jname" in config: 110 | JOBNAME_L += '_' + config["jname"] 111 | 112 | 113 | # LORELEI CONFIG (ALL modes) 114 | 115 | SP = config["dup_genome"] 116 | GENES = 
SCORPIOS_CONFIG["genes"] % SP 117 | 118 | 119 | # Set LORelEi WORKFLOW Targets 120 | 121 | if MODE.lower() == "diagnostic": 122 | rule Target: 123 | input: 124 | f"SCORPiOs-LORelEi_{JOBNAME_L}/diagnostic/seq_synteny_conflicts_by_homeologs.svg", 125 | f"SCORPiOs-LORelEi_{JOBNAME_L}/diagnostic/seq_synteny_conflicts_on_genome.svg" 126 | 127 | #include SCORPiOs LORelEi diagnostic 128 | include: "module_lorelei_diagnostic.smk" 129 | 130 | else: 131 | rule Target: 132 | input: 133 | "SCORPiOs-LORelEi_"+JOBNAME_L+"/lktests/lore_aore_on_genome.svg" 134 | 135 | #include SCORPiOs LORelEi lktests 136 | include: "module_lorelei_lktests.smk" 137 | 138 | 139 | rule check_scorpios_output_integrity: 140 | """ 141 | Explicitly verifies that intermediary outputs from SCORPiOs are complete. 142 | """ 143 | input: scorpios("SCORPiOs_"+JOBNAME_S+"/.cleanup_"+str(ITERATION)) 144 | output: touch(f"SCORPiOs-LORelEi_{JOBNAME_L}/integrity_checkpoint.out") 145 | run: 146 | ctrees_a, = glob_wildcards(CONSTREES+"/"+LORE_WGD+"/C_{ctrees}.nh") 147 | ctrees_b, = glob_wildcards(RES+"/"+LORE_WGD+"/Res_{ctrees}.txt") 148 | sys.stdout.write(f"SCORPiOs LORelEi: {MODE} mode\n") 149 | sys.stderr.write('Checking SCORPiOs output integrity...\n') 150 | if not set(ctrees_a) or set(ctrees_a) != set(ctrees_b): 151 | print("Please re-run SCORPiOs, output of the checkpoint rule appears to be incomplete.") 152 | sys.exit(1) 153 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/correct_subtrees_treebest.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | name=$1 4 | alifile=$2 5 | ctree=$3 6 | enstree=$4 7 | 
output=$5 8 | lktestpoly=$6 9 | outgr=$7 #comma separated list of outgroups 10 | treebtree=$8 11 | Sptree=$9 12 | 13 | 14 | alidir=${alifile%/*} 15 | dirtreeb=${treebtree%/*} 16 | 17 | #check if solution with polytomysolver was accepted 18 | VARIABLE=`python scripts/trees/parse_au_test.py -i $lktestpoly -one y` 19 | 20 | #if not, try treebest 21 | if [ "$VARIABLE" = false ]; then 22 | 23 | mkdir -p ${dirtreeb} 24 | 25 | #cut the constrained topology to resolve relations in each orthogroup independently 26 | python -m scripts.trees.cut_subtrees -og $name -c "$ctree" -a "$alifile" -ot "${dirtreeb}/${name}" -oa "${alidir}/${name}" 27 | 28 | orthog_subtrees='' 29 | 30 | for i in {1..2} 31 | do 32 | #if cut_subtrees made a subalignment for orthogroup i, compute its topology with treebest 33 | if [ -f ${alidir}/${name}_$i.fa ]; then 34 | >&2 echo "treebest solution computation for: $name, orthogroup $i" 35 | treebest phyml -n -Z 1e-3 -X 400 -S -f ${Sptree} ${alidir}/${name}_$i.fa > ${dirtreeb}/${name}_$i.nh 36 | orthog_subtrees+=' '${dirtreeb}/${name}_$i.nh 37 | 38 | #if orthogroup has only two genes, we don't need treebest to build the topology 39 | #tree was already generated by cut_subtrees 40 | elif [ -f ${dirtreeb}/${name}_$i.nh ]; then 41 | orthog_subtrees+=' '${dirtreeb}/${name}_$i.nh 42 | fi 43 | 44 | 45 | done 46 | 47 | #merge the resolved subtrees back together 48 | python -m scripts.trees.merge_subtrees -t$orthog_subtrees -outgr $name -o $treebtree 49 | 50 | #compute the likelihood test for this treebest solution 51 | bash scripts/make_lk_test_consel.sh $name $enstree $alifile $treebtree $output 52 | 53 | #clean all temp (per-orthogroup subalignments and subtrees written above to alidir and dirtreeb) 54 | for i in {1..2} 55 | do 56 | if [ -f "${alidir}/${name}_$i.fa" ]; then 57 | 58 | rm -f "${dirtreeb}/${name}_$i.nh" 59 | rm "${alidir}/${name}_$i.fa" 60 | fi 61 | 62 | done 63 | 64 | 65 | #if polytomysolver was accepted don't try treebest 66 | else 67 | touch "$output" 68 | touch "$treebtree" 69 | fi 70 | 
def make_legend(outfilename, title, colors, labels):

    """
    Saves to file a matplotlib figure that contains nothing but a title and a legend.

    Args:
        outfilename (str): name for the output figure file
        title (str): title for the figure
        colors (list): ordered list of colors for the legend
        labels (list): ordered list of labels for the legend

    """

    plt.figure(figsize=(8.44, 6))

    # One data-less square marker per color: drawn only to serve as a legend handle.
    handles = []
    for color in colors:
        drawn = plt.plot([], [], marker='s', color=color, ls="none")
        handles.append(drawn[0])

    plt.axis('off')
    plt.title(title)

    figure = plt.legend(handles, labels, loc="upper right").figure
    figure.canvas.draw()
    figure.savefig(outfilename, transparent=True)
    plt.close("all")


def add_legend(input_svg, legend_svg, outfile):
    """
    Stacks one svg on top of another and saves the result as a new svg.

    Args:
        input_svg (str): name for first svg file (background)
        legend_svg (str): name for second svg file (will be drawn on top of first)
        outfile (str): name for the output figure file

    """
    background = st.fromfile(input_svg)
    background.append(st.fromfile(legend_svg))
    background.save(outfile)
labels and colors in legend." 92 | 93 | make_legend(LEGEND_FILE, ARGS["title"], COLORS, ARGS["labels"]) 94 | 95 | add_legend(ARGS["input"], LEGEND_FILE, ARGS["output"]) 96 | 97 | os.remove(LEGEND_FILE) 98 | -------------------------------------------------------------------------------- /scripts/lorelei/plot_genome.R: -------------------------------------------------------------------------------- 1 | library(RIdeogram) 2 | library(optparse) 3 | 4 | option_list = list( 5 | make_option(c("-k", "--karyo.file"), type="character", help="karyotype file for RIdeogram"), 6 | make_option(c("-f", "--features"), type="character", help="RIdeogram overlaid data file."), 7 | make_option(c("-o", "--outfile"), type="character", default="out_karyotype.svg", 8 | help="Output figure filename. [default= %default]"), 9 | make_option(c("-c", "--ncolors"), type="integer", default=1, 10 | help="Number of colors to use (= number of features to plot.) [default= %default]") 11 | ); 12 | 13 | opt_parser = OptionParser(option_list=option_list); 14 | opt = parse_args(opt_parser); 15 | 16 | karyotype <- read.table(opt$karyo.file, sep = "\t", header = T, stringsAsFactors = F) 17 | overlay_data <- read.table(opt$features, sep = "\t", header = T, stringsAsFactors = F) 18 | 19 | palette <- c("#1f77b4","#ff7f0e","#2ca02c","#d62728","#9467bd","#8c564b","#e377c2","#7f7f7f", 20 | "#bcbd22","#17becf") 21 | 22 | if (opt$ncolors == 1){ 23 | colors <- c('#FF0000') 24 | } else if (opt$ncolors > 1 && opt$ncolors < 11 ){ 25 | colors <- palette[1:opt$ncolors] 26 | } else { 27 | write("Error: can only color karyotype with up to 10 different colors", stderr()) 28 | quit(status=11) 29 | } 30 | 31 | # if (opt$ncolors == 3){ 32 | # colors <- c('#FF0000', "#332288", "#44AA99") #"#DDCC77", "#CC6677", "#117733", "#88CCEE", "#AA4499") 33 | # } 34 | 35 | # if (opt$ncolors == 2){ 36 | # colors <- c('#FF0000', "#332288") #"#DDCC77", "#CC6677", "#117733", "#88CCEE", "#AA4499") 37 | # } 38 | 39 | 40 | ideogram(karyotype = 
def write_ancgenes(clustered_genes, treedir, out_ancgenes, clusters_to_load=None):

    """
    Writes the output 3-columns file, tab-separated: family_id, space-separated genes of the
    family (sorted) and family class.

    Args:
        clustered_genes (dict): class of gene families
        treedir (str): path to the gene trees
        out_ancgenes (str): name of the output file
        clusters_to_load (list, optional): write only entries for these given family classes.
                                           When given, the class written in the third column is
                                           the index of the class in this list.

    """

    with open(out_ancgenes, 'w') as outfile:

        for gene, cluster in clustered_genes.items():

            #Load only required family classes
            if clusters_to_load is not None and cluster not in clusters_to_load:
                continue

            #try different names for the input tree given the tree directory, in this order
            candidates = [treedir + '/' + gene + '.nhx',
                          treedir + '/' + gene + '.nh',
                          treedir + '/C_' + gene + '.nh',
                          treedir + "/" + gene + "_final.nhx"]

            #fall back on the last candidate so that the error message names it, as before
            treefile = next((path for path in candidates if os.path.exists(path)),
                            candidates[-1])

            assert os.path.exists(treefile), f"The file {treefile} does not exist"

            tree = Tree(treefile)
            leaf_names = [leaf.name for leaf in tree.get_leaves()]

            #drop the last '_'-separated field of each leaf name
            leaves = {'_'.join(name.split('_')[:-1]) for name in leaf_names}

            #no leaf name contains '_': keep the names untouched
            if leaves == {''}:
                leaves = set(leaf_names)

            descendants = sorted(leaves)

            if clusters_to_load is not None:
                cluster = str(clusters_to_load.index(cluster))

            outfile.write(gene+'\t'+ ' '.join(descendants)+'\t'+cluster+'\n')


def load_gene_list(input_summary, input_acc=None):

    """
    Loads a tab-delimited summary of tree classes. Blank lines are ignored.

    Args:
        input_summary(str): path to the two-columns tab-delimited input file, giving a family_id to
                            tree class correspondance.
                            The family_id should be the name of the corresponding tree file for
                            write_ancgenes to work properly.
        input_acc(str, optional): if input is SCORPiOs-generated sequence-synteny inconsistent trees
                                  summary, provide here the summary of accepted correction.
                                  Indeed, gene trees that were initially found to be
                                  synteny-inconsistent but were later corrected should be defined as
                                  consistent.

    Returns:
        dict: for each gene family, the corresponding gene tree class

    """

    genes = {}
    with open(input_summary, 'r') as infile:
        for line in infile:
            fields = line.strip().split('\t')

            #skip empty lines (typically a trailing newline) instead of crashing on them
            if fields == ['']:
                continue

            genes[fields[0]] = fields[1]

    if input_acc is not None:

        with open(input_acc, 'r') as infile:
            acc = {line.strip().split('\t')[0] for line in infile}

        #families whose correction was accepted are no longer inconsistent
        for gene, gene_class in genes.items():
            if gene_class == "Inconsistent" and gene in acc:
                genes[gene] = "Consistent"

    return genes


def write_summary(summary_dict, output_file):
    """
    Writes a simpler 2-columns file with family_id and family class.

    Args:
        summary_dict (dict): class of gene families
        output_file (str): name of the output file
    """
    with open(output_file, 'w') as out:
        out.writelines(key + '\t' + value + '\n' for key, value in summary_dict.items())
#!/usr/bin/env bash

# Likelihood test (AU-test, via consel) between a corrected tree and the original tree.
# Positional arguments:
#   $1 name    : identifier used for RAxML run names and temporary files
#   $2 enstree : original gene tree
#   $3 alifile : multiple alignment
#   $4 cortree : corrected (solution) gene tree
#   $5 output  : file receiving the consel `catpv` report

name=$1
enstree=$2
alifile=$3
cortree=$4
output=$5

workingDir=$(pwd)

alidir=${alifile%/*}
ensdir=${enstree%/*}
cordir=${cortree%/*}

#name of the reduced alignment RAxML may write next to the input alignment.
#Remembered here because $alifile itself is re-pointed to it below.
reduced_ali="${alifile}.reduced"


#check alignment and remove undetermined columns (if any)
raxmlHPC -f c --print-identical-sequences -n "${name}" -m GTRGAMMA --HKY85 -s "${alifile}" -w "${workingDir}/${alidir}/" >&2
if [ -s "${reduced_ali}" ]; then
    alifile="${reduced_ali}"
fi
rm "${alidir}/RAxML_info.${name}"


#if not already computed for original tree, compute sites likelihood under HKY model for both trees
if [ ! -s "${alidir}/${name}_a.lk" ]; then

    #cat the two tree to a single file and remove nhx tags and remove '()' around a single leaf
    cat "${cortree}" "${enstree}" | sed -e 's/\[[^][]*\]//g' -e 's/(\([^,]*\))/\1/g' > "${cordir}/trees_${name}.nh"

    #compute site lk
    raxmlHPC -f G -n "${name}" -m GTRGAMMA --HKY85 -s "${alifile}" -z "${cordir}/trees_${name}.nh" -w "${workingDir}/${alidir}/" >&2
    rm "${cordir}/trees_${name}.nh"

    #rename output
    mv "${alidir}/RAxML_perSiteLLs.${name}" "${alidir}/${name}_a.lk"
    cp "${alidir}/${name}_a.lk" "${alidir}/tmp_${name}.lk"

#compute sites likelihood under HKY model for new solution tree only (original already computed)
else
    #remove nhx tags
    sed -e 's/\[[^][]*\]//g' "${cortree}" > "${cordir}/tmp_${name}.nh"

    #compute site lk
    raxmlHPC -f g -n "${name}" -m GTRGAMMA --HKY85 -s "${alifile}" -z "${cordir}/tmp_${name}.nh" -w "${workingDir}/${alidir}/" >&2
    rm "${cordir}/tmp_${name}.nh"

    #extract original tree site log-lk and put it together in one file
    #The inner sed keeps only line 2 of the new RAxML output; the outer sed reads that line
    #followed by ${name}_a.lk, holds the new line, and swaps it in place of the 3rd line of
    #the combined stream (i.e. the first tree of ${name}_a.lk). The other lines are kept.
    sed '1h;1d;'3'x' <(sed 2'!d;q' "${alidir}/RAxML_perSiteLLs.${name}") "${alidir}/${name}_a.lk" > "${alidir}/${name}_b.lk"
    cp "${alidir}/${name}_b.lk" "${alidir}/tmp_${name}.lk"

fi


#workaround since consel decides to trim filenames containing '.' (looks like extension split issue)
namenew="${name//./}"
mv "${alidir}/tmp_${name}.lk" "${alidir}/tmp_${namenew}.lk"

#test if difference in likelihood is signifiant with the AU-Test using consel
makermt --puzzle "${alidir}/tmp_${namenew}.lk" >&2
consel "${alidir}/tmp_${namenew}" >&2
catpv "${alidir}/tmp_$namenew.pv" > "$output"

## CLEAN ALL TEMP ##
#clean all consel temp
rm "${alidir}/tmp_${namenew}.lk"
rm "${alidir}/tmp_${namenew}.rmt"
rm "${alidir}/tmp_${namenew}.vt"
rm "${alidir}/tmp_${namenew}.pv"
rm "${alidir}/tmp_${namenew}.ci"

#clean tmp logs
rm "${alidir}/RAxML_info.${name}"

#clean raxml tmp
#The previous test used "${alifile}.reduced" after $alifile had been re-pointed to the
#reduced alignment, i.e. it looked for "<ali>.reduced.reduced" and never removed anything.
rm -f "${reduced_ali}"
"${alidir}/RAxML_perSiteLLs.${name}_lktest" "${alidir}/${name}.lk" 26 | 27 | #workaround since consel decides to trim filenames containing '.' (looks like extension split issue) 28 | namenew="${name//./}" 29 | mv "${alidir}/${name}.lk" "${alidir}/${namenew}.lk" 30 | 31 | #test if difference in likelihood is signifiant with the AU-Test using consel 32 | makermt --puzzle "${alidir}/${namenew}.lk" >&2 33 | consel "${alidir}/${namenew}" >&2 34 | catpv "${alidir}/$namenew.pv" > "$output" 35 | 36 | 37 | ## CLEAN ALL TEMP ## 38 | #clean all consel temp 39 | rm "${alidir}/${namenew}.lk" 40 | rm "${alidir}/${namenew}.rmt" 41 | rm "${alidir}/${namenew}.vt" 42 | rm "${alidir}/${namenew}" 43 | rm "${alidir}/${namenew}.pv" 44 | rm "${alidir}/${namenew}.ci" 45 | -------------------------------------------------------------------------------- /scripts/synteny/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/scripts/synteny/__init__.py -------------------------------------------------------------------------------- /scripts/synteny/f1_score_optimization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script loads 2 scores distributions and finds the optimal discriminative threshold to 5 | separate distributions based on the F1-score, assuming true positives to recover are in the 6 | distribution of higher scores. 7 | 8 | Inputs are python lists pickled in files, output is written to file with the :code:`--support` 9 | prefix, to call the script missed_orthologies.py in snakemake with the :code:`--support` arg. 
def load_scores(input1, input2):

    """
    Unpickles the lists of scores and orders them so that the distribution with the higher
    median comes first.

    Args:
        input1 (str) : paths to the pickled object 1
        input2 (str) : paths to the pickled object 2

    Returns:

        tuple: a tuple containing:

            scores1, scores2: the unpickled lists, scores1 being the one with the higher median
    """

    # NOTE: pickle must only be used on files produced by the pipeline itself, never on
    # untrusted input.
    with open(input1, 'rb') as infile1, open(input2, 'rb') as infile2:
        scores1 = pickle.load(infile1)
        scores2 = pickle.load(infile2)

    # The median of an empty list is nan (and numpy warns): nothing to reorder in that case.
    if len(scores1) and len(scores2) and np.median(scores1) < np.median(scores2):
        scores1, scores2 = scores2, scores1

    return scores1, scores2


def compute_f1(scores1, scores2, threshold):

    """
    Computes the F1-score for a given threshold. Scores of `scores1` at or above the threshold
    are true positives, scores of `scores2` at or above the threshold are false positives.

    Args:
        scores1 (list): list of scores 1
        scores2 (list): list of scores 2
        threshold (float): threshold value

    Returns:
        float: F1-score (0 when it is undefined)
    """

    true_pos = sum(1 for x in scores1 if x >= threshold)
    false_neg = len(scores1) - true_pos
    false_pos = sum(1 for x in scores2 if x >= threshold)

    recall = true_pos / float(true_pos + false_neg) if (true_pos + false_neg) else 0
    precision = true_pos / float(true_pos + false_pos) if (true_pos + false_pos) else 0

    f1_score = 2*precision*recall / (precision + recall) if (precision + recall) else 0

    return f1_score


def get_discriminant_threshold(input1, input2, test_range=None):

    """
    Finds the most discriminative threshold between the two distributions based on F1-score.

    Args:
        input1, input2 (str): paths to the pickled objects
        test_range (iterable, optional): thresholds to test, default is 0 to 29.

    Returns:
        int: optimized threshold based on F1-score, or 2 if one of the distributions is empty.
    """

    # None rather than a list literal: default arguments are evaluated once and shared.
    if test_range is None:
        test_range = range(30)

    scores1, scores2 = load_scores(input1, input2)

    if not scores1 or not scores2:
        return 2

    best_threshold = 0
    max_f1 = 0
    for i in test_range:
        f1_score = compute_f1(scores1, scores2, i)
        f1_score = round(f1_score, 2) #take the most conservative threshold for improvement < 0.01

        # '>=': on ties (after rounding) the later tested threshold wins
        if f1_score >= max_f1:
            best_threshold = i
            max_f1 = f1_score
    return best_threshold


if __name__ == '__main__':

    #Arguments

    PARSER = argparse.ArgumentParser(description=__doc__,\
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    #Required

    PARSER.add_argument('-i1', '--input1', help='First score distribution', required=True)

    PARSER.add_argument('-i2', '--input2', help='Second score distribution', required=True)


    #Optional

    PARSER.add_argument('-out', '--output', help='Output file.', required=False, default="out")

    ARGUMENTS = vars(PARSER.parse_args())

    THRESHOLD = get_discriminant_threshold(ARGUMENTS['input1'], ARGUMENTS["input2"])

    with open(ARGUMENTS["output"], 'w') as OUTFILE:

        # a threshold of 0 leaves the file empty (no --support argument is written)
        if THRESHOLD:
            OUTFILE.write('--support '+str(round(THRESHOLD, 1)))
def print_out_stats(stats_dict, wgd=''):

    """
    Reports on stdout how many genes per species lack synteny support and will therefore be
    ignored in scorpios synteny analysis.

    Args:
        stats_dict (dict): a dict with the number of filtered genes per species
        wgd (str, optional): the wgd for which the filter was run, optionally followed by a
                             comma and the outgroup name

    """

    outgr = ''
    if ',' in wgd:
        wgd, outgr = wgd.split(",")

    banner = "-----------------------Genes without synteny support-------------------------"
    closing = "----------------------------------------------------------------------------"

    # Build the report first, one entry per original print call, then emit it.
    if not stats_dict:
        report = ['\n',
                  banner,
                  "Whole-genome duplication: {}".format(wgd),
                  "\n",
                  "0 orthologs in the final table without synteny support",
                  closing,
                  "\n"]

    else:
        report = ['\n',
                  banner,
                  " Whole-genome duplication: {} outgroup {}".format(wgd, outgr),
                  "\n"]

        for species, nb_genes in stats_dict.items():
            report.append(" {} {} orthologs in the final table without synteny support"\
                          .format(nb_genes, species))

        report += ["\n", closing, "\n"]

    for entry in report:
        print(entry)
formatter_class=argparse.RawDescriptionHelpFormatter) 69 | 70 | # Required 71 | 72 | PARSER.add_argument('-i', '--input', type=str, help='Orthology Table', required=True) 73 | 74 | PARSER.add_argument('-chr', '--chr_outgr', type=str, help='Outgroup chromosome to consider.', 75 | required=True) 76 | 77 | # Optional 78 | 79 | PARSER.add_argument('-o', '--out', type=str, help='Result file', required=False, default="out") 80 | 81 | PARSER.add_argument('-w', '--windowSize', type=int, help='Size of the sliding window', 82 | required=False, default=15) 83 | 84 | PARSER.add_argument('-wgd', '--wgd', type=str, help='Tag for the run to write along with\ 85 | output statistics', required=False, default="") 86 | 87 | ARGS = vars(PARSER.parse_args()) 88 | 89 | ORTHOTABLE = ARGS["input"] 90 | 91 | with open(ORTHOTABLE, 'r') as infile: 92 | FIRST_LINE = infile.readline() 93 | SPECIES = [i for i in FIRST_LINE.strip().split('\t')[1:] if i] 94 | OUTGR = FIRST_LINE.strip().split('\t')[0] 95 | 96 | CHROMOSOMES = [] 97 | 98 | with open(ARGS["chr_outgr"], 'r') as infile: 99 | CHROMOSOMES += [line.strip() for line in infile] 100 | 101 | DGENES = {} 102 | ALL_ORTHOS = collections.defaultdict(dict) 103 | 104 | for chrom in CHROMOSOMES: 105 | for spec in SPECIES: 106 | 107 | if spec not in DGENES: 108 | DGENES[spec] = {"synteny":set(), "nosynteny":set()} 109 | 110 | table_entries_sp = ut.complete_load_orthotable(ORTHOTABLE, chrom, spec) 111 | 112 | ALL_ORTHOS[chrom][spec] = ut.complete_load_orthotable(ORTHOTABLE, chrom, spec, 113 | load_no_position_genes=True) 114 | 115 | start = 0 116 | stop = len(table_entries_sp) 117 | 118 | #check that regions is at least as long as windowSize 119 | if (stop - start) + 1 >= ARGS["windowSize"]: 120 | 121 | #sliding window of size win_size 122 | for i in range(start, stop - ARGS["windowSize"] + 1): 123 | 124 | dup_seg_sp = synt.to_dup_segments(table_entries_sp[i:i+ARGS["windowSize"]]) 125 | 126 | ind_chrom_one_gene = np.where(np.sum(dup_seg_sp.matrix, 
axis=0) == 1)[0] 127 | 128 | for pos in dup_seg_sp.genes_dict: 129 | 130 | for curr_chrom in dup_seg_sp.genes_dict[pos]: 131 | 132 | if curr_chrom not in ind_chrom_one_gene: 133 | 134 | DGENES[spec]["synteny"].update(set(dup_seg_sp.genes_dict[pos]\ 135 | [curr_chrom])) 136 | 137 | else: 138 | DGENES[spec]["nosynteny"].update(set(dup_seg_sp.genes_dict[pos]\ 139 | [curr_chrom])) 140 | NO_SYNT = set() 141 | STATS = {} 142 | for spec in DGENES: 143 | DGENES[spec]["nosynteny"] = DGENES[spec]["nosynteny"].difference(DGENES[spec]["synteny"]) 144 | NO_SYNT.update(DGENES[spec]["nosynteny"]) 145 | STATS[spec] = STATS.get(spec, 0) + len(DGENES[spec]["nosynteny"]) 146 | 147 | 148 | ut.write_updated_orthotable(ALL_ORTHOS, OUTGR, SPECIES, CHROMOSOMES, ARGS["out"], 149 | wsize=ARGS['windowSize'], filt_genes=NO_SYNT) 150 | 151 | print_out_stats(STATS, wgd=ARGS["wgd"]) 152 | -------------------------------------------------------------------------------- /scripts/trees/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DyogenIBENS/SCORPIOS/ed2c5a28fc8b98f08e5adc9334a62904b12ca18b/scripts/trees/__init__.py -------------------------------------------------------------------------------- /scripts/trees/build_treebest_trees.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Script to build starting gene trees with TreeBeST best, from CDS back translated nucleotide 5 | alignments, given a species tree and a gene species mapping file. 6 | 7 | Example:: 8 | 9 | $ python -m build_treebest_trees -a alis_v89.fa.gz -sp species_tree_v89.nwk 10 | -m genesp_v89.txt [-o treebest_forest_v89.nhx] [-nc 1] [-tmp tmp] 11 | """ 12 | 13 | import os 14 | import argparse 15 | import sys 16 | import gzip 17 | import multiprocessing 18 | import traceback 19 | import glob 20 | import signal 21 | 22 | from ete3 import Tree 23 | 24 | from . 
def init_worker():
    """
    Pool initializer: makes the worker ignore SIGINT, the interruption is handled by the
    parent process.
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def _run_treebest_best(ali_file, sptree, out_tree, X, extra_args=()):
    """
    Runs `treebest best` on `ali_file`, writes its stdout to `out_tree`, returns the exit code.
    """
    import subprocess

    cmd = ["treebest", "best", ali_file, *extra_args, "-f", sptree, "-X", str(X),
           "-Z", "1e-3", "-q"]

    # argument list + explicit stdout file: no shell involved, paths need no quoting
    with open(out_tree, 'w') as out:
        return subprocess.run(cmd, stdout=out, check=False).returncode


def worker_build_tree(ali, genes_sp, sptree, ali_id, tmp_folder='', X=10):

    """
    Build a gene tree from the multiple alignment string in `ali`, while accounting for the
    species tree `sptree`, using treebest best.

    If the output tree file already exists, the file will not be updated. This allows to re-
    execute a SCORPiOs snakemake run without recomputing all trees in case of error.

    Args:
        ali (str): the fasta multiple alignment
        genes_sp (str): the corresponding genes to species mapping
        sptree (str): path to the newick species tree
        ali_id (str): identifier of the tree, used in the output .nhx file name.
        tmp_folder (str): path to temp individual ali, will store temp individual tree. It is
                          used as a plain prefix, hence should end with a path separator.
        X (int, optional): -X parameter for treebest best (default=10).

    Returns:
        bool: True if no Exception was raised.

    Raises:
        Exception: if treebest fails both with and without alignment filtering.

    """
    try:

        tmp_ali = tmp_folder+"tmp_ali_"+str(ali_id)+".fa"
        out_tree = tmp_folder+"tmp_tree_"+str(ali_id)+".nhx"

        if os.path.isfile(out_tree) and os.path.getsize(out_tree) > 0:
            return True

        sys.stderr.write("Building tree for alignment number "+str(ali_id)+"\n")
        sys.stderr.flush()

        mapping = {}
        for line in genes_sp.strip().split('\n'):
            name, species = line.strip().split('\t')
            mapping[name] = species

        seq = ut.get_subali(ali, mapping, mapping)

        ut.write_fasta(seq, tmp_ali)

        return_value = _run_treebest_best(tmp_ali, sptree, out_tree, X)

        #if treebest failed, we try without filtering the alignment
        if return_value != 0:
            return_value = _run_treebest_best(tmp_ali, sptree, out_tree, X,
                                              extra_args=("-F", "0"))

        if return_value != 0:

            #do not leave a partial tree behind: a later run would take it for a finished one
            if os.path.exists(out_tree):
                os.remove(out_tree)

            raise Exception('treebest failed to build a tree for alignment {}'.format(ali_id))

        os.remove(tmp_ali)
        return True

    except Exception:

        #print the traceback in the child, then let the parent see the exception
        traceback.print_exc()
        raise



if __name__ == '__main__':

    PARSER = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    PARSER.add_argument('-a', '--ali', type=str, help='Single file with all alignments (.fa).',
                        required=True)

    PARSER.add_argument('-sp', '--species_tree', help='Newick species tree file.', required=True)

    PARSER.add_argument('-m', '--genes_sp_map', help='Single file with corresponding gene to\
                        species mapping', required=True)

    PARSER.add_argument('-o', '--output', type=str, help='Output name for the output forest',
                        required=True)

    PARSER.add_argument('-nc', '--ncores', type=int, help='Number of threads', required=False,
                        default=1)

    PARSER.add_argument('-tmp', '--tmp_folder', type=str, help='Path for tmp invidual trees',
                        required=False, default='')

    PARSER.add_argument('-X', '--X', type=int, help='treebest -X argument', required=False,
                        default=10)

    ARGS = vars(PARSER.parse_args())

    if ARGS["tmp_folder"]:
        os.makedirs(ARGS["tmp_folder"], exist_ok=True)

    sys.stderr.write("Building starting trees with TreeBeST\n")

    OPEN = open
    if ARGS["ali"].split('.')[-1] == 'gz':
        OPEN = gzip.open

    try:
        POOL = multiprocessing.Pool(ARGS["ncores"], init_worker)

        ASYNC_RES = []

        with OPEN(ARGS["ali"], "rt") as INFILE_A, open(ARGS["genes_sp_map"], 'r') as INFILE_GSP:

            #alignments and mappings are read in lockstep, one tree per pair
            for i, (ALI, MAP) in enumerate(zip(ut.read_multiple_objects(INFILE_A),
                                               ut.read_multiple_objects(INFILE_GSP))):

                RES = POOL.apply_async(worker_build_tree, args=(ALI, MAP, ARGS["species_tree"], i,
                                                                ARGS["tmp_folder"], ARGS["X"]))
                ASYNC_RES += [RES]

        POOL.close()
        POOL.join()


        for RES in ASYNC_RES:
            if not RES.get():
                sys.stderr.write("An error occured in a child process\n")
                sys.exit(1)

    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        POOL.terminate()
        POOL.join()
        sys.exit(1)

    sys.stderr.write("Writing trees into a single gene tree forest file...\n")

    #number of submitted alignments (0 for an empty input, where no tree file exists)
    NB_TREES = len(ASYNC_RES)

    with open(ARGS["output"], 'w') as outforest:
        for j in range(NB_TREES):

            TREE = Tree(ARGS["tmp_folder"]+"tmp_tree_"+str(j)+".nhx")

            #remove sp_name: strip only the trailing '_species' field of each leaf name
            for leaf in TREE.get_leaves():
                leaf.name = leaf.name.rsplit('_', 1)[0]

            #format root node and include features and write
            TREE = TREE.write(features=["D", "S", "DD", "DCS", "B"], format_root_node=True,
                              format=1)
            outforest.write(TREE)
            outforest.write('\n//\n')

    #remove single tree files
    TMP_TREES = glob.glob(ARGS["tmp_folder"]+"tmp_tree_*.nhx")
    for tmp_tree in TMP_TREES:
        os.remove(tmp_tree)

    #remove treebest temp... (absent when no tree had to be built in this run)
    if os.path.exists("filtalign.fa"):
        os.remove("filtalign.fa")
def convert_tree(treefile, output, d_conv=None, text=''):

    """
    Converts gene IDs in an input tree. A conversion dictionary can be given, otherwise it is
    generated. The converted tree is written with its topology only.

    Args:
        treefile (file): input tree in newick format.
        output (str): name for the output file.
        d_conv (dict, optional): Conversion from old to new IDs.
        text (str, optional): Debug information

    Returns:
        dict: Conversion old to new IDs.

    """

    tree = Tree(treefile)
    leaves = tree.get_leaves()

    if not d_conv:

        #For treebest-type IDs (i.e last '_' is followed by species name):
        #generated IDs are 3 letters from species name + a unique number.
        d_conv = {leaf.name: leaf.name.split('_')[-1][0:3]+str(nb)
                  for nb, leaf in enumerate(leaves)}

    assert len(leaves) == len(d_conv), "Trees have different number of leaves {}".format(text)

    for leaf in leaves:

        assert leaf.name in d_conv, "{} present in {} but not in all trees".format(leaf.name,
                                                                                   treefile)
        leaf.name = d_conv[leaf.name]

    tree.prune(leaves)

    tree.write(outfile=output, format=9)

    return d_conv


def convert_ali(fastafile, output, d_conv):
    """
    Converts gene IDs in an input multiple gene alignment in fasta format.
    The conversion dictionary must be given. Sequence lines are copied unchanged.

    Args:
        fastafile (file): input alignment in fasta format.
        output (str): name for the output file.
        d_conv (dict): Conversion from old to new IDs.

    Raises:
        AssertionError: if a gene of the alignment is not in `d_conv` or if the number of
                        sequences differs from the number of converted IDs.

    """

    nb_genes = 0

    with open(fastafile, 'r') as infile, open(output, 'w') as outfile:

        for line in infile:

            #a header is a line starting with '>' (not any line merely containing it)
            if line.startswith(">"):

                nb_genes += 1

                #strip() rather than [1:-1]: a header ending the file has no newline to drop
                leaf = line[1:].strip()
                assert leaf in d_conv, "{} present in {} but not in trees".format(leaf, fastafile)
                outfile.write(">" + d_conv[leaf] + "\n")

            else:

                outfile.write(line)

    assert len(d_conv) == nb_genes,\
           "Trees and alignment {} have different number of genes".format(fastafile)


def _tmp_filename(path):
    """
    Returns `path` with its file name prefixed with 'tmp_' (same directory).
    """
    directory, filename = os.path.split(path)
    if directory:
        return directory + '/tmp_' + filename
    return 'tmp_' + filename


if __name__ == '__main__':

    ## ARGS
    PARSER = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    PARSER.add_argument('-t', '--treefiles', type=str, help='Folder with constrained trees',
                        required=True, nargs='+')

    PARSER.add_argument('-a', '--alifiles', type=str, help='Folder with subalis',
                        required=True, nargs='+')

    ARGS = vars(PARSER.parse_args())

    for i, treef in enumerate(ARGS['treefiles']):

        outfilename = _tmp_filename(treef)

        #the first tree defines the conversion, which is then imposed on all other trees
        if i == 0:

            conversion = convert_tree(treef, outfilename)

        else:

            convert_tree(treef, outfilename, conversion, ARGS['treefiles'])

    for ali in ARGS['alifiles']:

        convert_ali(ali, _tmp_filename(ali), conversion)
def get_orthogroups_genes(ctree, outgr_gene_name):
    """
    Finds the (at most two) polytomies in the constrained tree topology, along with the
    outgroup gene.

    Args:
        ctree (str): input tree file in newick format.
        outgr_gene_name (str): gene name of the outgroup gene (without species tag).

    Returns:
        dict: the 1 or 2 polytomy node(s) (keys) and their corresponding size (values).
        str: full outgroup gene name (with species tag), '' if not in the tree.
    """

    ctree = Tree(ctree)
    orthogroups = {}
    outgr = ''

    for leaf in ctree.get_leaves():

        #leaf names are expected as genename_species
        if outgr_gene_name == '_'.join(leaf.name.split('_')[:-1]):
            outgr = leaf.name

        elif len(orthogroups) < 2 and leaf.up not in orthogroups:
            orthogroups[leaf.up] = len(leaf.up.get_leaves())

        #stop only once the outgroup has been seen too, otherwise it would be returned empty
        #whenever the outgroup leaf comes after the second orthogroup in the traversal
        if outgr and len(orthogroups) == 2:
            break

    return orthogroups, outgr


def write_resolved_tree(orthog_tree, outgr_gene_name, out):
    """
    Writes solution trees for orthogroup with only 2 genes.

    Args:
        orthog_tree (ete3.TreeNode): Node with the 2 descendants of the orthogroup.
        outgr_gene_name (str): full outgroup gene name (with species tag).
        out (str): filename to write the tree.
    """

    new_tree = Tree()

    new_tree.add_child(orthog_tree)
    new_tree.add_child(name=outgr_gene_name)

    new_tree.prune(new_tree.get_leaves())

    new_tree.write(outfile=out, format=1)


def _remove_if_exists(path):
    """
    Removes the file `path` if it exists.
    """
    if os.path.isfile(path):
        os.remove(path)


if __name__ == '__main__':

    PARSER = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    PARSER.add_argument('-c', '--ctree', help='Newick constrained tree to resolve', required=True)

    PARSER.add_argument('-a', '--ali', help='Corresponding sub-alignment file (fasta)',
                        required=True)

    PARSER.add_argument('-og', '--outgr_gene', help='Name of the outgroup gene', required=True)

    PARSER.add_argument('-ot', '--outrees', help='Path to write resolved subtrees',
                        required=True)

    PARSER.add_argument('-oa', '--outalis', help='Path to write resolved subalis',
                        required=True)


    ARGS = vars(PARSER.parse_args())


    ORTHOGROUPS, OUTGR_GENE = get_orthogroups_genes(ARGS["ctree"], ARGS["outgr_gene"])

    #Load alignment
    with open(ARGS["ali"], 'r') as infile:
        ALI = infile.read().strip()

    #Sort orthogroups by size
    ORTHOGROUPS = sorted(ORTHOGROUPS.items(), key=operator.itemgetter(1), reverse=True)

    for k, (tree, size) in enumerate(ORTHOGROUPS, start=1):

        ali_path = ARGS["outalis"]+'_'+str(k)+'.fa'

        if size > 2:

            #write subalignment to compute a gene tree with it
            seq = get_subali(ALI, [i.name for i in tree.get_leaves()] + [OUTGR_GENE])
            write_fasta(seq, ali_path)

        else:
            #if 2 leaves + outgroup gene, the topology is already resolved
            outfile = ARGS["outrees"]+'_'+str(k)+'.nh'
            write_resolved_tree(tree, OUTGR_GENE, outfile)

            #safeguard to remove any potential artefacts (very unlikely) from previous run
            _remove_if_exists(ali_path)

    #safeguard to remove potential artefacts (very unlikely) from previous run
    if len(ORTHOGROUPS) == 1:
        _remove_if_exists(ARGS["outalis"]+'_2.fa')
25 | 26 | Args: 27 | node (ete3 TreeNode): the input node 28 | tags_corr (list of str): list of tags to search for 29 | 30 | Returns: 31 | bool: True if at least one of the input `tags_corr` is in leaves below `node` 32 | """ 33 | 34 | has_tag = False 35 | 36 | for leaf in node.get_leaves(): 37 | for tag_corr in tags_corr: 38 | if hasattr(leaf, tag_corr): 39 | has_tag = True 40 | break 41 | 42 | return has_tag 43 | 44 | 45 | if __name__ == '__main__': 46 | 47 | PARSER = argparse.ArgumentParser(description=__doc__, 48 | formatter_class=argparse.RawDescriptionHelpFormatter) 49 | 50 | PARSER.add_argument('-i', '--iter', help='Total number of iterations', required=True, type=int) 51 | 52 | PARSER.add_argument('-c', '--cor_f', help='path to corrected forests', required=True, type=str) 53 | 54 | PARSER.add_argument('--internal', help='tag also internal corrected wgd nodes', 55 | action='store_true') 56 | 57 | PARSER.add_argument('-sp', '--sptree', type=str, required=False, default='') 58 | 59 | PARSER.add_argument('-o', '--out', type=str, required=False, default="out.nhx") 60 | 61 | ARGS = vars(PARSER.parse_args()) 62 | 63 | assert (ARGS["internal"] and ARGS["sptree"]) or not ARGS["internal"],\ 64 | "A species tree should be provided to tag corrected wgd internal nodes" 65 | 66 | CORRECTIONS = {} 67 | 68 | COR_FOREST = ARGS["cor_f"] 69 | 70 | COR_TAGS_ALL = [] 71 | 72 | COR_TAGS_INT = [] 73 | 74 | COR_TAGS = [] 75 | 76 | 77 | for itera in range(1, ARGS["iter"] + 1): 78 | k = 0 79 | 80 | if itera != ARGS["iter"]: 81 | 82 | sys.stderr.write(f"Browsing corrected forest at iteration {itera}\n") 83 | 84 | else: 85 | eprint = (f"Browsing corrected forest at final iteration ({itera}) and writing final " 86 | f"forest with tags\n") 87 | sys.stderr.write(eprint) 88 | 89 | with open(COR_FOREST % itera, 'r') as f, open(ARGS["out"], 'w') as OUTFILE: 90 | 91 | for tree in ut.read_multiple_objects(f): 92 | 93 | if k%1000 == 0 and k > 0: 94 | sys.stderr.write(f"Browsed {k} trees\n") 95 
| 96 | k += 1 97 | 98 | t = Tree(tree) 99 | leaves = t.get_leaves() 100 | 101 | for i in leaves: 102 | 103 | corr = [att for att in vars(i) if "CORR_ID_" in att] 104 | 105 | for tag in corr: 106 | 107 | CORRECTIONS[i.name] = CORRECTIONS.get(i.name, []) 108 | 109 | CORRECTIONS[i.name].append((tag+'_'+str(itera), getattr(i, tag))) 110 | 111 | if tag+'_'+str(itera) not in COR_TAGS_ALL: 112 | 113 | COR_TAGS_ALL.append(tag+'_'+str(itera)) 114 | 115 | if tag not in COR_TAGS_INT: 116 | COR_TAGS_INT.append(tag) 117 | 118 | if itera == ARGS["iter"]: 119 | if i.name in CORRECTIONS: 120 | seen = [] 121 | for tag, value in CORRECTIONS[i.name]: 122 | wgd = tag.split("_")[-2] 123 | if wgd not in seen: 124 | setattr(i, tag, value) 125 | seen.append(wgd) 126 | 127 | if itera == ARGS["iter"]: 128 | 129 | all_features = ["S", "D", "DD", "DCS"] + COR_TAGS_ALL 130 | 131 | if not ARGS["internal"]: 132 | 133 | tree_nhx = t.write(format=1, features=all_features, format_root_node=True) 134 | 135 | OUTFILE.write(tree_nhx+'\n//\n') 136 | 137 | 138 | else: 139 | 140 | wgds = {i.split('_')[-2] for i in COR_TAGS_ALL} 141 | wgds_dict = {} 142 | for wgd in wgds: 143 | DUP_SPECIES = spt.get_species(ARGS["sptree"], wgd) 144 | wgds_dict[wgd] = DUP_SPECIES 145 | 146 | t = Tree(t.write(format=1, features=all_features, format_root_node=True)) 147 | for wgd in wgds_dict: 148 | 149 | leaves = t.get_leaves() 150 | if len(leaves) == 1: 151 | continue 152 | 153 | #find all monphyletic teleost groups 154 | tag_duplicated_species(leaves, wgds_dict[wgd]) 155 | 156 | #all clades of teleost genes, 157 | #by definition corrected subtrees will only contain dup. 
def remove_outgroup(tree, outgr):
    """
    Loads a subtree, roots it on the outgroup gene and removes the outgroup gene.

    Args:
        tree (str): Input tree (newick string or file, passed as is to `ete3.Tree`)
        outgr (str): Outgroup gene name (without species tag)

    Returns:
        ete3.Tree: the subtree without the outgroup gene
        str: full outgroup gene name (with species tag)

    Raises:
        IndexError: if the outgroup gene is not in the tree.
    """
    tree = Tree(tree)
    leaves = [i.name for i in tree.get_leaves()]

    #leaf names are expected as genename_species
    matches = [i for i in leaves if outgr == '_'.join(i.split('_')[:-1])]
    if not matches:
        raise IndexError("Outgroup gene {} not found in subtree".format(outgr))

    outgr_gene = matches[0]
    tree.set_outgroup(tree&outgr_gene)

    tree.prune([i for i in leaves if i != outgr_gene])
    return tree, outgr_gene


def merge_trees_and_write(trees, outgr, outfile, keep_br=False):
    """
    Merges two subtrees independently resolved into a single tree and adds the outgroup gene.
    Writes the result to file.

    Args:
        trees (list of ete3.Tree): Tree(s) to merge
        outgr (str): Outgroup gene name
        outfile (str): Output filename
        keep_br (bool, optional): if True the tree is written with the default ete3 newick
                                  format, otherwise with format=9 (leaf names only).
    """

    merged_tree = Tree()

    for tree in trees:
        merged_tree.add_child(tree)

    #merge the two and place outgroup correctly
    merged_final = Tree()
    merged_final.add_child(merged_tree)
    merged_final.add_child(name=outgr)
    merged_final.prune(merged_final.get_leaves())

    if keep_br:
        merged_final.write(outfile=outfile)

    else:
        merged_final.write(outfile=outfile, format=9)


if __name__ == '__main__':

    PARSER = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    PARSER.add_argument('-outgr', '--outgroup', help='Outgroup gene', required=True)
    PARSER.add_argument('-t', '--trees', help='Resolved binary subtrees.', required=True,
                        nargs='+')
    PARSER.add_argument('-o', '--outfile', required=False, default='out')
    PARSER.add_argument('-br', '--brlength', action='store_true')
    ARGS = vars(PARSER.parse_args())

    OUTGR = ARGS["outgroup"]

    #check the number of subtrees before loading any of them
    if len(ARGS['trees']) > 2:
        raise ValueError("Only a maximum of two trees are expected")

    TREES = []
    for subtree in ARGS['trees']:
        subtree, outgr_genename = remove_outgroup(subtree, OUTGR)
        TREES.append(subtree)

    #forward the -br flag, otherwise the option is silently ignored
    merge_trees_and_write(TREES, outgr_genename, ARGS['outfile'], keep_br=ARGS['brlength'])
64 | sp_ortho_dict (dict): dictionary to store orthologies. 65 | """ 66 | 67 | #browse the tree 68 | for node in tree.traverse(): 69 | 70 | #ignore leaves or artefactual single child internal nodes 71 | if len(node.get_children()) == 2: 72 | 73 | #get type of event according to duplication annotations, and continue if speciation 74 | #'dubious' nodes i.e duplications with confidence 0 are considered speciations 75 | if is_speciation(node): 76 | 77 | children = node.get_children() 78 | 79 | #get leaves separated by the speciation 80 | side_a_leaves = {i for i in children[0].get_leaves()} 81 | side_b_leaves = {i for i in children[1].get_leaves()} 82 | 83 | #get species at both sides of event 84 | inspecies = {i.S for i in side_a_leaves} 85 | outspecies = {i.S for i in side_b_leaves} 86 | 87 | #store genes separated by speciation in all species pairs 88 | for sp_pair in species_pairs: 89 | sp1, sp2 = sp_pair 90 | if (sp1 in inspecies and sp2 in outspecies) or (sp1 in outspecies and sp2 in\ 91 | inspecies): 92 | 93 | if sp1 in inspecies: 94 | genes_1 = {i.name for i in list(side_a_leaves) if i.S == sp1} 95 | genes_2 = {i.name for i in list(side_b_leaves) if i.S == sp2} 96 | 97 | else: 98 | genes_1 = {i.name for i in list(side_b_leaves) if i.S == sp1} 99 | genes_2 = {i.name for i in list(side_a_leaves) if i.S == sp2} 100 | 101 | sp_ortho_dict[(sp1, sp2)] = sp_ortho_dict.get((sp1, sp2), []) 102 | 103 | for (gene1, gene2) in itertools.product(genes_1, genes_2): 104 | sp_ortho_dict[(sp1, sp2)].append((gene1, gene2)) 105 | 106 | 107 | 108 | if __name__ == '__main__': 109 | 110 | # Arguments 111 | PARSER = argparse.ArgumentParser(description=__doc__, 112 | formatter_class=argparse.RawDescriptionHelpFormatter) 113 | 114 | 115 | #Required 116 | PARSER.add_argument('-t', '--treesFile', help='Forest of trees in newhampshire format (nhx),\ 117 | with species, duplication/speciation nodes + duplication confidence tags.', 118 | required=True) 119 | 120 | PARSER.add_argument('-d', 
'--dupSp', help='Name of the ancestor of duplicated species.', 121 | required=True) 122 | 123 | PARSER.add_argument('-s', '--speciesTree', help='Species tree (newick), with ancestor names.', 124 | required=True) 125 | 126 | #Optional 127 | PARSER.add_argument('-o', '--out', help='Result folder.', required=False, default="out") 128 | 129 | PARSER.add_argument('-ow', '--other_wgds', help='Ancestor(s) to exclude (Comma-delimited,\ 130 | exclude all species below these ancestors).', required=False, default='') 131 | 132 | PARSER.add_argument('-l', '--lowcov', type=str, help='Species to exclude (Comma-delimited).', 133 | required=False, default='') 134 | 135 | PARSER.add_argument('-sp', '--species', help='Use the given species list (Comma-delimited)\ 136 | instead of species below ancestor', required=False, default=None) 137 | 138 | ARGS = vars(PARSER.parse_args()) 139 | 140 | if ARGS["species"] is None: 141 | 142 | #Get all pairs of duplicated species 143 | SPECIES = spt.get_species(ARGS["speciesTree"], ARGS["dupSp"], ARGS["other_wgds"], 144 | ARGS["lowcov"]) 145 | 146 | else: 147 | SPECIES = ARGS["species"].split(',') 148 | 149 | SPECIES = list(itertools.combinations(SPECIES, 2)) 150 | 151 | ORTHOLOGIES = {} 152 | 153 | sys.stderr.write("Browsing gene trees for orthologies between duplicated species...\n") 154 | 155 | #browse gene tree forest 156 | with open(ARGS["treesFile"], "r") as infile: 157 | 158 | #for each gene tree 159 | for TREE in ut.read_multiple_objects(infile): 160 | TREE = Tree(TREE, format=1) 161 | 162 | #find all speciation nodes and corresponding orthologies 163 | get_speciation_events(TREE, SPECIES, ORTHOLOGIES) 164 | 165 | sys.stderr.write("OK\n") 166 | 167 | #create output folder if it does not exist 168 | OUT_PATH = ARGS["out"] 169 | try: 170 | os.mkdir(OUT_PATH) 171 | except OSError as exc: 172 | if exc.errno != errno.EEXIST: 173 | raise 174 | 175 | #Write orthologies to result files 176 | sys.stderr.write("Writing orthologies between duplicated 
def search_one_node(tree, node_name):
    """
    Searches for a node in the input tree given its name. Throws AssertionError if the node name
    is not found or is not unique.

    Args:
        tree (ete3 Tree): input tree
        node_name (str): node to search

    Returns:
        ete3 TreeNode: the matched node
    """

    nodes = tree.search_nodes(name=node_name)
    assert nodes, "{} not in tree".format(node_name)
    assert len(nodes) == 1, "{} ambiguous node name".format(node_name)
    return nodes[0]


def remove_anc(tree_file, out_file):
    """
    Removes any internal node name, such as ancestor names, in the input tree and writes it to a
    new file.

    Args:
        tree_file (str): Path to the input newick formatted tree.
        out_file (str): Path for the output file.
    """

    tree = Tree(tree_file, format=1)
    tree.prune(tree.get_leaves())

    #format=9 is the ete3 newick flavour with leaf names only
    tree.write(outfile=out_file, format=9)


def get_species(species_tree, anc, other_wgd_anc='', lowcov_species=''):
    """
    Extracts a list of species descending from a given ancestor in a species tree. Filter out
    species under particular ancestors (i.e subsequent WGDs for instance) given by
    'other_wgd_anc', as well as 'low-coverage' species given by 'lowcov_species'.

    Args:
        species_tree (str): Path to the input newick tree.
        anc (str): Name of the ancestor.
        other_wgd_anc (str, optional): Comma-delimited names of ancestors with subsequent WGDs.
        lowcov_species (str, optional): Comma-delimited names of 'lowcoverage' species to exclude.

    Returns:
        species (list): The list of species.
    """

    #get species under the WGD node
    tree = Tree(species_tree, format=1)
    species = [i.name for i in search_one_node(tree, anc).get_leaves()]

    #get species under subsequent WGD nodes
    if other_wgd_anc:
        wgd_names = other_wgd_anc.split(',')
        if anc not in wgd_names:
            wgd_names.append(anc)

        #WGD that occurred after 'anc', i.e. WGD ancestors below 'anc' in the species tree
        later_wgds = get_anc_order(species_tree, wgd_names, tips_to_root=False)[anc]

        species_with_other_wgd = set()
        for wgd in later_wgds:
            wgd_node = search_one_node(tree, wgd)
            species_with_other_wgd.update(i.name for i in wgd_node.get_leaves())

        species = [sp for sp in species if sp not in species_with_other_wgd]

    #filter out 'lowcov'
    if lowcov_species:
        lowcov = set(lowcov_species.split(','))
        species = [sp for sp in species if sp not in lowcov]

    return species


def get_sister_species(species_tree, species, anc):
    """
    Extracts a list of species related to a given species: species branching between `species` and
    the ancestor `anc`.

    Args:
        species_tree (str): Path to the input newick tree (it is loaded with `ete3.Tree` here).
        species (str): name of the species
        anc (str): name of the ancestor

    Returns:
        list: `species` followed by the species branching between `species` and `anc`
    """

    duplicated_sp = get_species(species_tree, anc)
    tree = Tree(species_tree, format=1)
    lca = tree.get_common_ancestor([species]+duplicated_sp)

    #species below the common ancestor that are neither `species` nor descendants of `anc`
    known = set([species]+duplicated_sp)
    return [species] + [i.name for i in lca.get_leaves() if i.name not in known]


def is_below(node1, node2):
    """
    Checks if node2 is below node1 in the tree topology.

    Args:
        node1 (ete3 TreeNode): node1
        node2 (str or ete3 TreeNode): node2, given by its name (as done in `get_anc_order`)
                                      or as a node of the same tree.

    Returns:
        bool: True if node2 is below node1, False otherwise.
    """

    descendants = node1.get_descendants()

    if isinstance(node2, str):
        return any(node.name == node2 for node in descendants)

    return any(node is node2 for node in descendants)


def get_anc_order(tree_file, ancestors=None, tips_to_root=False, prune=True):
    """
    Orders input ancestors with respect to their position in the species tree. Can be ordered from
    root to tips (default) or tips to root.

    Args:
        tree_file (str): Path to the input newick formatted tree.
        ancestors (optional, list of str): List of ancestor names. If unspecified, all the ancestors
                                           in the trees will be returned.
        tips_to_root (optional, bool): order from tips to root instead of root to tips.
        prune (optional, bool): prune the tree to its full set of leaves before ordering.

    Returns:
        OrderedDict: ancestor names in the requested order (keys) and list of ancestors in the
                     input list that are below it (values).
    """

    tree = Tree(tree_file, format=1)
    if not ancestors:
        ancestors = [i.name for i in tree.traverse() if not i.is_leaf()]
    if prune:
        tree.prune(tree.get_leaves())

    #ancestors are ordered by their distance to the root of the tree
    dist_to_root = {i: tree.get_distance(i) for i in ancestors}
    anc_order = sorted(dist_to_root, key=dist_to_root.get)

    if tips_to_root:
        anc_order = anc_order[::-1]

    anc_order_dict = OrderedDict()
    for anc in anc_order:

        #names of all nodes below anc, computed once for all comparisons
        anc_node = search_one_node(tree, anc)
        names_below = {i.name for i in anc_node.get_descendants()}

        anc_order_dict[anc] = [anc2 for anc2 in ancestors if anc2 != anc and anc2 in names_below]

    return anc_order_dict


if __name__ == '__main__':
    sys.exit()